WordCount and Summary support CJK Language

* add global `hasCJKLanguage` flag; if true, turn on auto-detection of CJK languages
 * add `isCJKLanguage` front matter to explicitly specify whether the content is in a CJK language
 * For .Summary: If isCJKLanguage is true, use the runes as basis for truncation, else keep as today.
 * For WordCount: If isCJKLanguage is true, use the runes as basis for calculation, else keep as today.
 * Unexport RuneCount

Fixes #1377
This commit is contained in:
coderzh 2015-09-03 18:22:20 +08:00 committed by Bjørn Erik Pedersen
parent 2c045ac449
commit 823334875d
5 changed files with 247 additions and 93 deletions

View file

@ -168,6 +168,7 @@ func LoadDefaultSettings() {
viper.SetDefault("RSSUri", "index.xml")
viper.SetDefault("SectionPagesMenu", "")
viper.SetDefault("DisablePathToLower", false)
viper.SetDefault("HasCJKLanguage", false)
}
// InitializeConfig initializes a config file with sensible default configuration flags.

View file

@ -19,9 +19,9 @@ package helpers
import (
"bytes"
"unicode/utf8"
"html/template"
"os/exec"
"unicode/utf8"
"github.com/miekg/mmark"
"github.com/russross/blackfriday"
@ -178,7 +178,6 @@ func GetHTMLRenderer(defaultFlags int, ctx *RenderingContext) blackfriday.Render
}
}
func getMarkdownExtensions(ctx *RenderingContext) int {
flags := 0 | blackfriday.EXTENSION_NO_INTRA_EMPHASIS |
blackfriday.EXTENSION_TABLES | blackfriday.EXTENSION_FENCED_CODE |
@ -385,61 +384,51 @@ func TruncateWords(s string, max int) string {
return strings.Join(words[:max], " ")
}
// TruncateWordsByRune joins words with single spaces, stopping once max
// counted units have been consumed, and reports whether anything was cut.
//
// A word consisting only of single-byte runes counts as one unit. A word that
// contains at least one multi-byte rune counts one unit per rune and may be
// cut in the middle when the limit falls inside it.
//
// The words slice passed in is never modified.
func TruncateWordsByRune(words []string, max int) (string, bool) {
	count := 0
	for index, word := range words {
		if count >= max {
			return strings.Join(words[:index], " "), true
		}

		runeCount := utf8.RuneCountInString(word)
		switch {
		case len(word) == runeCount:
			// Byte length equals rune count: no multi-byte runes, count as one word.
			count++
		case count+runeCount < max:
			count += runeCount
		default:
			// The limit is reached somewhere inside (or exactly at the end of)
			// this word, so walk it rune by rune.
			for ri := range word {
				if count >= max {
					// Cap the capacity at index so append allocates a new
					// backing array instead of overwriting words[index] in
					// the caller's slice.
					truncatedWords := append(words[:index:index], word[:ri])
					return strings.Join(truncatedWords, " "), true
				}
				count++
			}
		}
	}

	return strings.Join(words, " "), false
}
// TruncateWordsToWholeSentence takes content and an int
// and returns entire sentences from content, delimited by the int
// and whether it's truncated or not.
func TruncateWordsToWholeSentence(words []string, max int) (string, bool) {
count := 0
index, word := 0, ""
truncated := false
for index, word = range words {
runeCount := utf8.RuneCountInString(word)
if len(word) == runeCount {
count++;
} else {
if count + runeCount <= max {
count += runeCount
} else {
offset := 0
for count < max {
_, width := utf8.DecodeRuneInString(word[offset:])
offset += width
count++
}
words[index] = word[:offset]
truncated = true
}
if max >= len(words) {
return strings.Join(words, " "), false
}
if count >= max {
if index < len(words) - 1 {
truncated = true
}
break
}
}
index += 1
if index < len(words) {
for counter, word := range words[index:] {
if len(word) != utf8.RuneCountInString(word) {
break
}
for counter, word := range words[max:] {
if strings.HasSuffix(word, ".") ||
strings.HasSuffix(word, "?") ||
strings.HasSuffix(word, ".\"") ||
strings.HasSuffix(word, "!") {
upper := index + counter + 1
upper := max + counter + 1
return strings.Join(words[:upper], " "), (upper < len(words))
}
}
} else if index > len(words) {
return strings.Join(words, " "), truncated
}
return strings.Join(words[:index], " "), truncated
return strings.Join(words[:max], " "), true
}
// GetAsciidocContent calls asciidoctor or asciidoc as an external helper

View file

@ -1,10 +1,11 @@
package helpers
import (
"github.com/stretchr/testify/assert"
"html/template"
"strings"
"testing"
"github.com/stretchr/testify/assert"
)
const tstHTMLContent = "<!DOCTYPE html><html><head><script src=\"http://two/foobar.js\"></script></head><body><nav><ul><li hugo-nav=\"section_0\"></li><li hugo-nav=\"section_1\"></li></ul></nav><article>content <a href=\"http://two/foobar\">foobar</a>. Follow up</article><p>This is some text.<br>And some more.</p></body></html>"
@ -54,8 +55,6 @@ func TestTruncateWordsToWholeSentence(t *testing.T) {
{"a b c", "a b c", 12, false},
{"a b c", "a b c", 3, false},
{"a", "a", 1, false},
{"Hello 中国", "Hello 中", 2, true},
{"Hello 中国", "Hello 中国", 3, false},
{"This is a sentence.", "This is a sentence.", 5, false},
{"This is also a sentence!", "This is also a sentence!", 1, false},
{"To be. Or not to be. That's the question.", "To be.", 1, true},
@ -72,3 +71,36 @@ func TestTruncateWordsToWholeSentence(t *testing.T) {
}
}
}
// TestTruncateWordsByRune runs TruncateWordsByRune over a table of inputs,
// each split into words with strings.Fields, and checks both the returned
// text and the truncated flag.
func TestTruncateWordsByRune(t *testing.T) {
	cases := []struct {
		input, expected string
		max             int
		truncated       bool
	}{
		{"", "", 1, false},
		{"a b c", "a b c", 12, false},
		{"a b c", "a b c", 3, false},
		{"a", "a", 1, false},
		{"Hello 中国", "", 0, true},
		{"这是中文,全中文。", "这是中文,", 5, true},
		{"Hello 中国", "Hello 中", 2, true},
		{"Hello 中国", "Hello 中国", 3, false},
		{"Hello中国 Good 好的", "Hello中国 Good 好", 9, true},
		{"This is a sentence.", "This is", 2, true},
		{"This is also a sentence!", "This", 1, true},
		{"To be. Or not to be. That's the question.", "To be. Or not", 4, true},
		{" \nThis is not a sentence\n ", "This is not", 3, true},
	}

	for i, tc := range cases {
		got, cut := TruncateWordsByRune(strings.Fields(tc.input), tc.max)

		if got != tc.expected {
			t.Errorf("Test %d failed. Expected %q got %q", i, tc.expected, got)
		}
		if cut != tc.truncated {
			t.Errorf("Test %d failed. Expected truncated=%t got %t", i, tc.truncated, cut)
		}
	}
}

View file

@ -28,6 +28,7 @@ import (
"net/url"
"path"
"path/filepath"
"regexp"
"strings"
"sync"
"time"
@ -42,6 +43,10 @@ import (
"github.com/spf13/viper"
)
var (
// cjk matches a single character from the Han, Hangul, Hiragana or Katakana
// Unicode scripts. It is compiled once at package level and matched against
// a page's raw content to auto-detect CJK text.
cjk = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`)
)
type Page struct {
Params map[string]interface{}
Content template.HTML
@ -67,7 +72,6 @@ type Page struct {
contentShortCodes map[string]string
plain string // TODO should be []byte
plainWords []string
plainRuneCount int
plainInit sync.Once
plainSecondaryInit sync.Once
renderingConfig *helpers.Blackfriday
@ -78,6 +82,7 @@ type Page struct {
Node
pageMenus PageMenus
pageMenusInit sync.Once
isCJKLanguage bool
}
type Source struct {
@ -111,12 +116,6 @@ func (p *Page) PlainWords() []string {
return p.plainWords
}
// RuneCount returns the rune count, excluding any whitespace, of the plain content.
// The count is computed on first use by initPlainSecondary and read back from
// plainRuneCount on every call.
func (p *Page) RuneCount() int {
p.initPlainSecondary()
return p.plainRuneCount
}
func (p *Page) initPlain() {
p.plainInit.Do(func() {
p.plain = helpers.StripHTML(string(p.Content))
@ -125,20 +124,6 @@ func (p *Page) initPlain() {
})
}
// initPlainSecondary fills in plainRuneCount with the number of runes in the
// plain content for which helpers.IsWhitespace reports false. The work is
// wrapped in plainSecondaryInit, so it runs at most once per page.
func (p *Page) initPlainSecondary() {
	p.plainSecondaryInit.Do(func() {
		// The plain text must exist before it can be scanned.
		p.initPlain()

		n := 0
		for _, r := range p.plain {
			if helpers.IsWhitespace(r) {
				continue
			}
			n++
		}
		p.plainRuneCount = n
	})
}
// IsNode always returns false for a Page.
func (p *Page) IsNode() bool {
return false
}
@ -218,7 +203,13 @@ func (p *Page) setSummary() {
} else {
// If hugo defines split:
// render, strip html, then split
summary, truncated := helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength)
var summary string
var truncated bool
if p.isCJKLanguage {
summary, truncated = helpers.TruncateWordsByRune(p.PlainWords(), helpers.SummaryLength)
} else {
summary, truncated = helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength)
}
p.Summary = template.HTML(summary)
p.Truncated = truncated
@ -363,6 +354,7 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) {
}
func (p *Page) analyzePage() {
if p.isCJKLanguage {
p.WordCount = 0
for _, word := range p.PlainWords() {
runeCount := utf8.RuneCountInString(word)
@ -372,10 +364,18 @@ func (p *Page) analyzePage() {
p.WordCount += runeCount
}
}
} else {
p.WordCount = len(p.PlainWords())
}
p.FuzzyWordCount = int((p.WordCount+100)/100) * 100
if p.isCJKLanguage {
p.ReadingTime = int((p.WordCount + 500) / 501)
} else {
p.ReadingTime = int((p.WordCount + 212) / 213)
}
}
func (p *Page) permalink() (*url.URL, error) {
baseURL := string(p.Site.BaseURL)
@ -481,7 +481,7 @@ func (p *Page) update(f interface{}) error {
}
m := f.(map[string]interface{})
var err error
var draft, published *bool
var draft, published, isCJKLanguage *bool
for k, v := range m {
loki := strings.ToLower(k)
switch loki {
@ -542,6 +542,9 @@ func (p *Page) update(f interface{}) error {
p.Status = cast.ToString(v)
case "sitemap":
p.Sitemap = parseSitemap(cast.ToStringMap(v))
case "iscjklanguage":
isCJKLanguage = new(bool)
*isCJKLanguage = cast.ToBool(v)
default:
// If not one of the explicit values, store in Params
switch vv := v.(type) {
@ -596,6 +599,16 @@ func (p *Page) update(f interface{}) error {
p.Lastmod = p.Date
}
if isCJKLanguage != nil {
p.isCJKLanguage = *isCJKLanguage
} else if viper.GetBool("HasCJKLanguage") {
if cjk.Match(p.rawContent) {
p.isCJKLanguage = true
} else {
p.isCJKLanguage = false
}
}
return nil
}
@ -766,6 +779,8 @@ func (p *Page) parse(reader io.Reader) error {
p.renderable = psr.IsRenderable()
p.frontmatter = psr.FrontMatter()
p.rawContent = psr.Content()
meta, err := psr.Metadata()
if meta != nil {
if err != nil {
@ -778,8 +793,6 @@ func (p *Page) parse(reader io.Reader) error {
}
}
p.rawContent = psr.Content()
return nil
}

View file

@ -146,16 +146,67 @@ Summary Same Line<!--more-->
Some more text
`
SIMPLE_PAGE_WITH_FIVE_MULTIBYTE_UFT8_RUNES = `---
SIMPLE_PAGE_WITH_ALL_CJK_RUNES = `---
title: Simple
---
你好
도형이
カテゴリー
`
SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES = `---
title: Simple
---
In Chinese, means good. In Chinese, means good.
In Chinese, means good. In Chinese, means good.
In Chinese, means good. In Chinese, means good.
In Chinese, means good. In Chinese, means good.
In Chinese, means good. In Chinese, means good.
In Chinese, means good. In Chinese, means good.
In Chinese, means good. In Chinese, means good.
More then 70 words.
`
SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY = "In Chinese, 好 means good. In Chinese, 好 means good. " +
"In Chinese, 好 means good. In Chinese, 好 means good. " +
"In Chinese, 好 means good. In Chinese, 好 means good. " +
"In Chinese, 好 means good. In Chinese, 好 means good. " +
"In Chinese, 好 means good. In Chinese, 好 means good. " +
"In Chinese, 好 means good. In Chinese, 好 means good. " +
"In Chinese, 好 means good. In Chinese, 好 means good."
SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE = `---
title: Simple
isCJKLanguage: false
---
In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
In Chinese, 好的啊 means good. In Chinese, 好的呀呀 means good enough.
More then 70 words.
`
SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY = "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
"In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
"In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
"In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
"In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
"In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
"In Chinese, 好的啊 means good. In Chinese, 好的呀呀 means good enough."
SIMPLE_PAGE_WITH_LONG_CONTENT = `---
title: Simple
---
@ -584,18 +635,86 @@ func TestPageWithDate(t *testing.T) {
checkPageDate(t, p, d)
}
func TestRuneCount(t *testing.T) {
func TestWordCountWithAllCJKRunesWithoutHasCJKLanguage(t *testing.T) {
viper.Reset()
p, _ := NewPage("simple.md")
_, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_FIVE_MULTIBYTE_UFT8_RUNES))
_, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ALL_CJK_RUNES))
p.Convert()
p.analyzePage()
if err != nil {
t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
}
if p.RuneCount() != 5 {
t.Fatalf("incorrect rune count for content '%s'. expected %v, got %v", p.plain, 5, p.RuneCount())
if p.WordCount != 8 {
t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 8, p.WordCount)
}
}
// TestWordCountWithAllCJKRunesHasCJKLanguage checks that, with the global
// HasCJKLanguage setting switched on, the all-CJK fixture page ends up with a
// WordCount of 15.
func TestWordCountWithAllCJKRunesHasCJKLanguage(t *testing.T) {
	viper.Reset()
	defer viper.Reset()

	viper.Set("HasCJKLanguage", true)

	p, _ := NewPage("simple.md")
	// Stop on a read error before running the steps that depend on the parsed
	// content, so a broken fixture is reported as such.
	if _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ALL_CJK_RUNES)); err != nil {
		t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
	}
	p.Convert()
	p.analyzePage()

	if p.WordCount != 15 {
		t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 15, p.WordCount)
	}
}
// TestWordCountWithMainEnglishWithCJKRunes checks the mostly-English fixture
// with the global HasCJKLanguage setting switched on: the page must report a
// WordCount of 74 and a Summary equal to the expected fixture summary.
func TestWordCountWithMainEnglishWithCJKRunes(t *testing.T) {
	viper.Reset()
	defer viper.Reset()

	viper.Set("HasCJKLanguage", true)

	p, _ := NewPage("simple.md")
	// Stop on a read error before running the steps that depend on the parsed
	// content, so a broken fixture is reported as such.
	if _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES)); err != nil {
		t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
	}
	p.Convert()
	p.analyzePage()

	if p.WordCount != 74 {
		t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 74, p.WordCount)
	}

	if p.Summary != SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY {
		t.Fatalf("incorrect Summary for content '%s'. expected %v, got %v", p.plain,
			SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY, p.Summary)
	}
}
// TestWordCountWithIsCJKLanguageFalse checks a fixture whose front matter sets
// isCJKLanguage to false while the global HasCJKLanguage setting is on: the
// page must report a WordCount of 75 and a Summary equal to the expected
// fixture summary.
func TestWordCountWithIsCJKLanguageFalse(t *testing.T) {
	viper.Reset()
	defer viper.Reset()

	viper.Set("HasCJKLanguage", true)

	p, _ := NewPage("simple.md")
	// Stop on a read error before running the steps that depend on the parsed
	// content, so a broken fixture is reported as such.
	if _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE)); err != nil {
		t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
	}
	p.Convert()
	p.analyzePage()

	if p.WordCount != 75 {
		t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 75, p.WordCount)
	}

	if p.Summary != SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY {
		t.Fatalf("incorrect Summary for content '%s'. expected %v, got %v", p.plain,
			SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY, p.Summary)
	}
}