Lazily calculate WordCount, ReadingTime and FuzzyWordCount

This avoids having to execute these expensive operations for sites not using these values.

This commit bundles a set of word-counting and auto-summary-related performance improvements.

The effect of these depends somewhat on which features your site uses, but a benchmark across 4 Hugo sites in the wild shows promise:

```
benchmark           old ns/op       new ns/op       delta
BenchmarkHugo-4     21293005843     20032857342     -5.92%

benchmark           old allocs     new allocs     delta
BenchmarkHugo-4     65290922       65186032       -0.16%

benchmark           old bytes      new bytes      delta
BenchmarkHugo-4     9771213416     9681866464     -0.91%
```

Closes #2378
This commit is contained in:
Bjørn Erik Pedersen 2016-08-17 13:41:48 +02:00
parent 4abaec5c04
commit dd45e6d7e5
7 changed files with 103 additions and 57 deletions

View file

@ -138,19 +138,28 @@ func StripHTML(s string) string {
// Walk through the string removing all tags
b := bp.GetBuffer()
defer bp.PutBuffer(b)
inTag := false
var inTag, isSpace, wasSpace bool
for _, r := range s {
switch r {
case '<':
if !inTag {
isSpace = false
}
switch {
case r == '<':
inTag = true
case '>':
case r == '>':
inTag = false
case unicode.IsSpace(r):
isSpace = true
fallthrough
default:
if !inTag {
if !inTag && (!isSpace || (isSpace && !wasSpace)) {
b.WriteRune(r)
}
}
wasSpace = isSpace
}
return b.String()
}

View file

@ -34,11 +34,22 @@ func TestStripHTML(t *testing.T) {
}
data := []test{
{"<h1>strip h1 tag <h1>", "strip h1 tag "},
{"<p> strip p tag </p>", " strip p tag \n"},
{"<p> strip p tag </p>", " strip p tag "},
{"</br> strip br<br>", " strip br\n"},
{"</br> strip br2<br />", " strip br2\n"},
{"This <strong>is</strong> a\nnewline", "This is a newline"},
{"No Tags", "No Tags"},
{`<p>Summary Next Line.
<figure >
<img src="/not/real" />
</figure>
.
More text here.</p>
<p>Some more text</p>`, "Summary Next Line. . More text here.\nSome more text\n"},
}
for i, d := range data {
output := StripHTML(d.input)

View file

@ -107,9 +107,10 @@ type Source struct {
source.File
}
type PageMeta struct {
WordCount int
FuzzyWordCount int
ReadingTime int
wordCount int
fuzzyWordCount int
readingTime int
pageMetaInit sync.Once
Weight int
}
@ -485,28 +486,48 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) {
return int64(len(p.rawContent)), nil
}
func (p *Page) WordCount() int {
p.analyzePage()
return p.wordCount
}
func (p *Page) ReadingTime() int {
p.analyzePage()
return p.readingTime
}
func (p *Page) FuzzyWordCount() int {
p.analyzePage()
return p.fuzzyWordCount
}
func (p *Page) analyzePage() {
if p.isCJKLanguage {
p.WordCount = 0
for _, word := range p.PlainWords() {
runeCount := utf8.RuneCountInString(word)
if len(word) == runeCount {
p.WordCount++
} else {
p.WordCount += runeCount
p.pageMetaInit.Do(func() {
if p.isCJKLanguage {
p.wordCount = 0
for _, word := range p.PlainWords() {
runeCount := utf8.RuneCountInString(word)
if len(word) == runeCount {
p.wordCount++
} else {
p.wordCount += runeCount
}
}
} else {
p.wordCount = helpers.TotalWords(p.Plain())
}
} else {
p.WordCount = len(p.PlainWords())
}
p.FuzzyWordCount = (p.WordCount + 100) / 100 * 100
// TODO(bep) is set in a test. Fix that.
if p.fuzzyWordCount == 0 {
p.fuzzyWordCount = (p.wordCount + 100) / 100 * 100
}
if p.isCJKLanguage {
p.ReadingTime = (p.WordCount + 500) / 501
} else {
p.ReadingTime = (p.WordCount + 212) / 213
}
if p.isCJKLanguage {
p.readingTime = (p.wordCount + 500) / 501
} else {
p.readingTime = (p.wordCount + 212) / 213
}
})
}
func (p *Page) permalink() (*url.URL, error) {

View file

@ -95,11 +95,11 @@ func TestLimit(t *testing.T) {
func TestPageSortReverse(t *testing.T) {
p1 := createSortTestPages(10)
assert.Equal(t, 0, p1[0].FuzzyWordCount)
assert.Equal(t, 9, p1[9].FuzzyWordCount)
assert.Equal(t, 0, p1[0].fuzzyWordCount)
assert.Equal(t, 9, p1[9].fuzzyWordCount)
p2 := p1.Reverse()
assert.Equal(t, 9, p2[0].FuzzyWordCount)
assert.Equal(t, 0, p2[9].FuzzyWordCount)
assert.Equal(t, 9, p2[0].fuzzyWordCount)
assert.Equal(t, 0, p2[9].fuzzyWordCount)
// cached
assert.True(t, probablyEqualPages(p2, p1.Reverse()))
}
@ -149,7 +149,7 @@ func createSortTestPages(num int) Pages {
if i%2 == 0 {
w = 10
}
pages[i].FuzzyWordCount = i
pages[i].fuzzyWordCount = i
pages[i].Weight = w
pages[i].Description = "initial"
}

View file

@ -504,10 +504,13 @@ func checkPageContent(t *testing.T, page *Page, content string, msg ...interface
}
func normalizeContent(c string) string {
norm := strings.Replace(c, "\n", "", -1)
norm := c
norm = strings.Replace(norm, "\n", " ", -1)
norm = strings.Replace(norm, " ", " ", -1)
norm = strings.Replace(norm, " ", " ", -1)
norm = strings.Replace(norm, " ", " ", -1)
norm = strings.Replace(norm, "p> ", "p>", -1)
norm = strings.Replace(norm, "> <", "> <", -1)
return strings.TrimSpace(norm)
}
@ -710,8 +713,8 @@ func TestPageWithShortCodeInSummary(t *testing.T) {
assertFunc := func(t *testing.T, ext string, p *Page) {
checkPageTitle(t, p, "Simple")
checkPageContent(t, p, normalizeExpected(ext, "<p>Summary Next Line. <figure > <img src=\"/not/real\" /> </figure>.\nMore text here.</p><p>Some more text</p>"), ext)
checkPageSummary(t, p, "Summary Next Line. . More text here. Some more text", ext)
checkPageContent(t, p, normalizeExpected(ext, "<p>Summary Next Line. \n<figure >\n \n <img src=\"/not/real\" />\n \n \n</figure>\n.\nMore text here.</p>\n\n<p>Some more text</p>\n"))
checkPageSummary(t, p, "Summary Next Line. . More text here. Some more text")
checkPageType(t, p, "page")
checkPageLayout(t, p, "page/single.html", "_default/single.html", "theme/page/single.html", "theme/_default/single.html")
}
@ -793,8 +796,8 @@ func TestWordCountWithAllCJKRunesWithoutHasCJKLanguage(t *testing.T) {
testCommonResetState()
assertFunc := func(t *testing.T, ext string, p *Page) {
if p.WordCount != 8 {
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 8, p.WordCount)
if p.WordCount() != 8 {
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 8, p.WordCount())
}
}
@ -806,11 +809,10 @@ func TestWordCountWithAllCJKRunesHasCJKLanguage(t *testing.T) {
viper.Set("HasCJKLanguage", true)
assertFunc := func(t *testing.T, ext string, p *Page) {
if p.WordCount != 15 {
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 15, p.WordCount)
if p.WordCount() != 15 {
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 15, p.WordCount())
}
}
testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithAllCJKRunes)
}
@ -820,15 +822,14 @@ func TestWordCountWithMainEnglishWithCJKRunes(t *testing.T) {
viper.Set("HasCJKLanguage", true)
assertFunc := func(t *testing.T, ext string, p *Page) {
if p.WordCount != 74 {
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount)
if p.WordCount() != 74 {
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount())
}
if p.Summary != simplePageWithMainEnglishWithCJKRunesSummary {
t.Fatalf("[%s] incorrect Summary for content '%s'. expected %v, got %v", ext, p.plain,
simplePageWithMainEnglishWithCJKRunesSummary, p.Summary)
}
}
testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithMainEnglishWithCJKRunes)
@ -839,15 +840,14 @@ func TestWordCountWithIsCJKLanguageFalse(t *testing.T) {
viper.Set("HasCJKLanguage", true)
assertFunc := func(t *testing.T, ext string, p *Page) {
if p.WordCount != 75 {
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount)
if p.WordCount() != 75 {
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount())
}
if p.Summary != simplePageWithIsCJKLanguageFalseSummary {
t.Fatalf("[%s] incorrect Summary for content '%s'. expected %v, got %v", ext, p.plain,
simplePageWithIsCJKLanguageFalseSummary, p.Summary)
}
}
testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithIsCJKLanguageFalse)
@ -857,16 +857,16 @@ func TestWordCountWithIsCJKLanguageFalse(t *testing.T) {
func TestWordCount(t *testing.T) {
assertFunc := func(t *testing.T, ext string, p *Page) {
if p.WordCount != 483 {
t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 483, p.WordCount)
if p.WordCount() != 483 {
t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 483, p.WordCount())
}
if p.FuzzyWordCount != 500 {
t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 500, p.WordCount)
if p.FuzzyWordCount() != 500 {
t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 500, p.WordCount())
}
if p.ReadingTime != 3 {
t.Fatalf("[%s] incorrect min read. expected %v, got %v", ext, 3, p.ReadingTime)
if p.ReadingTime() != 3 {
t.Fatalf("[%s] incorrect min read. expected %v, got %v", ext, 3, p.ReadingTime())
}
checkTruncation(t, p, true, "long page")

View file

@ -55,7 +55,7 @@ func TestSplitPageGroups(t *testing.T) {
// first group 10 in weight
assert.Equal(t, 10, pg.Key)
for _, p := range pg.Pages {
assert.True(t, p.FuzzyWordCount%2 == 0) // magic test
assert.True(t, p.fuzzyWordCount%2 == 0) // magic test
}
}
} else {
@ -70,7 +70,7 @@ func TestSplitPageGroups(t *testing.T) {
// last should have 5 in weight
assert.Equal(t, 5, pg.Key)
for _, p := range pg.Pages {
assert.True(t, p.FuzzyWordCount%2 != 0) // magic test
assert.True(t, p.fuzzyWordCount%2 != 0) // magic test
}
}
} else {
@ -443,10 +443,10 @@ func TestPage(t *testing.T) {
page21, _ := f2.page(1)
page2Nil, _ := f2.page(3)
assert.Equal(t, 1, page11.FuzzyWordCount)
assert.Equal(t, 3, page11.fuzzyWordCount)
assert.Nil(t, page1Nil)
assert.Equal(t, 1, page21.FuzzyWordCount)
assert.Equal(t, 3, page21.fuzzyWordCount)
assert.Nil(t, page2Nil)
}
@ -468,7 +468,7 @@ func createTestPages(num int) Pages {
if i%2 == 0 {
w = 10
}
pages[i].FuzzyWordCount = i
pages[i].fuzzyWordCount = i + 2
pages[i].Weight = w
}

View file

@ -33,6 +33,11 @@ import (
"github.com/stretchr/testify/require"
)
func init() {
//There are expected ERROR logging in tests that produces a lot of noise.
jww.SetStdoutThreshold(jww.LevelCritical)
}
const (
pageSimpleTitle = `---
title: simple template