mirror of
https://github.com/gohugoio/hugo.git
synced 2024-11-21 20:46:30 -05:00
Lazy calculate WordCount, ReadingTime and FuzzyWordCount
This avoids having to execute these expensive operations for sites not using these values.

This commit sums up a set of word-counting and auto-summary related performance improvements. The effect of these depends somewhat on which features your site uses, but a benchmark of 4 Hugo sites in the wild shows promise:

```
benchmark           old ns/op       new ns/op       delta
BenchmarkHugo-4     21293005843     20032857342     -5.92%

benchmark           old allocs      new allocs      delta
BenchmarkHugo-4     65290922        65186032        -0.16%

benchmark           old bytes       new bytes       delta
BenchmarkHugo-4     9771213416      9681866464      -0.91%
```

Closes #2378
This commit is contained in:
parent
4abaec5c04
commit
dd45e6d7e5
7 changed files with 103 additions and 57 deletions
|
@ -138,19 +138,28 @@ func StripHTML(s string) string {
|
||||||
// Walk through the string removing all tags
|
// Walk through the string removing all tags
|
||||||
b := bp.GetBuffer()
|
b := bp.GetBuffer()
|
||||||
defer bp.PutBuffer(b)
|
defer bp.PutBuffer(b)
|
||||||
|
var inTag, isSpace, wasSpace bool
|
||||||
inTag := false
|
|
||||||
for _, r := range s {
|
for _, r := range s {
|
||||||
switch r {
|
if !inTag {
|
||||||
case '<':
|
isSpace = false
|
||||||
|
}
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case r == '<':
|
||||||
inTag = true
|
inTag = true
|
||||||
case '>':
|
case r == '>':
|
||||||
inTag = false
|
inTag = false
|
||||||
|
case unicode.IsSpace(r):
|
||||||
|
isSpace = true
|
||||||
|
fallthrough
|
||||||
default:
|
default:
|
||||||
if !inTag {
|
if !inTag && (!isSpace || (isSpace && !wasSpace)) {
|
||||||
b.WriteRune(r)
|
b.WriteRune(r)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
wasSpace = isSpace
|
||||||
|
|
||||||
}
|
}
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,11 +34,22 @@ func TestStripHTML(t *testing.T) {
|
||||||
}
|
}
|
||||||
data := []test{
|
data := []test{
|
||||||
{"<h1>strip h1 tag <h1>", "strip h1 tag "},
|
{"<h1>strip h1 tag <h1>", "strip h1 tag "},
|
||||||
{"<p> strip p tag </p>", " strip p tag \n"},
|
{"<p> strip p tag </p>", " strip p tag "},
|
||||||
{"</br> strip br<br>", " strip br\n"},
|
{"</br> strip br<br>", " strip br\n"},
|
||||||
{"</br> strip br2<br />", " strip br2\n"},
|
{"</br> strip br2<br />", " strip br2\n"},
|
||||||
{"This <strong>is</strong> a\nnewline", "This is a newline"},
|
{"This <strong>is</strong> a\nnewline", "This is a newline"},
|
||||||
{"No Tags", "No Tags"},
|
{"No Tags", "No Tags"},
|
||||||
|
{`<p>Summary Next Line.
|
||||||
|
<figure >
|
||||||
|
|
||||||
|
<img src="/not/real" />
|
||||||
|
|
||||||
|
|
||||||
|
</figure>
|
||||||
|
.
|
||||||
|
More text here.</p>
|
||||||
|
|
||||||
|
<p>Some more text</p>`, "Summary Next Line. . More text here.\nSome more text\n"},
|
||||||
}
|
}
|
||||||
for i, d := range data {
|
for i, d := range data {
|
||||||
output := StripHTML(d.input)
|
output := StripHTML(d.input)
|
||||||
|
|
|
@ -107,9 +107,10 @@ type Source struct {
|
||||||
source.File
|
source.File
|
||||||
}
|
}
|
||||||
type PageMeta struct {
|
type PageMeta struct {
|
||||||
WordCount int
|
wordCount int
|
||||||
FuzzyWordCount int
|
fuzzyWordCount int
|
||||||
ReadingTime int
|
readingTime int
|
||||||
|
pageMetaInit sync.Once
|
||||||
Weight int
|
Weight int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -485,28 +486,48 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) {
|
||||||
return int64(len(p.rawContent)), nil
|
return int64(len(p.rawContent)), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (p *Page) WordCount() int {
|
||||||
|
p.analyzePage()
|
||||||
|
return p.wordCount
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *Page) ReadingTime() int {
|
||||||
|
p.analyzePage()
|
||||||
|
return p.readingTime
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *Page) FuzzyWordCount() int {
|
||||||
|
p.analyzePage()
|
||||||
|
return p.fuzzyWordCount
|
||||||
|
}
|
||||||
|
|
||||||
func (p *Page) analyzePage() {
|
func (p *Page) analyzePage() {
|
||||||
if p.isCJKLanguage {
|
p.pageMetaInit.Do(func() {
|
||||||
p.WordCount = 0
|
if p.isCJKLanguage {
|
||||||
for _, word := range p.PlainWords() {
|
p.wordCount = 0
|
||||||
runeCount := utf8.RuneCountInString(word)
|
for _, word := range p.PlainWords() {
|
||||||
if len(word) == runeCount {
|
runeCount := utf8.RuneCountInString(word)
|
||||||
p.WordCount++
|
if len(word) == runeCount {
|
||||||
} else {
|
p.wordCount++
|
||||||
p.WordCount += runeCount
|
} else {
|
||||||
|
p.wordCount += runeCount
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
p.wordCount = helpers.TotalWords(p.Plain())
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
p.WordCount = len(p.PlainWords())
|
|
||||||
}
|
|
||||||
|
|
||||||
p.FuzzyWordCount = (p.WordCount + 100) / 100 * 100
|
// TODO(bep) is set in a test. Fix that.
|
||||||
|
if p.fuzzyWordCount == 0 {
|
||||||
|
p.fuzzyWordCount = (p.wordCount + 100) / 100 * 100
|
||||||
|
}
|
||||||
|
|
||||||
if p.isCJKLanguage {
|
if p.isCJKLanguage {
|
||||||
p.ReadingTime = (p.WordCount + 500) / 501
|
p.readingTime = (p.wordCount + 500) / 501
|
||||||
} else {
|
} else {
|
||||||
p.ReadingTime = (p.WordCount + 212) / 213
|
p.readingTime = (p.wordCount + 212) / 213
|
||||||
}
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Page) permalink() (*url.URL, error) {
|
func (p *Page) permalink() (*url.URL, error) {
|
||||||
|
|
|
@ -95,11 +95,11 @@ func TestLimit(t *testing.T) {
|
||||||
|
|
||||||
func TestPageSortReverse(t *testing.T) {
|
func TestPageSortReverse(t *testing.T) {
|
||||||
p1 := createSortTestPages(10)
|
p1 := createSortTestPages(10)
|
||||||
assert.Equal(t, 0, p1[0].FuzzyWordCount)
|
assert.Equal(t, 0, p1[0].fuzzyWordCount)
|
||||||
assert.Equal(t, 9, p1[9].FuzzyWordCount)
|
assert.Equal(t, 9, p1[9].fuzzyWordCount)
|
||||||
p2 := p1.Reverse()
|
p2 := p1.Reverse()
|
||||||
assert.Equal(t, 9, p2[0].FuzzyWordCount)
|
assert.Equal(t, 9, p2[0].fuzzyWordCount)
|
||||||
assert.Equal(t, 0, p2[9].FuzzyWordCount)
|
assert.Equal(t, 0, p2[9].fuzzyWordCount)
|
||||||
// cached
|
// cached
|
||||||
assert.True(t, probablyEqualPages(p2, p1.Reverse()))
|
assert.True(t, probablyEqualPages(p2, p1.Reverse()))
|
||||||
}
|
}
|
||||||
|
@ -149,7 +149,7 @@ func createSortTestPages(num int) Pages {
|
||||||
if i%2 == 0 {
|
if i%2 == 0 {
|
||||||
w = 10
|
w = 10
|
||||||
}
|
}
|
||||||
pages[i].FuzzyWordCount = i
|
pages[i].fuzzyWordCount = i
|
||||||
pages[i].Weight = w
|
pages[i].Weight = w
|
||||||
pages[i].Description = "initial"
|
pages[i].Description = "initial"
|
||||||
}
|
}
|
||||||
|
|
|
@ -504,10 +504,13 @@ func checkPageContent(t *testing.T, page *Page, content string, msg ...interface
|
||||||
}
|
}
|
||||||
|
|
||||||
func normalizeContent(c string) string {
|
func normalizeContent(c string) string {
|
||||||
norm := strings.Replace(c, "\n", "", -1)
|
norm := c
|
||||||
|
norm = strings.Replace(norm, "\n", " ", -1)
|
||||||
norm = strings.Replace(norm, " ", " ", -1)
|
norm = strings.Replace(norm, " ", " ", -1)
|
||||||
norm = strings.Replace(norm, " ", " ", -1)
|
norm = strings.Replace(norm, " ", " ", -1)
|
||||||
norm = strings.Replace(norm, " ", " ", -1)
|
norm = strings.Replace(norm, " ", " ", -1)
|
||||||
|
norm = strings.Replace(norm, "p> ", "p>", -1)
|
||||||
|
norm = strings.Replace(norm, "> <", "> <", -1)
|
||||||
return strings.TrimSpace(norm)
|
return strings.TrimSpace(norm)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -710,8 +713,8 @@ func TestPageWithShortCodeInSummary(t *testing.T) {
|
||||||
|
|
||||||
assertFunc := func(t *testing.T, ext string, p *Page) {
|
assertFunc := func(t *testing.T, ext string, p *Page) {
|
||||||
checkPageTitle(t, p, "Simple")
|
checkPageTitle(t, p, "Simple")
|
||||||
checkPageContent(t, p, normalizeExpected(ext, "<p>Summary Next Line. <figure > <img src=\"/not/real\" /> </figure>.\nMore text here.</p><p>Some more text</p>"), ext)
|
checkPageContent(t, p, normalizeExpected(ext, "<p>Summary Next Line. \n<figure >\n \n <img src=\"/not/real\" />\n \n \n</figure>\n.\nMore text here.</p>\n\n<p>Some more text</p>\n"))
|
||||||
checkPageSummary(t, p, "Summary Next Line. . More text here. Some more text", ext)
|
checkPageSummary(t, p, "Summary Next Line. . More text here. Some more text")
|
||||||
checkPageType(t, p, "page")
|
checkPageType(t, p, "page")
|
||||||
checkPageLayout(t, p, "page/single.html", "_default/single.html", "theme/page/single.html", "theme/_default/single.html")
|
checkPageLayout(t, p, "page/single.html", "_default/single.html", "theme/page/single.html", "theme/_default/single.html")
|
||||||
}
|
}
|
||||||
|
@ -793,8 +796,8 @@ func TestWordCountWithAllCJKRunesWithoutHasCJKLanguage(t *testing.T) {
|
||||||
testCommonResetState()
|
testCommonResetState()
|
||||||
|
|
||||||
assertFunc := func(t *testing.T, ext string, p *Page) {
|
assertFunc := func(t *testing.T, ext string, p *Page) {
|
||||||
if p.WordCount != 8 {
|
if p.WordCount() != 8 {
|
||||||
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 8, p.WordCount)
|
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 8, p.WordCount())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -806,11 +809,10 @@ func TestWordCountWithAllCJKRunesHasCJKLanguage(t *testing.T) {
|
||||||
viper.Set("HasCJKLanguage", true)
|
viper.Set("HasCJKLanguage", true)
|
||||||
|
|
||||||
assertFunc := func(t *testing.T, ext string, p *Page) {
|
assertFunc := func(t *testing.T, ext string, p *Page) {
|
||||||
if p.WordCount != 15 {
|
if p.WordCount() != 15 {
|
||||||
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 15, p.WordCount)
|
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 15, p.WordCount())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithAllCJKRunes)
|
testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithAllCJKRunes)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -820,15 +822,14 @@ func TestWordCountWithMainEnglishWithCJKRunes(t *testing.T) {
|
||||||
viper.Set("HasCJKLanguage", true)
|
viper.Set("HasCJKLanguage", true)
|
||||||
|
|
||||||
assertFunc := func(t *testing.T, ext string, p *Page) {
|
assertFunc := func(t *testing.T, ext string, p *Page) {
|
||||||
if p.WordCount != 74 {
|
if p.WordCount() != 74 {
|
||||||
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount)
|
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount())
|
||||||
}
|
}
|
||||||
|
|
||||||
if p.Summary != simplePageWithMainEnglishWithCJKRunesSummary {
|
if p.Summary != simplePageWithMainEnglishWithCJKRunesSummary {
|
||||||
t.Fatalf("[%s] incorrect Summary for content '%s'. expected %v, got %v", ext, p.plain,
|
t.Fatalf("[%s] incorrect Summary for content '%s'. expected %v, got %v", ext, p.plain,
|
||||||
simplePageWithMainEnglishWithCJKRunesSummary, p.Summary)
|
simplePageWithMainEnglishWithCJKRunesSummary, p.Summary)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithMainEnglishWithCJKRunes)
|
testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithMainEnglishWithCJKRunes)
|
||||||
|
@ -839,15 +840,14 @@ func TestWordCountWithIsCJKLanguageFalse(t *testing.T) {
|
||||||
viper.Set("HasCJKLanguage", true)
|
viper.Set("HasCJKLanguage", true)
|
||||||
|
|
||||||
assertFunc := func(t *testing.T, ext string, p *Page) {
|
assertFunc := func(t *testing.T, ext string, p *Page) {
|
||||||
if p.WordCount != 75 {
|
if p.WordCount() != 75 {
|
||||||
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount)
|
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount())
|
||||||
}
|
}
|
||||||
|
|
||||||
if p.Summary != simplePageWithIsCJKLanguageFalseSummary {
|
if p.Summary != simplePageWithIsCJKLanguageFalseSummary {
|
||||||
t.Fatalf("[%s] incorrect Summary for content '%s'. expected %v, got %v", ext, p.plain,
|
t.Fatalf("[%s] incorrect Summary for content '%s'. expected %v, got %v", ext, p.plain,
|
||||||
simplePageWithIsCJKLanguageFalseSummary, p.Summary)
|
simplePageWithIsCJKLanguageFalseSummary, p.Summary)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithIsCJKLanguageFalse)
|
testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithIsCJKLanguageFalse)
|
||||||
|
@ -857,16 +857,16 @@ func TestWordCountWithIsCJKLanguageFalse(t *testing.T) {
|
||||||
func TestWordCount(t *testing.T) {
|
func TestWordCount(t *testing.T) {
|
||||||
|
|
||||||
assertFunc := func(t *testing.T, ext string, p *Page) {
|
assertFunc := func(t *testing.T, ext string, p *Page) {
|
||||||
if p.WordCount != 483 {
|
if p.WordCount() != 483 {
|
||||||
t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 483, p.WordCount)
|
t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 483, p.WordCount())
|
||||||
}
|
}
|
||||||
|
|
||||||
if p.FuzzyWordCount != 500 {
|
if p.FuzzyWordCount() != 500 {
|
||||||
t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 500, p.WordCount)
|
t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 500, p.WordCount())
|
||||||
}
|
}
|
||||||
|
|
||||||
if p.ReadingTime != 3 {
|
if p.ReadingTime() != 3 {
|
||||||
t.Fatalf("[%s] incorrect min read. expected %v, got %v", ext, 3, p.ReadingTime)
|
t.Fatalf("[%s] incorrect min read. expected %v, got %v", ext, 3, p.ReadingTime())
|
||||||
}
|
}
|
||||||
|
|
||||||
checkTruncation(t, p, true, "long page")
|
checkTruncation(t, p, true, "long page")
|
||||||
|
|
|
@ -55,7 +55,7 @@ func TestSplitPageGroups(t *testing.T) {
|
||||||
// first group 10 in weight
|
// first group 10 in weight
|
||||||
assert.Equal(t, 10, pg.Key)
|
assert.Equal(t, 10, pg.Key)
|
||||||
for _, p := range pg.Pages {
|
for _, p := range pg.Pages {
|
||||||
assert.True(t, p.FuzzyWordCount%2 == 0) // magic test
|
assert.True(t, p.fuzzyWordCount%2 == 0) // magic test
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -70,7 +70,7 @@ func TestSplitPageGroups(t *testing.T) {
|
||||||
// last should have 5 in weight
|
// last should have 5 in weight
|
||||||
assert.Equal(t, 5, pg.Key)
|
assert.Equal(t, 5, pg.Key)
|
||||||
for _, p := range pg.Pages {
|
for _, p := range pg.Pages {
|
||||||
assert.True(t, p.FuzzyWordCount%2 != 0) // magic test
|
assert.True(t, p.fuzzyWordCount%2 != 0) // magic test
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -443,10 +443,10 @@ func TestPage(t *testing.T) {
|
||||||
page21, _ := f2.page(1)
|
page21, _ := f2.page(1)
|
||||||
page2Nil, _ := f2.page(3)
|
page2Nil, _ := f2.page(3)
|
||||||
|
|
||||||
assert.Equal(t, 1, page11.FuzzyWordCount)
|
assert.Equal(t, 3, page11.fuzzyWordCount)
|
||||||
assert.Nil(t, page1Nil)
|
assert.Nil(t, page1Nil)
|
||||||
|
|
||||||
assert.Equal(t, 1, page21.FuzzyWordCount)
|
assert.Equal(t, 3, page21.fuzzyWordCount)
|
||||||
assert.Nil(t, page2Nil)
|
assert.Nil(t, page2Nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -468,7 +468,7 @@ func createTestPages(num int) Pages {
|
||||||
if i%2 == 0 {
|
if i%2 == 0 {
|
||||||
w = 10
|
w = 10
|
||||||
}
|
}
|
||||||
pages[i].FuzzyWordCount = i
|
pages[i].fuzzyWordCount = i + 2
|
||||||
pages[i].Weight = w
|
pages[i].Weight = w
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -33,6 +33,11 @@ import (
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
//There are expected ERROR logging in tests that produces a lot of noise.
|
||||||
|
jww.SetStdoutThreshold(jww.LevelCritical)
|
||||||
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
pageSimpleTitle = `---
|
pageSimpleTitle = `---
|
||||||
title: simple template
|
title: simple template
|
||||||
|
|
Loading…
Reference in a new issue