mirror of
https://github.com/gohugoio/hugo.git
synced 2024-11-21 20:46:30 -05:00
Lazy calculate WordCount, ReadingTime and FuzzyWordCount
This avoids having to execute these expensive operations for sites that do not use these values.

This commit sums up a set of word-counting and auto-summary related performance improvements. The size of the effect depends on which features your site uses, but a benchmark from 4 Hugo sites in the wild shows promise:

```
benchmark           old ns/op       new ns/op       delta
BenchmarkHugo-4     21293005843     20032857342     -5.92%

benchmark           old allocs      new allocs      delta
BenchmarkHugo-4     65290922        65186032        -0.16%

benchmark           old bytes       new bytes       delta
BenchmarkHugo-4     9771213416      9681866464      -0.91%
```

Closes #2378
This commit is contained in:
parent
4abaec5c04
commit
dd45e6d7e5
7 changed files with 103 additions and 57 deletions
|
@ -138,19 +138,28 @@ func StripHTML(s string) string {
|
|||
// Walk through the string removing all tags
|
||||
b := bp.GetBuffer()
|
||||
defer bp.PutBuffer(b)
|
||||
|
||||
inTag := false
|
||||
var inTag, isSpace, wasSpace bool
|
||||
for _, r := range s {
|
||||
switch r {
|
||||
case '<':
|
||||
if !inTag {
|
||||
isSpace = false
|
||||
}
|
||||
|
||||
switch {
|
||||
case r == '<':
|
||||
inTag = true
|
||||
case '>':
|
||||
case r == '>':
|
||||
inTag = false
|
||||
case unicode.IsSpace(r):
|
||||
isSpace = true
|
||||
fallthrough
|
||||
default:
|
||||
if !inTag {
|
||||
if !inTag && (!isSpace || (isSpace && !wasSpace)) {
|
||||
b.WriteRune(r)
|
||||
}
|
||||
}
|
||||
|
||||
wasSpace = isSpace
|
||||
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
|
|
@ -34,11 +34,22 @@ func TestStripHTML(t *testing.T) {
|
|||
}
|
||||
data := []test{
|
||||
{"<h1>strip h1 tag <h1>", "strip h1 tag "},
|
||||
{"<p> strip p tag </p>", " strip p tag \n"},
|
||||
{"<p> strip p tag </p>", " strip p tag "},
|
||||
{"</br> strip br<br>", " strip br\n"},
|
||||
{"</br> strip br2<br />", " strip br2\n"},
|
||||
{"This <strong>is</strong> a\nnewline", "This is a newline"},
|
||||
{"No Tags", "No Tags"},
|
||||
{`<p>Summary Next Line.
|
||||
<figure >
|
||||
|
||||
<img src="/not/real" />
|
||||
|
||||
|
||||
</figure>
|
||||
.
|
||||
More text here.</p>
|
||||
|
||||
<p>Some more text</p>`, "Summary Next Line. . More text here.\nSome more text\n"},
|
||||
}
|
||||
for i, d := range data {
|
||||
output := StripHTML(d.input)
|
||||
|
|
|
@ -107,9 +107,10 @@ type Source struct {
|
|||
source.File
|
||||
}
|
||||
type PageMeta struct {
|
||||
WordCount int
|
||||
FuzzyWordCount int
|
||||
ReadingTime int
|
||||
wordCount int
|
||||
fuzzyWordCount int
|
||||
readingTime int
|
||||
pageMetaInit sync.Once
|
||||
Weight int
|
||||
}
|
||||
|
||||
|
@ -485,28 +486,48 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) {
|
|||
return int64(len(p.rawContent)), nil
|
||||
}
|
||||
|
||||
func (p *Page) WordCount() int {
|
||||
p.analyzePage()
|
||||
return p.wordCount
|
||||
}
|
||||
|
||||
func (p *Page) ReadingTime() int {
|
||||
p.analyzePage()
|
||||
return p.readingTime
|
||||
}
|
||||
|
||||
func (p *Page) FuzzyWordCount() int {
|
||||
p.analyzePage()
|
||||
return p.fuzzyWordCount
|
||||
}
|
||||
|
||||
func (p *Page) analyzePage() {
|
||||
if p.isCJKLanguage {
|
||||
p.WordCount = 0
|
||||
for _, word := range p.PlainWords() {
|
||||
runeCount := utf8.RuneCountInString(word)
|
||||
if len(word) == runeCount {
|
||||
p.WordCount++
|
||||
} else {
|
||||
p.WordCount += runeCount
|
||||
p.pageMetaInit.Do(func() {
|
||||
if p.isCJKLanguage {
|
||||
p.wordCount = 0
|
||||
for _, word := range p.PlainWords() {
|
||||
runeCount := utf8.RuneCountInString(word)
|
||||
if len(word) == runeCount {
|
||||
p.wordCount++
|
||||
} else {
|
||||
p.wordCount += runeCount
|
||||
}
|
||||
}
|
||||
} else {
|
||||
p.wordCount = helpers.TotalWords(p.Plain())
|
||||
}
|
||||
} else {
|
||||
p.WordCount = len(p.PlainWords())
|
||||
}
|
||||
|
||||
p.FuzzyWordCount = (p.WordCount + 100) / 100 * 100
|
||||
// TODO(bep) is set in a test. Fix that.
|
||||
if p.fuzzyWordCount == 0 {
|
||||
p.fuzzyWordCount = (p.wordCount + 100) / 100 * 100
|
||||
}
|
||||
|
||||
if p.isCJKLanguage {
|
||||
p.ReadingTime = (p.WordCount + 500) / 501
|
||||
} else {
|
||||
p.ReadingTime = (p.WordCount + 212) / 213
|
||||
}
|
||||
if p.isCJKLanguage {
|
||||
p.readingTime = (p.wordCount + 500) / 501
|
||||
} else {
|
||||
p.readingTime = (p.wordCount + 212) / 213
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func (p *Page) permalink() (*url.URL, error) {
|
||||
|
|
|
@ -95,11 +95,11 @@ func TestLimit(t *testing.T) {
|
|||
|
||||
func TestPageSortReverse(t *testing.T) {
|
||||
p1 := createSortTestPages(10)
|
||||
assert.Equal(t, 0, p1[0].FuzzyWordCount)
|
||||
assert.Equal(t, 9, p1[9].FuzzyWordCount)
|
||||
assert.Equal(t, 0, p1[0].fuzzyWordCount)
|
||||
assert.Equal(t, 9, p1[9].fuzzyWordCount)
|
||||
p2 := p1.Reverse()
|
||||
assert.Equal(t, 9, p2[0].FuzzyWordCount)
|
||||
assert.Equal(t, 0, p2[9].FuzzyWordCount)
|
||||
assert.Equal(t, 9, p2[0].fuzzyWordCount)
|
||||
assert.Equal(t, 0, p2[9].fuzzyWordCount)
|
||||
// cached
|
||||
assert.True(t, probablyEqualPages(p2, p1.Reverse()))
|
||||
}
|
||||
|
@ -149,7 +149,7 @@ func createSortTestPages(num int) Pages {
|
|||
if i%2 == 0 {
|
||||
w = 10
|
||||
}
|
||||
pages[i].FuzzyWordCount = i
|
||||
pages[i].fuzzyWordCount = i
|
||||
pages[i].Weight = w
|
||||
pages[i].Description = "initial"
|
||||
}
|
||||
|
|
|
@ -504,10 +504,13 @@ func checkPageContent(t *testing.T, page *Page, content string, msg ...interface
|
|||
}
|
||||
|
||||
func normalizeContent(c string) string {
|
||||
norm := strings.Replace(c, "\n", "", -1)
|
||||
norm := c
|
||||
norm = strings.Replace(norm, "\n", " ", -1)
|
||||
norm = strings.Replace(norm, " ", " ", -1)
|
||||
norm = strings.Replace(norm, " ", " ", -1)
|
||||
norm = strings.Replace(norm, " ", " ", -1)
|
||||
norm = strings.Replace(norm, "p> ", "p>", -1)
|
||||
norm = strings.Replace(norm, "> <", "> <", -1)
|
||||
return strings.TrimSpace(norm)
|
||||
}
|
||||
|
||||
|
@ -710,8 +713,8 @@ func TestPageWithShortCodeInSummary(t *testing.T) {
|
|||
|
||||
assertFunc := func(t *testing.T, ext string, p *Page) {
|
||||
checkPageTitle(t, p, "Simple")
|
||||
checkPageContent(t, p, normalizeExpected(ext, "<p>Summary Next Line. <figure > <img src=\"/not/real\" /> </figure>.\nMore text here.</p><p>Some more text</p>"), ext)
|
||||
checkPageSummary(t, p, "Summary Next Line. . More text here. Some more text", ext)
|
||||
checkPageContent(t, p, normalizeExpected(ext, "<p>Summary Next Line. \n<figure >\n \n <img src=\"/not/real\" />\n \n \n</figure>\n.\nMore text here.</p>\n\n<p>Some more text</p>\n"))
|
||||
checkPageSummary(t, p, "Summary Next Line. . More text here. Some more text")
|
||||
checkPageType(t, p, "page")
|
||||
checkPageLayout(t, p, "page/single.html", "_default/single.html", "theme/page/single.html", "theme/_default/single.html")
|
||||
}
|
||||
|
@ -793,8 +796,8 @@ func TestWordCountWithAllCJKRunesWithoutHasCJKLanguage(t *testing.T) {
|
|||
testCommonResetState()
|
||||
|
||||
assertFunc := func(t *testing.T, ext string, p *Page) {
|
||||
if p.WordCount != 8 {
|
||||
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 8, p.WordCount)
|
||||
if p.WordCount() != 8 {
|
||||
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 8, p.WordCount())
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -806,11 +809,10 @@ func TestWordCountWithAllCJKRunesHasCJKLanguage(t *testing.T) {
|
|||
viper.Set("HasCJKLanguage", true)
|
||||
|
||||
assertFunc := func(t *testing.T, ext string, p *Page) {
|
||||
if p.WordCount != 15 {
|
||||
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 15, p.WordCount)
|
||||
if p.WordCount() != 15 {
|
||||
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 15, p.WordCount())
|
||||
}
|
||||
}
|
||||
|
||||
testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithAllCJKRunes)
|
||||
}
|
||||
|
||||
|
@ -820,15 +822,14 @@ func TestWordCountWithMainEnglishWithCJKRunes(t *testing.T) {
|
|||
viper.Set("HasCJKLanguage", true)
|
||||
|
||||
assertFunc := func(t *testing.T, ext string, p *Page) {
|
||||
if p.WordCount != 74 {
|
||||
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount)
|
||||
if p.WordCount() != 74 {
|
||||
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount())
|
||||
}
|
||||
|
||||
if p.Summary != simplePageWithMainEnglishWithCJKRunesSummary {
|
||||
t.Fatalf("[%s] incorrect Summary for content '%s'. expected %v, got %v", ext, p.plain,
|
||||
simplePageWithMainEnglishWithCJKRunesSummary, p.Summary)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithMainEnglishWithCJKRunes)
|
||||
|
@ -839,15 +840,14 @@ func TestWordCountWithIsCJKLanguageFalse(t *testing.T) {
|
|||
viper.Set("HasCJKLanguage", true)
|
||||
|
||||
assertFunc := func(t *testing.T, ext string, p *Page) {
|
||||
if p.WordCount != 75 {
|
||||
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount)
|
||||
if p.WordCount() != 75 {
|
||||
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount())
|
||||
}
|
||||
|
||||
if p.Summary != simplePageWithIsCJKLanguageFalseSummary {
|
||||
t.Fatalf("[%s] incorrect Summary for content '%s'. expected %v, got %v", ext, p.plain,
|
||||
simplePageWithIsCJKLanguageFalseSummary, p.Summary)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithIsCJKLanguageFalse)
|
||||
|
@ -857,16 +857,16 @@ func TestWordCountWithIsCJKLanguageFalse(t *testing.T) {
|
|||
func TestWordCount(t *testing.T) {
|
||||
|
||||
assertFunc := func(t *testing.T, ext string, p *Page) {
|
||||
if p.WordCount != 483 {
|
||||
t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 483, p.WordCount)
|
||||
if p.WordCount() != 483 {
|
||||
t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 483, p.WordCount())
|
||||
}
|
||||
|
||||
if p.FuzzyWordCount != 500 {
|
||||
t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 500, p.WordCount)
|
||||
if p.FuzzyWordCount() != 500 {
|
||||
t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 500, p.WordCount())
|
||||
}
|
||||
|
||||
if p.ReadingTime != 3 {
|
||||
t.Fatalf("[%s] incorrect min read. expected %v, got %v", ext, 3, p.ReadingTime)
|
||||
if p.ReadingTime() != 3 {
|
||||
t.Fatalf("[%s] incorrect min read. expected %v, got %v", ext, 3, p.ReadingTime())
|
||||
}
|
||||
|
||||
checkTruncation(t, p, true, "long page")
|
||||
|
|
|
@ -55,7 +55,7 @@ func TestSplitPageGroups(t *testing.T) {
|
|||
// first group 10 in weight
|
||||
assert.Equal(t, 10, pg.Key)
|
||||
for _, p := range pg.Pages {
|
||||
assert.True(t, p.FuzzyWordCount%2 == 0) // magic test
|
||||
assert.True(t, p.fuzzyWordCount%2 == 0) // magic test
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -70,7 +70,7 @@ func TestSplitPageGroups(t *testing.T) {
|
|||
// last should have 5 in weight
|
||||
assert.Equal(t, 5, pg.Key)
|
||||
for _, p := range pg.Pages {
|
||||
assert.True(t, p.FuzzyWordCount%2 != 0) // magic test
|
||||
assert.True(t, p.fuzzyWordCount%2 != 0) // magic test
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -443,10 +443,10 @@ func TestPage(t *testing.T) {
|
|||
page21, _ := f2.page(1)
|
||||
page2Nil, _ := f2.page(3)
|
||||
|
||||
assert.Equal(t, 1, page11.FuzzyWordCount)
|
||||
assert.Equal(t, 3, page11.fuzzyWordCount)
|
||||
assert.Nil(t, page1Nil)
|
||||
|
||||
assert.Equal(t, 1, page21.FuzzyWordCount)
|
||||
assert.Equal(t, 3, page21.fuzzyWordCount)
|
||||
assert.Nil(t, page2Nil)
|
||||
}
|
||||
|
||||
|
@ -468,7 +468,7 @@ func createTestPages(num int) Pages {
|
|||
if i%2 == 0 {
|
||||
w = 10
|
||||
}
|
||||
pages[i].FuzzyWordCount = i
|
||||
pages[i].fuzzyWordCount = i + 2
|
||||
pages[i].Weight = w
|
||||
}
|
||||
|
||||
|
|
|
@ -33,6 +33,11 @@ import (
|
|||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func init() {
|
||||
//There are expected ERROR logging in tests that produces a lot of noise.
|
||||
jww.SetStdoutThreshold(jww.LevelCritical)
|
||||
}
|
||||
|
||||
const (
|
||||
pageSimpleTitle = `---
|
||||
title: simple template
|
||||
|
|
Loading…
Reference in a new issue