mirror of
https://github.com/gohugoio/hugo.git
synced 2024-11-30 00:03:34 -05:00
Avoid splitting words for summary
For people using autogenerated summaries, this is one of the hot spots in the memory department. We don't need to split all the content into words to do proper summary truncation.

This is obviously more effective:

```
BenchmarkTestTruncateWordsToWholeSentence-4      300000    4720 ns/op       0 B/op    0 allocs/op
BenchmarkTestTruncateWordsToWholeSentenceOld-4   100000   17699 ns/op    3072 B/op    3 allocs/op
```
This commit is contained in:
parent
74ffb45fbe
commit
bcd434794a
3 changed files with 83 additions and 9 deletions
|
@ -21,6 +21,7 @@ import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"html/template"
|
"html/template"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
"unicode"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
|
|
||||||
"github.com/miekg/mmark"
|
"github.com/miekg/mmark"
|
||||||
|
@ -424,10 +425,55 @@ func TruncateWordsByRune(words []string, max int) (string, bool) {
|
||||||
return strings.Join(words, " "), false
|
return strings.Join(words, " "), false
|
||||||
}
|
}
|
||||||
|
|
||||||
// TruncateWordsToWholeSentence takes content and an int
|
// TruncateWordsToWholeSentence takes content and truncates to whole sentence
|
||||||
// and returns entire sentences from content, delimited by the int
|
// limited by max number of words. It also returns whether it is truncated.
|
||||||
// and whether it's truncated or not.
|
func TruncateWordsToWholeSentence(s string, max int) (string, bool) {
|
||||||
func TruncateWordsToWholeSentence(words []string, max int) (string, bool) {
|
|
||||||
|
var (
|
||||||
|
wordCount = 0
|
||||||
|
lastWordIndex = -1
|
||||||
|
)
|
||||||
|
|
||||||
|
for i, r := range s {
|
||||||
|
if unicode.IsSpace(r) {
|
||||||
|
wordCount++
|
||||||
|
lastWordIndex = i
|
||||||
|
|
||||||
|
if wordCount >= max {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if lastWordIndex == -1 {
|
||||||
|
return s, false
|
||||||
|
}
|
||||||
|
|
||||||
|
endIndex := -1
|
||||||
|
|
||||||
|
for j, r := range s[lastWordIndex:] {
|
||||||
|
if isEndOfSentence(r) {
|
||||||
|
endIndex = j + lastWordIndex + utf8.RuneLen(r)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if endIndex == -1 {
|
||||||
|
return s, false
|
||||||
|
}
|
||||||
|
|
||||||
|
return strings.TrimSpace(s[:endIndex]), endIndex < len(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
func isEndOfSentence(r rune) bool {
|
||||||
|
return r == '.' || r == '?' || r == '!' || r == '"' || r == '\n'
|
||||||
|
}
|
||||||
|
|
||||||
|
// Kept only for benchmark.
|
||||||
|
func truncateWordsToWholeSentenceOld(content string, max int) (string, bool) {
|
||||||
|
words := strings.Fields(content)
|
||||||
|
|
||||||
if max >= len(words) {
|
if max >= len(words) {
|
||||||
return strings.Join(words, " "), false
|
return strings.Join(words, " "), false
|
||||||
}
|
}
|
||||||
|
|
|
@ -64,6 +64,22 @@ func TestBytesToHTML(t *testing.T) {
|
||||||
assert.Equal(t, template.HTML("dobedobedo"), BytesToHTML([]byte("dobedobedo")))
|
assert.Equal(t, template.HTML("dobedobedo"), BytesToHTML([]byte("dobedobedo")))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var benchmarkTruncateString = strings.Repeat("This is a sentence about nothing.", 20)
|
||||||
|
|
||||||
|
func BenchmarkTestTruncateWordsToWholeSentence(b *testing.B) {
|
||||||
|
b.ResetTimer()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
TruncateWordsToWholeSentence(benchmarkTruncateString, SummaryLength)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkTestTruncateWordsToWholeSentenceOld(b *testing.B) {
|
||||||
|
b.ResetTimer()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
truncateWordsToWholeSentenceOld(benchmarkTruncateString, SummaryLength)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestTruncateWordsToWholeSentence(t *testing.T) {
|
func TestTruncateWordsToWholeSentence(t *testing.T) {
|
||||||
type test struct {
|
type test struct {
|
||||||
input, expected string
|
input, expected string
|
||||||
|
@ -77,10 +93,11 @@ func TestTruncateWordsToWholeSentence(t *testing.T) {
|
||||||
{"This is a sentence.", "This is a sentence.", 5, false},
|
{"This is a sentence.", "This is a sentence.", 5, false},
|
||||||
{"This is also a sentence!", "This is also a sentence!", 1, false},
|
{"This is also a sentence!", "This is also a sentence!", 1, false},
|
||||||
{"To be. Or not to be. That's the question.", "To be.", 1, true},
|
{"To be. Or not to be. That's the question.", "To be.", 1, true},
|
||||||
{" \nThis is not a sentence\n ", "This is not a", 4, true},
|
{" \nThis is not a sentence\nAnd this is another", "This is not a sentence", 4, true},
|
||||||
|
{"", "", 10, false},
|
||||||
}
|
}
|
||||||
for i, d := range data {
|
for i, d := range data {
|
||||||
output, truncated := TruncateWordsToWholeSentence(strings.Fields(d.input), d.max)
|
output, truncated := TruncateWordsToWholeSentence(d.input, d.max)
|
||||||
if d.expected != output {
|
if d.expected != output {
|
||||||
t.Errorf("Test %d failed. Expected %q got %q", i, d.expected, output)
|
t.Errorf("Test %d failed. Expected %q got %q", i, d.expected, output)
|
||||||
}
|
}
|
||||||
|
|
|
@ -89,6 +89,7 @@ type Page struct {
|
||||||
plain string // TODO should be []byte
|
plain string // TODO should be []byte
|
||||||
plainWords []string
|
plainWords []string
|
||||||
plainInit sync.Once
|
plainInit sync.Once
|
||||||
|
plainWordsInit sync.Once
|
||||||
renderingConfig *helpers.Blackfriday
|
renderingConfig *helpers.Blackfriday
|
||||||
renderingConfigInit sync.Once
|
renderingConfigInit sync.Once
|
||||||
pageMenus PageMenus
|
pageMenus PageMenus
|
||||||
|
@ -147,14 +148,20 @@ func (p *Page) Plain() string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Page) PlainWords() []string {
|
func (p *Page) PlainWords() []string {
|
||||||
p.initPlain()
|
p.initPlainWords()
|
||||||
return p.plainWords
|
return p.plainWords
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Page) initPlain() {
|
func (p *Page) initPlain() {
|
||||||
p.plainInit.Do(func() {
|
p.plainInit.Do(func() {
|
||||||
p.plain = helpers.StripHTML(string(p.Content))
|
p.plain = helpers.StripHTML(string(p.Content))
|
||||||
p.plainWords = strings.Fields(p.plain)
|
return
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *Page) initPlainWords() {
|
||||||
|
p.plainWordsInit.Do(func() {
|
||||||
|
p.plainWords = strings.Fields(p.Plain())
|
||||||
return
|
return
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -335,7 +342,7 @@ func (p *Page) setAutoSummary() error {
|
||||||
if p.isCJKLanguage {
|
if p.isCJKLanguage {
|
||||||
summary, truncated = helpers.TruncateWordsByRune(p.PlainWords(), helpers.SummaryLength)
|
summary, truncated = helpers.TruncateWordsByRune(p.PlainWords(), helpers.SummaryLength)
|
||||||
} else {
|
} else {
|
||||||
summary, truncated = helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength)
|
summary, truncated = helpers.TruncateWordsToWholeSentence(p.Plain(), helpers.SummaryLength)
|
||||||
}
|
}
|
||||||
p.Summary = template.HTML(summary)
|
p.Summary = template.HTML(summary)
|
||||||
p.Truncated = truncated
|
p.Truncated = truncated
|
||||||
|
@ -479,6 +486,10 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Page) analyzePage() {
|
func (p *Page) analyzePage() {
|
||||||
|
// TODO(bep)
|
||||||
|
if true {
|
||||||
|
return
|
||||||
|
}
|
||||||
if p.isCJKLanguage {
|
if p.isCJKLanguage {
|
||||||
p.WordCount = 0
|
p.WordCount = 0
|
||||||
for _, word := range p.PlainWords() {
|
for _, word := range p.PlainWords() {
|
||||||
|
|
Loading…
Reference in a new issue