mirror of
https://github.com/gohugoio/hugo.git
synced 2024-11-07 20:30:36 -05:00
Don't count HTML markup in auto summaries
This commit also fixes a bug where a `</picture>` end tag was wrongly used to detect a end paragraph. This should be very rare, though. Closes #12837
This commit is contained in:
parent
84ee00bbc2
commit
3d6baedaec
3 changed files with 77 additions and 2 deletions
|
@ -593,7 +593,7 @@ func TestPageSummary(t *testing.T) {
|
|||
// Source is not Asciidoctor- or RST-compatible so don't test them
|
||||
if ext != "ad" && ext != "rst" {
|
||||
checkPageContent(t, p, normalizeExpected(ext, "<p><a href=\"https://lipsum.com/\">Lorem ipsum</a> dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>\n\n<p>Additional text.</p>\n\n<p>Further text.</p>\n"), ext)
|
||||
checkPageSummary(t, p, normalizeExpected(ext, "<p><a href=\"https://lipsum.com/\">Lorem ipsum</a> dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"), ext)
|
||||
checkPageSummary(t, p, normalizeExpected(ext, "<p><a href=\"https://lipsum.com/\">Lorem ipsum</a> dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p><p>Additional text.</p>"), ext)
|
||||
}
|
||||
checkPageType(t, p, "page")
|
||||
}
|
||||
|
|
|
@ -161,6 +161,16 @@ func (s *HtmlSummary) resolveParagraphTagAndSetWrapper(mt media.Type) tagReStart
|
|||
return ptag
|
||||
}
|
||||
|
||||
// Avoid counting words that are most likely HTML tokens.
|
||||
var (
|
||||
isProbablyHTMLTag = regexp.MustCompile(`^<\/?[A-Za-z]+>?$`)
|
||||
isProablyHTMLAttribute = regexp.MustCompile(`^[A-Za-z]+=["']`)
|
||||
)
|
||||
|
||||
func isProbablyHTMLToken(s string) bool {
|
||||
return s == ">" || isProbablyHTMLTag.MatchString(s) || isProablyHTMLAttribute.MatchString(s)
|
||||
}
|
||||
|
||||
// ExtractSummaryFromHTML extracts a summary from the given HTML content.
|
||||
func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK bool) (result HtmlSummary) {
|
||||
result.source = input
|
||||
|
@ -173,6 +183,14 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo
|
|||
var count int
|
||||
|
||||
countWord := func(word string) int {
|
||||
word = strings.TrimSpace(word)
|
||||
if len(word) == 0 {
|
||||
return 0
|
||||
}
|
||||
if isProbablyHTMLToken(word) {
|
||||
return 0
|
||||
}
|
||||
|
||||
if isCJK {
|
||||
word = tpl.StripHTML(word)
|
||||
runeCount := utf8.RuneCountInString(word)
|
||||
|
@ -193,7 +211,7 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo
|
|||
|
||||
for j := result.WrapperStart.High; j < high; {
|
||||
s := input[j:]
|
||||
closingIndex := strings.Index(s, "</"+ptag.tagName)
|
||||
closingIndex := strings.Index(s, "</"+ptag.tagName+">")
|
||||
|
||||
if closingIndex == -1 {
|
||||
break
|
||||
|
|
|
@ -49,6 +49,46 @@ func TestExtractSummaryFromHTML(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// See https://discourse.gohugo.io/t/automatic-summarys-summarylength-seems-broken-in-the-case-of-plainify/51466/4
|
||||
// Also issue 12837
|
||||
func TestExtractSummaryFromHTMLLotsOfHTMLInSummary(t *testing.T) {
|
||||
c := qt.New(t)
|
||||
|
||||
input := `
|
||||
<p>
|
||||
<div>
|
||||
<picture>
|
||||
<img src="imgs/1.jpg" alt="1"/>
|
||||
</picture>
|
||||
<picture>
|
||||
<img src="imgs/2.jpg" alt="2"/>
|
||||
</picture>
|
||||
<picture>
|
||||
<img src="imgs/3.jpg" alt="3"/>
|
||||
</picture>
|
||||
<picture>
|
||||
<img src="imgs/4.jpg" alt="4"/>
|
||||
</picture>
|
||||
<picture>
|
||||
<img src="imgs/5.jpg" alt="5"/>
|
||||
</picture>
|
||||
</div>
|
||||
</p>
|
||||
<p>
|
||||
This is a story about a cat.
|
||||
</p>
|
||||
<p>
|
||||
The cat was white and fluffy.
|
||||
</p>
|
||||
<p>
|
||||
And it liked milk.
|
||||
</p>
|
||||
`
|
||||
|
||||
summary := ExtractSummaryFromHTML(media.Builtin.MarkdownType, input, 10, false)
|
||||
c.Assert(strings.HasSuffix(summary.Summary(), "<p>\nThis is a story about a cat.\n</p>\n<p>\nThe cat was white and fluffy.\n</p>"), qt.IsTrue)
|
||||
}
|
||||
|
||||
func TestExtractSummaryFromHTMLWithDivider(t *testing.T) {
|
||||
c := qt.New(t)
|
||||
|
||||
|
@ -114,6 +154,23 @@ func TestExpandDivider(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestIsProbablyHTMLToken(t *testing.T) {
|
||||
c := qt.New(t)
|
||||
|
||||
for i, test := range []struct {
|
||||
input string
|
||||
expect bool
|
||||
}{
|
||||
{"<p>", true},
|
||||
{"<p", true},
|
||||
{"width=\"32\"", true},
|
||||
{"width='32'", true},
|
||||
{"<p>Æøå", false},
|
||||
} {
|
||||
c.Assert(isProbablyHTMLToken(test.input), qt.Equals, test.expect, qt.Commentf("[%d] Test.expect %q", i, test.input))
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSummaryFromHTML(b *testing.B) {
|
||||
b.StopTimer()
|
||||
input := "<p>First paragraph</p><p>Second paragraph</p>"
|
||||
|
|
Loading…
Reference in a new issue