mirror of
https://github.com/gohugoio/hugo.git
synced 2025-04-20 12:03:09 +00:00
Fix Plainify edge cases
This commit replaces the main part of `helpers.StripHTML` with Go's implementation in its html/template package.

It's a little slower, but correctness is more important:

```bash
BenchmarkStripHTMLOld-10    680316    1764 ns/op     728 B/op     4 allocs/op
BenchmarkStripHTMLNew-10    384520    3099 ns/op    2089 B/op    10 allocs/op
```

Fixes #9199
Fixes #9909
Closes #9410
This commit is contained in:
parent
cd0112a05a
commit
3854a6fa6c
10 changed files with 103 additions and 85 deletions
|
@ -34,7 +34,6 @@ import (
|
||||||
|
|
||||||
"github.com/gohugoio/hugo/markup"
|
"github.com/gohugoio/hugo/markup"
|
||||||
|
|
||||||
bp "github.com/gohugoio/hugo/bufferpool"
|
|
||||||
"github.com/gohugoio/hugo/config"
|
"github.com/gohugoio/hugo/config"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -104,45 +103,6 @@ func NewContentSpec(cfg config.Provider, logger loggers.Logger, contentFs afero.
|
||||||
return spec, nil
|
return spec, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
var stripHTMLReplacer = strings.NewReplacer("\n", " ", "</p>", "\n", "<br>", "\n", "<br />", "\n")
|
|
||||||
|
|
||||||
// StripHTML accepts a string, strips out all HTML tags and returns it.
|
|
||||||
func StripHTML(s string) string {
|
|
||||||
// Shortcut strings with no tags in them
|
|
||||||
if !strings.ContainsAny(s, "<>") {
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
s = stripHTMLReplacer.Replace(s)
|
|
||||||
|
|
||||||
// Walk through the string removing all tags
|
|
||||||
b := bp.GetBuffer()
|
|
||||||
defer bp.PutBuffer(b)
|
|
||||||
var inTag, isSpace, wasSpace bool
|
|
||||||
for _, r := range s {
|
|
||||||
if !inTag {
|
|
||||||
isSpace = false
|
|
||||||
}
|
|
||||||
|
|
||||||
switch {
|
|
||||||
case r == '<':
|
|
||||||
inTag = true
|
|
||||||
case r == '>':
|
|
||||||
inTag = false
|
|
||||||
case unicode.IsSpace(r):
|
|
||||||
isSpace = true
|
|
||||||
fallthrough
|
|
||||||
default:
|
|
||||||
if !inTag && (!isSpace || (isSpace && !wasSpace)) {
|
|
||||||
b.WriteRune(r)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
wasSpace = isSpace
|
|
||||||
|
|
||||||
}
|
|
||||||
return b.String()
|
|
||||||
}
|
|
||||||
|
|
||||||
// stripEmptyNav strips out empty <nav> tags from content.
|
// stripEmptyNav strips out empty <nav> tags from content.
|
||||||
func stripEmptyNav(in []byte) []byte {
|
func stripEmptyNav(in []byte) []byte {
|
||||||
return bytes.Replace(in, []byte("<nav>\n</nav>\n\n"), []byte(``), -1)
|
return bytes.Replace(in, []byte("<nav>\n</nav>\n\n"), []byte(``), -1)
|
||||||
|
|
|
@ -52,44 +52,6 @@ func TestTrimShortHTML(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestStripHTML(t *testing.T) {
|
|
||||||
type test struct {
|
|
||||||
input, expected string
|
|
||||||
}
|
|
||||||
data := []test{
|
|
||||||
{"<h1>strip h1 tag <h1>", "strip h1 tag "},
|
|
||||||
{"<p> strip p tag </p>", " strip p tag "},
|
|
||||||
{"</br> strip br<br>", " strip br\n"},
|
|
||||||
{"</br> strip br2<br />", " strip br2\n"},
|
|
||||||
{"This <strong>is</strong> a\nnewline", "This is a newline"},
|
|
||||||
{"No Tags", "No Tags"},
|
|
||||||
{`<p>Summary Next Line.
|
|
||||||
<figure >
|
|
||||||
|
|
||||||
<img src="/not/real" />
|
|
||||||
|
|
||||||
|
|
||||||
</figure>
|
|
||||||
.
|
|
||||||
More text here.</p>
|
|
||||||
|
|
||||||
<p>Some more text</p>`, "Summary Next Line. . More text here.\nSome more text\n"},
|
|
||||||
}
|
|
||||||
for i, d := range data {
|
|
||||||
output := StripHTML(d.input)
|
|
||||||
if d.expected != output {
|
|
||||||
t.Errorf("Test %d failed. Expected %q got %q", i, d.expected, output)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func BenchmarkStripHTML(b *testing.B) {
|
|
||||||
b.ResetTimer()
|
|
||||||
for i := 0; i < b.N; i++ {
|
|
||||||
StripHTML(tstHTMLContent)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestStripEmptyNav(t *testing.T) {
|
func TestStripEmptyNav(t *testing.T) {
|
||||||
c := qt.New(t)
|
c := qt.New(t)
|
||||||
cleaned := stripEmptyNav([]byte("do<nav>\n</nav>\n\nbedobedo"))
|
cleaned := stripEmptyNav([]byte("do<nav>\n</nav>\n\nbedobedo"))
|
||||||
|
|
|
@ -201,7 +201,7 @@ func newPageContentOutput(p *pageState, po *pageOutput) (*pageContentOutput, err
|
||||||
})
|
})
|
||||||
|
|
||||||
cp.initPlain = cp.initMain.Branch(func() (any, error) {
|
cp.initPlain = cp.initMain.Branch(func() (any, error) {
|
||||||
cp.plain = helpers.StripHTML(string(cp.content))
|
cp.plain = tpl.StripHTML(string(cp.content))
|
||||||
cp.plainWords = strings.Fields(cp.plain)
|
cp.plainWords = strings.Fields(cp.plain)
|
||||||
cp.setWordCounts(p.m.isCJKLanguage)
|
cp.setWordCounts(p.m.isCJKLanguage)
|
||||||
|
|
||||||
|
|
|
@ -26,6 +26,7 @@ import (
|
||||||
"github.com/gohugoio/hugo/htesting"
|
"github.com/gohugoio/hugo/htesting"
|
||||||
"github.com/gohugoio/hugo/markup/asciidocext"
|
"github.com/gohugoio/hugo/markup/asciidocext"
|
||||||
"github.com/gohugoio/hugo/markup/rst"
|
"github.com/gohugoio/hugo/markup/rst"
|
||||||
|
"github.com/gohugoio/hugo/tpl"
|
||||||
|
|
||||||
"github.com/gohugoio/hugo/config"
|
"github.com/gohugoio/hugo/config"
|
||||||
|
|
||||||
|
@ -40,7 +41,6 @@ import (
|
||||||
|
|
||||||
qt "github.com/frankban/quicktest"
|
qt "github.com/frankban/quicktest"
|
||||||
"github.com/gohugoio/hugo/deps"
|
"github.com/gohugoio/hugo/deps"
|
||||||
"github.com/gohugoio/hugo/helpers"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
@ -351,7 +351,7 @@ func normalizeExpected(ext, str string) string {
|
||||||
default:
|
default:
|
||||||
return str
|
return str
|
||||||
case "html":
|
case "html":
|
||||||
return strings.Trim(helpers.StripHTML(str), " ")
|
return strings.Trim(tpl.StripHTML(str), " ")
|
||||||
case "ad":
|
case "ad":
|
||||||
paragraphs := strings.Split(str, "</p>")
|
paragraphs := strings.Split(str, "</p>")
|
||||||
expected := ""
|
expected := ""
|
||||||
|
@ -1736,6 +1736,7 @@ Len Summary: {{ len .Summary }}
|
||||||
Len Content: {{ len .Content }}
|
Len Content: {{ len .Content }}
|
||||||
|
|
||||||
SUMMARY:{{ .Summary }}:{{ len .Summary }}:END
|
SUMMARY:{{ .Summary }}:{{ len .Summary }}:END
|
||||||
|
|
||||||
`}
|
`}
|
||||||
|
|
||||||
b := newTestSitesBuilder(t)
|
b := newTestSitesBuilder(t)
|
||||||
|
|
|
@ -34,3 +34,8 @@ func (t *Template) Prepare() (*template.Template, error) {
|
||||||
}
|
}
|
||||||
return t.text, nil
|
return t.text, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// See https://github.com/golang/go/issues/5884
|
||||||
|
func StripTags(html string) string {
|
||||||
|
return stripTags(html)
|
||||||
|
}
|
||||||
|
|
|
@ -25,6 +25,7 @@ import (
|
||||||
"github.com/gohugoio/hugo/common/text"
|
"github.com/gohugoio/hugo/common/text"
|
||||||
"github.com/gohugoio/hugo/deps"
|
"github.com/gohugoio/hugo/deps"
|
||||||
"github.com/gohugoio/hugo/helpers"
|
"github.com/gohugoio/hugo/helpers"
|
||||||
|
"github.com/gohugoio/hugo/tpl"
|
||||||
|
|
||||||
"github.com/spf13/cast"
|
"github.com/spf13/cast"
|
||||||
)
|
)
|
||||||
|
@ -52,7 +53,7 @@ func (ns *Namespace) CountRunes(s any) (int, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
counter := 0
|
counter := 0
|
||||||
for _, r := range helpers.StripHTML(ss) {
|
for _, r := range tpl.StripHTML(ss) {
|
||||||
if !helpers.IsWhitespace(r) {
|
if !helpers.IsWhitespace(r) {
|
||||||
counter++
|
counter++
|
||||||
}
|
}
|
||||||
|
@ -83,11 +84,11 @@ func (ns *Namespace) CountWords(s any) (int, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if !isCJKLanguage {
|
if !isCJKLanguage {
|
||||||
return len(strings.Fields(helpers.StripHTML((ss)))), nil
|
return len(strings.Fields(tpl.StripHTML(ss))), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
counter := 0
|
counter := 0
|
||||||
for _, word := range strings.Fields(helpers.StripHTML(ss)) {
|
for _, word := range strings.Fields(tpl.StripHTML(ss)) {
|
||||||
runeCount := utf8.RuneCountInString(word)
|
runeCount := utf8.RuneCountInString(word)
|
||||||
if len(word) == runeCount {
|
if len(word) == runeCount {
|
||||||
counter++
|
counter++
|
||||||
|
|
|
@ -18,9 +18,14 @@ import (
|
||||||
"io"
|
"io"
|
||||||
"reflect"
|
"reflect"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
"unicode"
|
||||||
|
|
||||||
|
bp "github.com/gohugoio/hugo/bufferpool"
|
||||||
|
|
||||||
"github.com/gohugoio/hugo/output"
|
"github.com/gohugoio/hugo/output"
|
||||||
|
|
||||||
|
htmltemplate "github.com/gohugoio/hugo/tpl/internal/go_templates/htmltemplate"
|
||||||
texttemplate "github.com/gohugoio/hugo/tpl/internal/go_templates/texttemplate"
|
texttemplate "github.com/gohugoio/hugo/tpl/internal/go_templates/texttemplate"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -163,3 +168,44 @@ func GetHasLockFromContext(ctx context.Context) bool {
|
||||||
func SetHasLockInContext(ctx context.Context, hasLock bool) context.Context {
|
func SetHasLockInContext(ctx context.Context, hasLock bool) context.Context {
|
||||||
return context.WithValue(ctx, texttemplate.HasLockContextKey, hasLock)
|
return context.WithValue(ctx, texttemplate.HasLockContextKey, hasLock)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const hugoNewLinePlaceholder = "___hugonl_"
|
||||||
|
|
||||||
|
var (
|
||||||
|
stripHTMLReplacerPre = strings.NewReplacer("\n", " ", "</p>", hugoNewLinePlaceholder, "<br>", hugoNewLinePlaceholder, "<br />", hugoNewLinePlaceholder)
|
||||||
|
whitespaceRe = regexp.MustCompile(`\s+`)
|
||||||
|
)
|
||||||
|
|
||||||
|
// StripHTML strips out all HTML tags in s.
|
||||||
|
func StripHTML(s string) string {
|
||||||
|
// Shortcut strings with no tags in them
|
||||||
|
if !strings.ContainsAny(s, "<>") {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
pre := stripHTMLReplacerPre.Replace(s)
|
||||||
|
preReplaced := pre != s
|
||||||
|
|
||||||
|
s = htmltemplate.StripTags(pre)
|
||||||
|
|
||||||
|
if preReplaced {
|
||||||
|
s = strings.ReplaceAll(s, hugoNewLinePlaceholder, "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
var wasSpace bool
|
||||||
|
b := bp.GetBuffer()
|
||||||
|
defer bp.PutBuffer(b)
|
||||||
|
for _, r := range s {
|
||||||
|
isSpace := unicode.IsSpace(r)
|
||||||
|
if !(isSpace && wasSpace) {
|
||||||
|
b.WriteRune(r)
|
||||||
|
}
|
||||||
|
wasSpace = isSpace
|
||||||
|
}
|
||||||
|
|
||||||
|
if b.Len() > 0 {
|
||||||
|
s = b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
|
@ -28,3 +28,44 @@ func TestExtractBaseof(t *testing.T) {
|
||||||
c.Assert(extractBaseOf("not baseof for you"), qt.Equals, "")
|
c.Assert(extractBaseOf("not baseof for you"), qt.Equals, "")
|
||||||
c.Assert(extractBaseOf("template: blog/baseof.html:23:11:"), qt.Equals, "blog/baseof.html")
|
c.Assert(extractBaseOf("template: blog/baseof.html:23:11:"), qt.Equals, "blog/baseof.html")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestStripHTML(t *testing.T) {
|
||||||
|
type test struct {
|
||||||
|
input, expected string
|
||||||
|
}
|
||||||
|
data := []test{
|
||||||
|
{"<h1>strip h1 tag <h1>", "strip h1 tag "},
|
||||||
|
{"<p> strip p tag </p>", " strip p tag "},
|
||||||
|
{"</br> strip br<br>", " strip br\n"},
|
||||||
|
{"</br> strip br2<br />", " strip br2\n"},
|
||||||
|
{"This <strong>is</strong> a\nnewline", "This is a newline"},
|
||||||
|
{"No Tags", "No Tags"},
|
||||||
|
{`<p>Summary Next Line.
|
||||||
|
<figure >
|
||||||
|
|
||||||
|
<img src="/not/real" />
|
||||||
|
|
||||||
|
|
||||||
|
</figure>
|
||||||
|
.
|
||||||
|
More text here.</p>
|
||||||
|
|
||||||
|
<p>Some more text</p>`, "Summary Next Line. . More text here.\nSome more text\n"},
|
||||||
|
|
||||||
|
// Issue 9199
|
||||||
|
{"<div data-action='click->my-controller#doThing'>qwe</div>", "qwe"},
|
||||||
|
{"Hello, World!", "Hello, World!"},
|
||||||
|
{"foo&bar", "foo&bar"},
|
||||||
|
{`Hello <a href="www.example.com/">World</a>!`, "Hello World!"},
|
||||||
|
{"Foo <textarea>Bar</textarea> Baz", "Foo Bar Baz"},
|
||||||
|
{"Foo <!-- Bar --> Baz", "Foo Baz"},
|
||||||
|
}
|
||||||
|
for i, d := range data {
|
||||||
|
output := StripHTML(d.input)
|
||||||
|
if d.expected != output {
|
||||||
|
t.Errorf("Test %d failed. Expected %q got %q", i, d.expected, output)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const tstHTMLContent = "<!DOCTYPE html><html><head><script src=\"http://two/foobar.js\"></script></head><body><nav><ul><li hugo-nav=\"section_0\"></li><li hugo-nav=\"section_1\"></li></ul></nav><article>content <a href=\"http://two/foobar\">foobar</a>. Follow up</article><p>This is some text.<br>And some more.</p></body></html>"
|
||||||
|
|
|
@ -22,6 +22,7 @@ import (
|
||||||
"github.com/gohugoio/hugo/cache/namedmemcache"
|
"github.com/gohugoio/hugo/cache/namedmemcache"
|
||||||
"github.com/gohugoio/hugo/markup/converter/hooks"
|
"github.com/gohugoio/hugo/markup/converter/hooks"
|
||||||
"github.com/gohugoio/hugo/markup/highlight"
|
"github.com/gohugoio/hugo/markup/highlight"
|
||||||
|
"github.com/gohugoio/hugo/tpl"
|
||||||
|
|
||||||
"github.com/gohugoio/hugo/deps"
|
"github.com/gohugoio/hugo/deps"
|
||||||
"github.com/gohugoio/hugo/helpers"
|
"github.com/gohugoio/hugo/helpers"
|
||||||
|
@ -141,7 +142,7 @@ func (ns *Namespace) Plainify(s any) (string, error) {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
return helpers.StripHTML(ss), nil
|
return tpl.StripHTML(ss), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// For internal use.
|
// For internal use.
|
||||||
|
|
|
@ -237,6 +237,7 @@ func TestPlainify(t *testing.T) {
|
||||||
expect any
|
expect any
|
||||||
}{
|
}{
|
||||||
{"<em>Note:</em> blah <b>blah</b>", "Note: blah blah"},
|
{"<em>Note:</em> blah <b>blah</b>", "Note: blah blah"},
|
||||||
|
{"<div data-action='click->my-controller#doThing'>qwe</div>", "qwe"},
|
||||||
// errors
|
// errors
|
||||||
{tstNoStringer{}, false},
|
{tstNoStringer{}, false},
|
||||||
} {
|
} {
|
||||||
|
|
Loading…
Add table
Reference in a new issue