publisher: Make the HTML element collector more robust

Fixes #8530
This commit is contained in:
Bjørn Erik Pedersen 2021-05-13 13:10:32 +02:00
parent dc6b7a75ff
commit f518b4f71e
2 changed files with 283 additions and 167 deletions

View file

@ -19,12 +19,51 @@ import (
"sort" "sort"
"strings" "strings"
"sync" "sync"
"unicode"
"unicode/utf8"
"golang.org/x/net/html" "golang.org/x/net/html"
"github.com/gohugoio/hugo/helpers" "github.com/gohugoio/hugo/helpers"
) )
// eof is the sentinel rune reported by the writer's next method once the
// current input slice has been fully consumed.
const eof = -1

var (
	// NOTE(review): htmlJsonFixer and jsonAttrRe are consumed by attribute
	// parsing code that is not fully visible here; they are kept unchanged.
	htmlJsonFixer = strings.NewReplacer(", ", "\n")
	jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)

	// classAttrRe matches attribute keys that are exactly "class" or that
	// contain "transition" (case-insensitive).
	classAttrRe = regexp.MustCompile(`(?i)^class$|transition`)

	// skipInnerElementRe matches the names of elements whose start tag is
	// collected but whose inner content is skipped. The pattern is anchored
	// at both ends so that only the exact names match; without the trailing
	// anchor a name such as "preview" or "script-loader" would be treated as
	// a skip-content element and its end tag could never be found.
	skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)$`)

	// skipAllElementRe matches declarations that are skipped entirely,
	// e.g. "<!DOCTYPE ...".
	skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`)

	// endTagRe matches an end tag at the very end of the text and captures
	// its name.
	endTagRe = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)

	// exceptionList holds table elements that the net/html parser does not
	// handle as stand-alone input; they are parsed as if they were a <div>.
	exceptionList = map[string]bool{
		"thead": true,
		"tbody": true,
		"tfoot": true,
		"td":    true,
		"tr":    true,
	}
)
// newHTMLElementsCollector returns a collector whose elementSet is
// initialized and empty, ready to record elements.
func newHTMLElementsCollector() *htmlElementsCollector {
	collector := new(htmlElementsCollector)
	collector.elementSet = map[string]bool{}
	return collector
}
// newHTMLElementsCollectorWriter returns a writer that records the elements
// it sees in collector. The writer starts in the htmlLexStart state, and the
// element-inside state that resolves back to htmlLexStart is built once up
// front so it can be reused.
func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
	writer := &htmlElementsCollectorWriter{collector: collector}
	writer.state = htmlLexStart
	writer.defaultLexElementInside = writer.lexElementInside(htmlLexStart)

	return writer
}
// HTMLElements holds lists of tags and attribute values for classes and id. // HTMLElements holds lists of tags and attribute values for classes and id.
type HTMLElements struct { type HTMLElements struct {
Tags []string `json:"tags"` Tags []string `json:"tags"`
@ -48,6 +87,12 @@ func (h *HTMLElements) Sort() {
sort.Strings(h.IDs) sort.Strings(h.IDs)
} }
// htmlElement is the result of parsing a single collected start tag.
type htmlElement struct {
	// Tag is the element's tag name.
	Tag string
	// Classes and IDs hold the values collected from the element's
	// attributes.
	Classes, IDs []string
}
type htmlElementsCollector struct { type htmlElementsCollector struct {
// Contains the raw HTML string. We will get the same element // Contains the raw HTML string. We will get the same element
// several times, and want to avoid costly reparsing when this // several times, and want to avoid costly reparsing when this
@ -59,12 +104,6 @@ type htmlElementsCollector struct {
mu sync.RWMutex mu sync.RWMutex
} }
// newHTMLElementsCollector returns a collector whose elementSet is
// initialized and empty, ready to record elements.
func newHTMLElementsCollector() *htmlElementsCollector {
	collector := new(htmlElementsCollector)
	collector.elementSet = map[string]bool{}
	return collector
}
func (c *htmlElementsCollector) getHTMLElements() HTMLElements { func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
var ( var (
classes []string classes []string
@ -93,114 +132,125 @@ func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
type htmlElementsCollectorWriter struct { type htmlElementsCollectorWriter struct {
collector *htmlElementsCollector collector *htmlElementsCollector
buff bytes.Buffer
isCollecting bool r rune // Current rune
inPreTag string width int // The width in bytes of r
input []byte // The current slice written to Write
pos int // The current position in input
inQuote bool err error
quoteValue byte
inQuote rune
buff bytes.Buffer
// Current state
state htmlCollectorStateFunc
// Precompiled state funcs
defaultLexElementInside htmlCollectorStateFunc
} }
func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter { // Write collects HTML elements from p, which must contain complete runes.
return &htmlElementsCollectorWriter{ func (w *htmlElementsCollectorWriter) Write(p []byte) (int, error) {
collector: collector, if p == nil {
return 0, nil
} }
w.input = p
for {
w.r = w.next()
if w.r == eof || w.r == utf8.RuneError {
break
}
w.state = w.state(w)
}
w.pos = 0
w.input = nil
return len(p), nil
} }
// Write splits the incoming stream into single html element. func (l *htmlElementsCollectorWriter) backup() {
func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) { l.pos -= l.width
n = len(p) l.r, _ = utf8.DecodeRune(l.input[l.pos:])
i := 0 }
for i < len(p) { func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
// If we are not collecting, cycle through byte stream until start bracket "<" is found. var s htmlCollectorStateFunc
if !w.isCollecting { s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
for ; i < len(p); i++ { w.buff.WriteRune(w.r)
b := p[i] if condition() {
if b == '<' { w.buff.Reset()
w.startCollecting() return resolve
break }
} return s
}
return s
}
// consumeRuneUntil returns a state that keeps returning itself, without
// buffering anything, until condition reports true for the writer's current
// rune; it then hands over to resolve.
func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
	var skip htmlCollectorStateFunc
	skip = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
		// The closure reads the captured writer, not its argument.
		if !condition(w.r) {
			return skip
		}
		return resolve
	}
	return skip
}
// Starts with e.g. "<body " or "<div"
func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
var s htmlCollectorStateFunc
s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
w.buff.WriteRune(w.r)
// Skip any text inside a quote.
if w.r == '\'' || w.r == '"' {
if w.inQuote == w.r {
w.inQuote = 0
} else if w.inQuote == 0 {
w.inQuote = w.r
} }
} }
if w.isCollecting { if w.inQuote != 0 {
// If we are collecting, cycle through byte stream until end bracket ">" is found, return s
// disregard any ">" if within a quote,
// write bytes until found to buffer.
for ; i < len(p); i++ {
b := p[i]
w.toggleIfQuote(b)
w.buff.WriteByte(b)
if !w.inQuote && b == '>' {
w.endCollecting()
break
}
}
} }
// If no end bracket ">" is found while collecting, but the stream ended if w.r == '>' {
// this could mean we received chunks of a stream from e.g. the minify functionality
// next if loop will be skipped.
// At this point we have collected an element line between angle brackets "<" and ">".
if !w.isCollecting {
if w.buff.Len() == 0 {
continue
}
if w.inPreTag != "" { // within preformatted code block
s := w.buff.String()
w.buff.Reset()
if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
w.inPreTag = ""
}
continue
}
// First check if we have processed this element before.
w.collector.mu.RLock()
// Work with the bytes slice as long as it's practical, // Work with the bytes slice as long as it's practical,
// to save memory allocations. // to save memory allocations.
b := w.buff.Bytes() b := w.buff.Bytes()
// See https://github.com/dominikh/go-tools/issues/723 defer func() {
//lint:ignore S1030 This construct avoids memory allocation for the string. w.buff.Reset()
}()
// First check if we have processed this element before.
w.collector.mu.RLock()
seen := w.collector.elementSet[string(b)] seen := w.collector.elementSet[string(b)]
w.collector.mu.RUnlock() w.collector.mu.RUnlock()
if seen { if seen {
w.buff.Reset() return resolve
continue
}
// Filter out unwanted tags
// if within preformatted code blocks <pre>, <textarea>, <script>, <style>
// comments and doctype tags
// end tags.
switch {
case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag
w.buff.Reset()
continue
case bytes.HasPrefix(b, []byte("</")): // end tag
w.buff.Reset()
continue
} }
s := w.buff.String() s := w.buff.String()
w.buff.Reset()
// Check if a preformatted code block started. if s == "" {
if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) { return resolve
w.inPreTag = tagName
} }
// Parse each collected element. // Parse each collected element.
el, err := parseHTMLElement(s) el, err := parseHTMLElement(s)
if err != nil { if err != nil {
return n, err w.err = err
return resolve
} }
// Write this tag to the element set. // Write this tag to the element set.
@ -208,109 +258,138 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
w.collector.elementSet[s] = true w.collector.elementSet[s] = true
w.collector.elements = append(w.collector.elements, el) w.collector.elements = append(w.collector.elements, el)
w.collector.mu.Unlock() w.collector.mu.Unlock()
return resolve
}
return s
}
return s
}
// next decodes and returns the rune at the current position of the input,
// advancing the position past it. The rune's byte width is remembered in
// width. When the input is exhausted, width is set to 0 and eof is returned.
func (w *htmlElementsCollectorWriter) next() rune {
	if w.pos < len(w.input) {
		r, size := utf8.DecodeRune(w.input[w.pos:])
		w.width = size
		w.pos += size
		return r
	}

	w.width = 0
	return eof
}
// htmlCollectorStateFunc is a single state of the HTML element scanner.
// It is invoked with the writer being driven and returns the state that
// should handle the next rune.
type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc
// At "<", buffer empty.
// Potentially starting an HTML element.
func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
if w.r == '>' || unicode.IsSpace(w.r) {
if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) {
w.buff.Reset()
return htmlLexStart
}
tagName := w.buff.Bytes()[1:]
switch {
case skipInnerElementRe.Match(tagName):
// pre, script etc. We collect classes etc. on the surrounding
// element, but skip the inner content.
w.backup()
// tagName will be overwritten, so make a copy.
tagNameCopy := make([]byte, len(tagName))
copy(tagNameCopy, tagName)
return w.lexElementInside(
w.consumeBuffUntil(
func() bool {
if w.r != '>' {
return false
}
m := endTagRe.FindSubmatch(w.buff.Bytes())
if m == nil {
return false
}
return bytes.EqualFold(m[1], tagNameCopy)
},
htmlLexStart,
))
case skipAllElementRe.Match(tagName):
// E.g. "<!DOCTYPE ..."
w.buff.Reset()
return w.consumeRuneUntil(func(r rune) bool {
return r == '>'
}, htmlLexStart)
default:
w.backup()
return w.defaultLexElementInside
} }
} }
return w.buff.WriteRune(w.r)
}
func (c *htmlElementsCollectorWriter) startCollecting() { // If it's a comment, skip to its end.
c.isCollecting = true if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) {
} w.buff.Reset()
return htmlLexToEndOfComment
func (c *htmlElementsCollectorWriter) endCollecting() {
c.isCollecting = false
c.inQuote = false
}
func (c *htmlElementsCollectorWriter) toggleIfQuote(b byte) {
if isQuote(b) {
if c.inQuote && b == c.quoteValue {
c.inQuote = false
} else if !c.inQuote {
c.inQuote = true
c.quoteValue = b
}
}
}
func isQuote(b byte) bool {
return b == '"' || b == '\''
}
func parseStartTag(s string) (string, bool) {
s = strings.TrimPrefix(s, "<")
s = strings.TrimSuffix(s, ">")
spaceIndex := strings.Index(s, " ")
if spaceIndex != -1 {
s = s[:spaceIndex]
} }
return strings.ToLower(strings.TrimSpace(s)), true return htmlLexElementStart
} }
func parseEndTag(s string) (string, bool) { // Entry state func.
if !strings.HasPrefix(s, "</") { // Looks for a opening bracket, '<'.
return "", false func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
if w.r == '<' {
w.backup()
w.buff.Reset()
return htmlLexElementStart
} }
s = strings.TrimPrefix(s, "</") return htmlLexStart
s = strings.TrimSuffix(s, ">")
return strings.ToLower(strings.TrimSpace(s)), true
} }
// No need to look inside these for HTML elements. // After "<!--", buff empty.
func isPreFormatted(s string) bool { func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
return s == "pre" || s == "textarea" || s == "script" || s == "style" w.buff.WriteRune(w.r)
}
type htmlElement struct { if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) {
Tag string // Done, start looking for HTML elements again.
Classes []string return htmlLexStart
IDs []string
}
var (
htmlJsonFixer = strings.NewReplacer(", ", "\n")
jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`)
classAttrRe = regexp.MustCompile(`(?i)^class$|transition`)
exceptionList = map[string]bool{
"thead": true,
"tbody": true,
"tfoot": true,
"td": true,
"tr": true,
} }
)
return htmlLexToEndOfComment
}
func parseHTMLElement(elStr string) (el htmlElement, err error) { func parseHTMLElement(elStr string) (el htmlElement, err error) {
var tagBuffer string = ""
tagName, ok := parseStartTag(elStr) tagName := parseStartTag(elStr)
if !ok {
return el.Tag = strings.ToLower(tagName)
} tagNameToParse := el.Tag
// The net/html parser does not handle single table elements as input, e.g. tbody. // The net/html parser does not handle single table elements as input, e.g. tbody.
// We only care about the element/class/ids, so just store away the original tag name // We only care about the element/class/ids, so just store away the original tag name
// and pretend it's a <div>. // and pretend it's a <div>.
if exceptionList[tagName] { if exceptionList[el.Tag] {
tagBuffer = tagName
elStr = strings.Replace(elStr, tagName, "div", 1) elStr = strings.Replace(elStr, tagName, "div", 1)
tagNameToParse = "div"
} }
n, err := html.Parse(strings.NewReader(elStr)) n, err := html.Parse(strings.NewReader(elStr))
if err != nil { if err != nil {
return return
} }
var walk func(*html.Node) var walk func(*html.Node)
walk = func(n *html.Node) { walk = func(n *html.Node) {
if n.Type == html.ElementNode && strings.Contains(elStr, n.Data) { if n.Type == html.ElementNode && n.Data == tagNameToParse {
el.Tag = n.Data
for _, a := range n.Attr { for _, a := range n.Attr {
switch { switch {
case strings.EqualFold(a.Key, "id"): case strings.EqualFold(a.Key, "id"):
@ -345,10 +424,20 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
walk(n) walk(n)
// Did we replace the start tag?
if tagBuffer != "" {
el.Tag = tagBuffer
}
return return
} }
// parseStartTag extracts the tag name from the raw text of a start tag.
//
// Variants of s:
//
//	<body class="b a">
//	<div>
//	<br/>
//
// The first byte (the opening bracket) is dropped and the name runs up to the
// first whitespace rune or, when there is none, up to but excluding the last
// byte (the closing bracket). A trailing "/" from a self-closing tag is not
// part of the name. Input too short to hold a name, or input that starts with
// whitespace, yields the empty string instead of panicking.
func parseStartTag(s string) string {
	if len(s) < 2 {
		return ""
	}

	end := strings.IndexFunc(s, unicode.IsSpace)
	switch {
	case end == -1:
		// No attributes: the name ends right before the closing bracket.
		end = len(s) - 1
	case end < 1:
		// Leading whitespace; there is no opening bracket to skip.
		return ""
	}

	return strings.TrimSuffix(s[1:end], "/")
}

View file

@ -14,9 +14,13 @@
package publisher package publisher
import ( import (
"bytes"
"fmt" "fmt"
"io"
"math/rand"
"strings" "strings"
"testing" "testing"
"time"
"github.com/gohugoio/hugo/media" "github.com/gohugoio/hugo/media"
"github.com/gohugoio/hugo/minifiers" "github.com/gohugoio/hugo/minifiers"
@ -28,6 +32,7 @@ import (
func TestClassCollector(t *testing.T) { func TestClassCollector(t *testing.T) {
c := qt.New((t)) c := qt.New((t))
rnd := rand.New(rand.NewSource(time.Now().Unix()))
f := func(tags, classes, ids string) HTMLElements { f := func(tags, classes, ids string) HTMLElements {
var tagss, classess, idss []string var tagss, classess, idss []string
@ -57,14 +62,20 @@ func TestClassCollector(t *testing.T) {
expect HTMLElements expect HTMLElements
}{ }{
{"basic", `<body class="b a"></body>`, f("body", "a b", "")}, {"basic", `<body class="b a"></body>`, f("body", "a b", "")},
{"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")}, {"duplicates", `<div class="b a b"></div><div class="b a b"></div>x'`, f("div", "a b", "")},
{"single quote", `<body class='b a'></body>`, f("body", "a b", "")}, {"single quote", `<body class='b a'></body>`, f("body", "a b", "")},
{"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")}, {"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")},
{"short", `<i>`, f("i", "", "")},
{"invalid", `< body class="b a"></body><div></div>`, f("div", "", "")},
// https://github.com/gohugoio/hugo/issues/7318 // https://github.com/gohugoio/hugo/issues/7318
{"thead", `<table class="cl1"> {"thead", `<table class="cl1">
<thead class="cl2"><tr class="cl3"><td class="cl4"></td></tr></thead> <thead class="cl2"><tr class="cl3"><td class="cl4"></td></tr></thead>
<tbody class="cl5"><tr class="cl6"><td class="cl7"></td></tr></tbody> <tbody class="cl5"><tr class="cl6"><td class="cl7"></td></tr></tbody>
</table>`, f("table tbody td thead tr", "cl1 cl2 cl3 cl4 cl5 cl6 cl7", "")}, </table>`, f("table tbody td thead tr", "cl1 cl2 cl3 cl4 cl5 cl6 cl7", "")},
{"thead uppercase", `<TABLE class="CL1">
<THEAD class="CL2"><TR class="CL3"><TD class="CL4"></TD></TR></THEAD>
<TBODY class="CL5"><TR class="CL6"><TD class="CL7"></TD></TR></TBODY>
</TABLE>`, f("table tbody td thead tr", "CL1 CL2 CL3 CL4 CL5 CL6 CL7", "")},
// https://github.com/gohugoio/hugo/issues/7161 // https://github.com/gohugoio/hugo/issues/7161
{"minified a href", `<a class="b a" href=/></a>`, f("a", "a b", "")}, {"minified a href", `<a class="b a" href=/></a>`, f("a", "a b", "")},
{"AlpineJS bind 1", `<body> {"AlpineJS bind 1", `<body>
@ -98,6 +109,11 @@ func TestClassCollector(t *testing.T) {
{"Textarea tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")}, {"Textarea tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
{"DOCTYPE should beskipped", `<!DOCTYPE html>`, f("", "", "")}, {"DOCTYPE should beskipped", `<!DOCTYPE html>`, f("", "", "")},
{"Comments should be skipped", `<!-- example comment -->`, f("", "", "")}, {"Comments should be skipped", `<!-- example comment -->`, f("", "", "")},
{"Comments with elements before and after", `<div></div><!-- example comment --><span><span>`, f("div span", "", "")},
// Issue #8530
{"Comment with single quote", `<!-- Hero Area Image d'accueil --><i class="foo">`, f("i", "foo", "")},
{"Uppercase tags", `<DIV></DIV>`, f("div", "", "")},
{"Predefined tags with distinct casing", `<script>if (a < b) { nothing(); }</SCRIPT><div></div>`, f("div script", "", "")},
// Issue #8417 // Issue #8417
{"Tabs inline", `<hr id="a" class="foo"><div class="bar">d</div>`, f("div hr", "bar foo", "a")}, {"Tabs inline", `<hr id="a" class="foo"><div class="bar">d</div>`, f("div hr", "bar foo", "a")},
{"Tabs on multiple rows", `<form {"Tabs on multiple rows", `<form
@ -106,26 +122,37 @@ func TestClassCollector(t *testing.T) {
method="post" method="post"
></form> ></form>
<div id="b" class="foo">d</div>`, f("div form", "foo", "a b")}, <div id="b" class="foo">d</div>`, f("div form", "foo", "a b")},
{"Big input, multibyte runes", strings.Repeat(`神真美好 `, rnd.Intn(500)+1) + "<div id=\"神真美好\" class=\"foo\">" + strings.Repeat(`神真美好 `, rnd.Intn(100)+1) + " <span>神真美好</span>", f("div span", "foo", "神真美好")},
} { } {
for _, minify := range []bool{false, true} { for _, variant := range []struct {
c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) { minify bool
}{
{minify: false},
{minify: true},
} {
c.Run(fmt.Sprintf("%s--minify-%t", test.name, variant.minify), func(c *qt.C) {
w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
if minify { if variant.minify {
if skipMinifyTest[test.name] { if skipMinifyTest[test.name] {
c.Skip("skip minify test") c.Skip("skip minify test")
} }
v := viper.New() v := viper.New()
m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v) m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
m.Minify(media.HTMLType, w, strings.NewReader(test.html)) m.Minify(media.HTMLType, w, strings.NewReader(test.html))
} else { } else {
fmt.Fprint(w, test.html) var buff bytes.Buffer
buff.WriteString(test.html)
io.Copy(w, &buff)
} }
got := w.collector.getHTMLElements() got := w.collector.getHTMLElements()
c.Assert(got, qt.DeepEquals, test.expect) c.Assert(got, qt.DeepEquals, test.expect)
}) })
} }
} }
} }
func BenchmarkElementsCollectorWriter(b *testing.B) { func BenchmarkElementsCollectorWriter(b *testing.B) {