publisher: Make the HTML element collector more robust

Fixes #8530
2024-11-21 20:46:30 -05:00 · 2021-05-13 13:10:32 +02:00 · 2021-05-13 13:10:32 +02:00 · f518b4f71e
commit f518b4f71e
parent dc6b7a75ff
2 changed files with 283 additions and 167 deletions
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@ -19,12 +19,51 @@ import (
 	"sort"
 	"strings"
 	"sync"
 	"unicode"
 	"unicode/utf8"
 	"golang.org/x/net/html"
 	"github.com/gohugoio/hugo/helpers"
 )
 // eof is the sentinel rune returned by next once the current input has been
 // fully consumed; Write stops its scanning loop when it sees it.
 const eof = -1
 var (
 	htmlJsonFixer = strings.NewReplacer(", ", "\n")
 	jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
 	classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
 	// skipInnerElementRe matches, case-insensitively, the tag names whose
 	// start tag is collected but whose inner content is skipped.
 	// It is anchored at both ends: the scanner hands it the complete tag name,
 	// and a prefix-only match would also catch names such as "style-switcher",
 	// whose end tag endTagRe can never match, so everything after such an
 	// element would be skipped.
 	skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)$`)
 	// skipAllElementRe matches tag names that are skipped entirely, e.g. "!DOCTYPE".
 	skipAllElementRe   = regexp.MustCompile(`(?i)^!DOCTYPE`)
 	// endTagRe matches an end tag at the very end of the input and captures its name.
 	endTagRe           = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)
 	// exceptionList holds table elements that net/html cannot parse on their
 	// own; parseHTMLElement parses them as a <div> and restores the name afterwards.
 	exceptionList = map[string]bool{
 		"thead": true,
 		"tbody": true,
 		"tfoot": true,
 		"td":    true,
 		"tr":    true,
 	}
 )
 // newHTMLElementsCollector returns a collector whose set of already seen
 // elements is allocated and empty.
 func newHTMLElementsCollector() *htmlElementsCollector {
 	c := new(htmlElementsCollector)
 	c.elementSet = make(map[string]bool)
 	return c
 }
 // newHTMLElementsCollectorWriter returns a writer bound to collector.
 // Scanning begins in htmlLexStart, and the state func for the inside of an
 // ordinary element is built once here and stored on the writer.
 func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
 	w := new(htmlElementsCollectorWriter)
 	w.collector = collector
 	w.state = htmlLexStart
 	w.defaultLexElementInside = w.lexElementInside(htmlLexStart)
 	return w
 }
 // HTMLElements holds lists of tags and attribute values for classes and id.
 type HTMLElements struct {
 	Tags    []string `json:"tags"`
@ -48,6 +87,12 @@ func (h *HTMLElements) Sort() {
 	sort.Strings(h.IDs)
 }
 // htmlElement is the result of parsing a single start tag with
 // parseHTMLElement.
 type htmlElement struct {
 	// Tag is the element's tag name.
 	Tag     string
 	// Classes holds class names found on the element.
 	// NOTE(review): the attribute handling that fills this is not fully
 	// visible here; confirm against parseHTMLElement.
 	Classes []string
 	// IDs holds values of the element's id attribute.
 	IDs     []string
 }
 type htmlElementsCollector struct {
 	// Contains the raw HTML string. We will get the same element
 	// several times, and want to avoid costly reparsing when this
@ -59,12 +104,6 @@ type htmlElementsCollector struct {
 	mu sync.RWMutex
 }
 // newHTMLElementsCollector creates a collector with an initialised, empty
 // element set.
 func newHTMLElementsCollector() *htmlElementsCollector {
 	c := new(htmlElementsCollector)
 	c.elementSet = make(map[string]bool)
 	return c
 }
 func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
 	var (
 		classes []string
@ -93,114 +132,125 @@ func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
 type htmlElementsCollectorWriter struct {
 	collector *htmlElementsCollector
 	buff      bytes.Buffer
-	isCollecting bool
+	r     rune   // Current rune
-	inPreTag     string
+	width int    // The width in bytes of r
 	input []byte // The current slice written to Write
 	pos   int    // The current position in input
-	inQuote    bool
+	err error
-	quoteValue byte
+
 	inQuote rune
 	buff bytes.Buffer
 	// Current state
 	state htmlCollectorStateFunc
 	// Precompiled state funcs
 	defaultLexElementInside htmlCollectorStateFunc
 }
-func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
+// Write collects HTML elements from p, which must contain complete runes.
-	return &htmlElementsCollectorWriter{
+func (w *htmlElementsCollectorWriter) Write(p []byte) (int, error) {
-		collector: collector,
+	if p == nil {
 		return 0, nil
 	}
 	w.input = p
 	for {
 		w.r = w.next()
 		if w.r == eof || w.r == utf8.RuneError {
 			break
 		}
 		w.state = w.state(w)
 	}
 	w.pos = 0
 	w.input = nil
 	return len(p), nil
 }
-// Write splits the incoming stream into single html element.
+func (l *htmlElementsCollectorWriter) backup() {
-func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
+	l.pos -= l.width
-	n = len(p)
+	l.r, _ = utf8.DecodeRune(l.input[l.pos:])
-	i := 0
+}
-	for i < len(p) {
+func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
-		// If we are not collecting, cycle through byte stream until start bracket "<" is found.
+	var s htmlCollectorStateFunc
-		if !w.isCollecting {
+	s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
-			for ; i < len(p); i++ {
+		w.buff.WriteRune(w.r)
-				b := p[i]
+		if condition() {
-				if b == '<' {
+			w.buff.Reset()
-					w.startCollecting()
+			return resolve
-					break
+		}
-				}
+		return s
 	}
 	return s
 }
 // consumeRuneUntil returns a state func that keeps returning itself, without
 // buffering anything, until condition reports true for the writer's current
 // rune; it then hands over to resolve.
 func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
 	var again htmlCollectorStateFunc
 	again = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
 		if !condition(w.r) {
 			return again
 		}
 		return resolve
 	}
 	return again
 }
 // Starts with e.g. "<body " or "<div"
 func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
 	var s htmlCollectorStateFunc
 	s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
 		w.buff.WriteRune(w.r)
 		// Skip any text inside a quote.
 		if w.r == '\'' || w.r == '"' {
 			if w.inQuote == w.r {
 				w.inQuote = 0
 			} else if w.inQuote == 0 {
 				w.inQuote = w.r
 			}
 		}
-		if w.isCollecting {
+		if w.inQuote != 0 {
-			// If we are collecting, cycle through byte stream until end bracket ">" is found,
+			return s
 			// disregard any ">" if within a quote,
 			// write bytes until found to buffer.
 			for ; i < len(p); i++ {
 				b := p[i]
 				w.toggleIfQuote(b)
 				w.buff.WriteByte(b)
 				if !w.inQuote && b == '>' {
 					w.endCollecting()
 					break
 				}
 			}
 		}
-		// If no end bracket ">" is found while collecting, but the stream ended
+		if w.r == '>' {
 		// this could mean we received chunks of a stream from e.g. the minify functionality
 		// next if loop will be skipped.
 		// At this point we have collected an element line between angle brackets "<" and ">".
 		if !w.isCollecting {
 			if w.buff.Len() == 0 {
 				continue
 			}
 			if w.inPreTag != "" { // within preformatted code block
 				s := w.buff.String()
 				w.buff.Reset()
 				if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
 					w.inPreTag = ""
 				}
 				continue
 			}
 			// First check if we have processed this element before.
 			w.collector.mu.RLock()
 			// Work with the bytes slice as long as it's practical,
 			// to save memory allocations.
 			b := w.buff.Bytes()
-			// See https://github.com/dominikh/go-tools/issues/723
+			defer func() {
-			//lint:ignore S1030 This construct avoids memory allocation for the string.
+				w.buff.Reset()
 			}()
 			// First check if we have processed this element before.
 			w.collector.mu.RLock()
 			seen := w.collector.elementSet[string(b)]
 			w.collector.mu.RUnlock()
 			if seen {
-				w.buff.Reset()
+				return resolve
 				continue
 			}
 			// Filter out unwanted tags
 			// if within preformatted code blocks <pre>, <textarea>, <script>, <style>
 			// comments and doctype tags
 			// end tags.
 			switch {
 			case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag
 				w.buff.Reset()
 				continue
 			case bytes.HasPrefix(b, []byte("</")): // end tag
 				w.buff.Reset()
 				continue
 			}
 			s := w.buff.String()
 			w.buff.Reset()
-			// Check if a preformatted code block started.
+			if s == "" {
-			if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
+				return resolve
 				w.inPreTag = tagName
 			}
 			// Parse each collected element.
 			el, err := parseHTMLElement(s)
 			if err != nil {
-				return n, err
+				w.err = err
 				return resolve
 			}
 			// Write this tag to the element set.
@ -208,109 +258,138 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
 			w.collector.elementSet[s] = true
 			w.collector.elements = append(w.collector.elements, el)
 			w.collector.mu.Unlock()
 			return resolve
 		}
 		return s
 	}
 	return s
 }
 // next decodes the rune at the current position of the input, records its
 // width in bytes and advances past it. When the input is exhausted it sets
 // the width to zero and returns eof.
 func (w *htmlElementsCollectorWriter) next() rune {
 	if w.pos < len(w.input) {
 		r, size := utf8.DecodeRune(w.input[w.pos:])
 		w.width = size
 		w.pos += size
 		return r
 	}
 	w.width = 0
 	return eof
 }
 // htmlCollectorStateFunc is one state of the HTML element scanner. Write
 // calls the current state once per rune read and uses the returned func as
 // the state for the next rune.
 type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc
 // At "<", buffer empty.
 // Potentially starting an HTML element.
 func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
 	if w.r == '>' || unicode.IsSpace(w.r) {
 		if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) {
 			w.buff.Reset()
 			return htmlLexStart
 		}
 		tagName := w.buff.Bytes()[1:]
 		switch {
 		case skipInnerElementRe.Match(tagName):
 			// pre, script etc. We collect classes etc. on the surrounding
 			// element, but skip the inner content.
 			w.backup()
 			// tagName will be overwritten, so make a copy.
 			tagNameCopy := make([]byte, len(tagName))
 			copy(tagNameCopy, tagName)
 			return w.lexElementInside(
 				w.consumeBuffUntil(
 					func() bool {
 						if w.r != '>' {
 							return false
 						}
 						m := endTagRe.FindSubmatch(w.buff.Bytes())
 						if m == nil {
 							return false
 						}
 						return bytes.EqualFold(m[1], tagNameCopy)
 					},
 					htmlLexStart,
 				))
 		case skipAllElementRe.Match(tagName):
 			// E.g. "<!DOCTYPE ..."
 			w.buff.Reset()
 			return w.consumeRuneUntil(func(r rune) bool {
 				return r == '>'
 			}, htmlLexStart)
 		default:
 			w.backup()
 			return w.defaultLexElementInside
 		}
 	}
-	return
+	w.buff.WriteRune(w.r)
 }
-func (c *htmlElementsCollectorWriter) startCollecting() {
+	// If it's a comment, skip to its end.
-	c.isCollecting = true
+	if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) {
-}
+		w.buff.Reset()
-
+		return htmlLexToEndOfComment
 func (c *htmlElementsCollectorWriter) endCollecting() {
 	c.isCollecting = false
 	c.inQuote = false
 }
 func (c *htmlElementsCollectorWriter) toggleIfQuote(b byte) {
 	if isQuote(b) {
 		if c.inQuote && b == c.quoteValue {
 			c.inQuote = false
 		} else if !c.inQuote {
 			c.inQuote = true
 			c.quoteValue = b
 		}
 	}
 }
 // isQuote reports whether b is a double or a single quote character.
 func isQuote(b byte) bool {
 	switch b {
 	case '"', '\'':
 		return true
 	}
 	return false
 }
 func parseStartTag(s string) (string, bool) {
 	s = strings.TrimPrefix(s, "<")
 	s = strings.TrimSuffix(s, ">")
 	spaceIndex := strings.Index(s, " ")
 	if spaceIndex != -1 {
 		s = s[:spaceIndex]
 	}
-	return strings.ToLower(strings.TrimSpace(s)), true
+	return htmlLexElementStart
 }
-func parseEndTag(s string) (string, bool) {
+// Entry state func.
-	if !strings.HasPrefix(s, "</") {
+// Looks for an opening bracket, '<'.
-		return "", false
+func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
 	if w.r == '<' {
 		w.backup()
 		w.buff.Reset()
 		return htmlLexElementStart
 	}
-	s = strings.TrimPrefix(s, "</")
+	return htmlLexStart
 	s = strings.TrimSuffix(s, ">")
 	return strings.ToLower(strings.TrimSpace(s)), true
 }
-// No need to look inside these for HTML elements.
+// After "<!--", buff empty.
-func isPreFormatted(s string) bool {
+func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
-	return s == "pre" || s == "textarea" || s == "script" || s == "style"
+	w.buff.WriteRune(w.r)
 }
-type htmlElement struct {
+	if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) {
-	Tag     string
+		// Done, start looking for HTML elements again.
-	Classes []string
+		return htmlLexStart
 	IDs     []string
 }
 var (
 	htmlJsonFixer = strings.NewReplacer(", ", "\n")
 	jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
 	classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
 	exceptionList = map[string]bool{
 		"thead": true,
 		"tbody": true,
 		"tfoot": true,
 		"td":    true,
 		"tr":    true,
 	}
-)
+
 	return htmlLexToEndOfComment
 }
 func parseHTMLElement(elStr string) (el htmlElement, err error) {
 	var tagBuffer string = ""
-	tagName, ok := parseStartTag(elStr)
+	tagName := parseStartTag(elStr)
-	if !ok {
+
-		return
+	el.Tag = strings.ToLower(tagName)
-	}
+	tagNameToParse := el.Tag
 	// The net/html parser does not handle single table elements as input, e.g. tbody.
 	// We only care about the element/class/ids, so just store away the original tag name
 	// and pretend it's a <div>.
-	if exceptionList[tagName] {
+	if exceptionList[el.Tag] {
 		tagBuffer = tagName
 		elStr = strings.Replace(elStr, tagName, "div", 1)
 		tagNameToParse = "div"
 	}
 	n, err := html.Parse(strings.NewReader(elStr))
 	if err != nil {
 		return
 	}
 	var walk func(*html.Node)
 	walk = func(n *html.Node) {
-		if n.Type == html.ElementNode && strings.Contains(elStr, n.Data) {
+		if n.Type == html.ElementNode && n.Data == tagNameToParse {
 			el.Tag = n.Data
 			for _, a := range n.Attr {
 				switch {
 				case strings.EqualFold(a.Key, "id"):
@ -345,10 +424,20 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
 	walk(n)
 	// Did we replace the start tag?
 	if tagBuffer != "" {
 		el.Tag = tagBuffer
 	}
 	return
 }
 // parseStartTag returns the tag name of the start tag s.
 // Variants of s
 //    <body class="b a">
 //    <div>
 //    <br/>
 // The name ends at the first whitespace or, when there is none, at the
 // closing bracket. A trailing "/" from a self-closing tag is removed so that
 // "<br/>" gives "br" and not "br/". Input that is too short to hold a name,
 // or that starts with whitespace, gives "" instead of panicking.
 func parseStartTag(s string) string {
 	if len(s) < 2 {
 		return ""
 	}
 	end := strings.IndexFunc(s, unicode.IsSpace)
 	if end == -1 {
 		// No attributes; drop the closing bracket.
 		end = len(s) - 1
 	}
 	if end < 1 {
 		return ""
 	}
 	return strings.TrimSuffix(s[1:end], "/")
 }
--- a/publisher/htmlElementsCollector_test.go
+++ b/publisher/htmlElementsCollector_test.go
@ -14,9 +14,13 @@
 package publisher
 import (
 	"bytes"
 	"fmt"
 	"io"
 	"math/rand"
 	"strings"
 	"testing"
 	"time"
 	"github.com/gohugoio/hugo/media"
 	"github.com/gohugoio/hugo/minifiers"
@ -28,6 +32,7 @@ import (
 func TestClassCollector(t *testing.T) {
 	c := qt.New((t))
 	rnd := rand.New(rand.NewSource(time.Now().Unix()))
 	f := func(tags, classes, ids string) HTMLElements {
 		var tagss, classess, idss []string
@ -57,14 +62,20 @@ func TestClassCollector(t *testing.T) {
 		expect HTMLElements
 	}{
 		{"basic", `<body class="b a"></body>`, f("body", "a b", "")},
-		{"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")},
+		{"duplicates", `<div class="b a b"></div><div class="b a b"></div>x'`, f("div", "a b", "")},
 		{"single quote", `<body class='b a'></body>`, f("body", "a b", "")},
 		{"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")},
 		{"short", `<i>`, f("i", "", "")},
 		{"invalid", `< body class="b a"></body><div></div>`, f("div", "", "")},
 		// https://github.com/gohugoio/hugo/issues/7318
 		{"thead", `<table class="cl1">
    <thead class="cl2"><tr class="cl3"><td class="cl4"></td></tr></thead>
    <tbody class="cl5"><tr class="cl6"><td class="cl7"></td></tr></tbody>
 </table>`, f("table tbody td thead tr", "cl1 cl2 cl3 cl4 cl5 cl6 cl7", "")},
 		{"thead uppercase", `<TABLE class="CL1">
    <THEAD class="CL2"><TR class="CL3"><TD class="CL4"></TD></TR></THEAD>
    <TBODY class="CL5"><TR class="CL6"><TD class="CL7"></TD></TR></TBODY>
 </TABLE>`, f("table tbody td thead tr", "CL1 CL2 CL3 CL4 CL5 CL6 CL7", "")},
 		// https://github.com/gohugoio/hugo/issues/7161
 		{"minified a href", `<a class="b a" href=/></a>`, f("a", "a b", "")},
 		{"AlpineJS bind 1", `<body>
@ -98,6 +109,11 @@ func TestClassCollector(t *testing.T) {
 		{"Textarea tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
 		{"DOCTYPE should beskipped", `<!DOCTYPE html>`, f("", "", "")},
 		{"Comments should be skipped", `<!-- example comment -->`, f("", "", "")},
 		{"Comments with elements before and after", `<div></div><!-- example comment --><span><span>`, f("div span", "", "")},
 		// Issue #8530
 		{"Comment with single quote", `<!-- Hero Area Image d'accueil --><i class="foo">`, f("i", "foo", "")},
 		{"Uppercase tags", `<DIV></DIV>`, f("div", "", "")},
 		{"Predefined tags with distinct casing", `<script>if (a < b) { nothing(); }</SCRIPT><div></div>`, f("div script", "", "")},
 		// Issue #8417
 		{"Tabs inline", `<hr	id="a" class="foo"><div class="bar">d</div>`, f("div hr", "bar foo", "a")},
 		{"Tabs on multiple rows", `<form
@ -106,26 +122,37 @@ func TestClassCollector(t *testing.T) {
 			method="post"
 ></form>
 <div id="b" class="foo">d</div>`, f("div form", "foo", "a b")},
 		{"Big input, multibyte runes", strings.Repeat(`神真美好 `, rnd.Intn(500)+1) + "<div id=\"神真美好\" class=\"foo\">" + strings.Repeat(`神真美好 `, rnd.Intn(100)+1) + "   <span>神真美好</span>", f("div span", "foo", "神真美好")},
 	} {
-		for _, minify := range []bool{false, true} {
+		for _, variant := range []struct {
-			c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) {
+			minify bool
 		}{
 			{minify: false},
 			{minify: true},
 		} {
 			c.Run(fmt.Sprintf("%s--minify-%t", test.name, variant.minify), func(c *qt.C) {
 				w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-				if minify {
+				if variant.minify {
 					if skipMinifyTest[test.name] {
 						c.Skip("skip minify test")
 					}
 					v := viper.New()
 					m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
 					m.Minify(media.HTMLType, w, strings.NewReader(test.html))
 				} else {
-					fmt.Fprint(w, test.html)
+					var buff bytes.Buffer
 					buff.WriteString(test.html)
 					io.Copy(w, &buff)
 				}
 				got := w.collector.getHTMLElements()
 				c.Assert(got, qt.DeepEquals, test.expect)
 			})
 		}
 	}
 }
 func BenchmarkElementsCollectorWriter(b *testing.B) {