publisher: Skip script, pre and textarea content when looking for HTML elements

Updates #7567
2025-04-06 02:45:32 +00:00 · 2021-04-06 18:19:25 +02:00 · 2021-04-06 18:19:25 +02:00 · 8a308944e4
commit 8a308944e4
parent 7b4ade56dd
2 changed files with 66 additions and 39 deletions
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@ -64,7 +64,7 @@ type cssClassCollectorWriter struct {
 	buff      bytes.Buffer

 	isCollecting bool
-	dropValue    bool
+	inPreTag     string

 	inQuote    bool
 	quoteValue byte
@ -90,16 +90,22 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
 				b := p[i]
 				w.toggleIfQuote(b)
 				if !w.inQuote && b == '>' {
-					w.endCollecting(false)
+					w.endCollecting()
 					break
 				}
 				w.buff.WriteByte(b)
 			}

 			if !w.isCollecting {
-				if w.dropValue {
+				if w.inPreTag != "" {
+					s := w.buff.String()
+					if tagName, isEnd := w.parseEndTag(s); isEnd && w.inPreTag == tagName {
+						w.inPreTag = ""
+					}
 					w.buff.Reset()
-				} else {
+					continue
+				}
+
 				// First check if we have processed this element before.
 				w.collector.mu.RLock()

@ -125,6 +131,9 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
 				s, tagName := w.insertStandinHTMLElement(s)
 				el := parseHTMLElement(s)
 				el.Tag = tagName
+				if w.isPreFormatted(tagName) {
+					w.inPreTag = tagName
+				}

 				w.collector.mu.Lock()
 				w.collector.elementSet[key] = true
@ -132,7 +141,7 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
 					w.collector.elements = append(w.collector.elements, el)
 				}
 				w.collector.mu.Unlock()
-				}
+
 			}
 		}
 	}
@ -140,6 +149,11 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
 	return
 }

+// isPreFormatted reports whether s is one of the tag names "pre", "textarea"
+// or "script". No need to look inside these for HTML elements.
+func (c *cssClassCollectorWriter) isPreFormatted(s string) bool {
+	return s == "pre" || s == "textarea" || s == "script"
+}
+
 // The net/html parser does not handle single table elements as input, e.g. tbody.
 // We only care about the element/class/ids, so just store away the original tag name
 // and pretend it's a <div>.
@ -154,15 +168,24 @@ func (c *cssClassCollectorWriter) insertStandinHTMLElement(el string) (string, s
 	return newv, strings.ToLower(tag)
 }

-func (c *cssClassCollectorWriter) endCollecting(drop bool) {
+// parseEndTag reports whether s starts with "</". If it does, the returned
+// string is what follows that prefix, with one trailing ">" removed (if
+// present), surrounding whitespace trimmed, and converted to lower case.
+// Otherwise it returns "" and false.
+func (c *cssClassCollectorWriter) parseEndTag(s string) (string, bool) {
+	if !strings.HasPrefix(s, "</") {
+		return "", false
+	}
+	s = strings.TrimPrefix(s, "</")
+	s = strings.TrimSuffix(s, ">")
+	return strings.ToLower(strings.TrimSpace(s)), true
+}
+
+// endCollecting clears the isCollecting and inQuote flags.
+func (c *cssClassCollectorWriter) endCollecting() {
 	c.isCollecting = false
 	c.inQuote = false
-	c.dropValue = drop
+
 }

+// startCollecting sets the isCollecting flag.
 func (c *cssClassCollectorWriter) startCollecting() {
 	c.isCollecting = true
-	c.dropValue = false
+
 }

 func (c *cssClassCollectorWriter) toggleIfQuote(b byte) {
--- a/publisher/htmlElementsCollector_test.go
+++ b/publisher/htmlElementsCollector_test.go
@ -89,8 +89,12 @@ func TestClassCollector(t *testing.T) {

 		{"Alpine transition 1", `<div x-transition:enter-start="opacity-0 transform mobile:-translate-x-8 sm:-translate-y-8">`, f("div", "mobile:-translate-x-8 opacity-0 sm:-translate-y-8 transform", "")},
 		{"Vue bind", `<div v-bind:class="{ active: isActive }"></div>`, f("div", "active", "")},
-		// https://github.com/gohugoio/hugo/issues/7746
+		// Issue #7746
 		{"Apostrophe inside attribute value", `<a class="missingclass" title="Plus d'information">my text</a><div></div>`, f("a div", "missingclass", "")},
+		// Issue #7567
+		{"Script tags content should be skipped", `<script><span>foo</span><span>bar</span></script><div class="foo"></div>`, f("div script", "foo", "")},
+		{"Pre tags content should be skipped", `<pre class="preclass"><span>foo</span><span>bar</span></pre><div class="foo"></div>`, f("div pre", "foo preclass", "")},
+		{"Textare tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
 	} {
 		c.Run(test.name, func(c *qt.C) {
 			w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())