From d33a7ebcc16e804f1db0dc1f1edad4d9f9e816ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?= Date: Mon, 6 Feb 2023 17:29:12 +0100 Subject: [PATCH] Make the HTML collector parsing more robust Most notably better handling self-closing elements Closes #10698 --- publisher/htmlElementsCollector.go | 17 +++++++++++++---- publisher/htmlElementsCollector_test.go | 4 ++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go index 91e1237a9..c3b88c4cc 100644 --- a/publisher/htmlElementsCollector.go +++ b/publisher/htmlElementsCollector.go @@ -294,9 +294,10 @@ func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc } tagName := w.buff.Bytes()[1:] + isSelfClosing := tagName[len(tagName)-1] == '/' switch { - case skipInnerElementRe.Match(tagName): + case !isSelfClosing && skipInnerElementRe.Match(tagName): // pre, script etc. We collect classes etc. on the surrounding // element, but skip the inner content. w.backup() @@ -432,10 +433,18 @@ func parseStartTag(s string) string { }) if spaceIndex == -1 { - return s[1 : len(s)-1] + s = s[1 : len(s)-1] + } else { + s = s[1:spaceIndex] } - return s[1:spaceIndex] + if s[len(s)-1] == '/' { + // Self closing. + s = s[:len(s)-1] + } + + return s + } // isClosedByTag reports whether b ends with a closing tag for tagName. @@ -487,7 +496,7 @@ LOOP: } } - if state != 2 { + if state != 2 || lo >= hi { return false } diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go index 11590e0a3..f9c9424cb 100644 --- a/publisher/htmlElementsCollector_test.go +++ b/publisher/htmlElementsCollector_test.go @@ -110,6 +110,9 @@ func TestClassCollector(t *testing.T) { {"DOCTYPE should beskipped", ``, f("", "", "")}, {"Comments should be skipped", ``, f("", "", "")}, {"Comments with elements before and after", `
`, f("div span", "", "")}, + {"Self closing tag", `

`, f("div hr", "", "")}, + // svg with self closing style tag. + {"SVG with self closing style tag", ``, f("g path style svg", "foo", "")}, // Issue #8530 {"Comment with single quote", ``, f("i", "foo", "")}, {"Uppercase tags", `
`, f("div", "", "")}, @@ -174,6 +177,7 @@ func TestEndsWithTag(t *testing.T) { {"match space", "foo< / div>", "div", true}, {"match space 2", "foo< / div \n>", "div", true}, {"match case", "foo", "div", true}, + {"self closing", ``, "div", false}, } { c.Run(test.name, func(c *qt.C) { got := isClosedByTag([]byte(test.s), []byte(test.tagName))