publisher: Exclude comment and doctype elements from writeStats

- Reorder code blocks
- Rename cssClassCollectorWriter to htmlElementsCollectorWriter, as it just collects HTML element information
- Expand benchmark to test for minified and unminified content

Fixes #8396
Fixes #8417
This commit is contained in:
Dirk Olbrich 2021-04-12 23:42:51 +02:00 committed by Bjørn Erik Pedersen
parent 2bb9496ce2
commit bc80022e03
No known key found for this signature in database
GPG key ID: 330E6E2BD4859D8F
3 changed files with 328 additions and 191 deletions

View file

@ -1113,7 +1113,7 @@ ABC.
els := stats.HTMLElements
b.Assert(els.Classes, qt.HasLen, 3606) // (4 * 900) + 4 +2
b.Assert(els.Tags, qt.HasLen, 9)
b.Assert(els.Tags, qt.HasLen, 8)
b.Assert(els.IDs, qt.HasLen, 1)
}
}

View file

@ -20,22 +20,11 @@ import (
"strings"
"sync"
"github.com/gohugoio/hugo/helpers"
"golang.org/x/net/html"
"github.com/gohugoio/hugo/helpers"
)
// newHTMLElementsCollector returns a collector whose elementSet map is
// allocated and ready to be written to.
func newHTMLElementsCollector() *htmlElementsCollector {
	c := new(htmlElementsCollector)
	c.elementSet = make(map[string]bool)
	return c
}
// newHTMLElementsCollectorWriter returns a writer that reports the elements
// it finds to the given collector.
func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *cssClassCollectorWriter {
	w := new(cssClassCollectorWriter)
	w.collector = collector
	return w
}
// HTMLElements holds lists of tags and attribute values for classes and id.
type HTMLElements struct {
Tags []string `json:"tags"`
@ -59,152 +48,6 @@ func (h *HTMLElements) Sort() {
sort.Strings(h.IDs)
}
// cssClassCollectorWriter scans the bytes written to it for HTML elements
// and records them in collector. The scan state is kept in the fields below
// so that an element may be split across several Write calls.
type cssClassCollectorWriter struct {
// collector receives the parsed elements and remembers which raw element
// strings were already handled.
collector *htmlElementsCollector
// buff accumulates the bytes of the element currently being read.
buff bytes.Buffer
// isCollecting is true while the writer is inside an element, i.e. after a
// "<" and before the matching unquoted ">".
isCollecting bool
// inPreTag holds the tag name of the element whose content is currently
// being skipped (see isPreFormatted); it is empty otherwise.
inPreTag string
// inQuote is true while the writer is inside a quoted section of an element.
inQuote bool
// quoteValue is the quote character that opened the current quoted section.
quoteValue byte
}
// Write scans p for HTML elements, i.e. the text from a "<" up to (but not
// including) the next ">" that is not inside quotes, parses every element it
// has not seen before and appends it to the collector.
//
// An element may span several calls: while no closing ">" has been found the
// partial element stays in w.buff and scanning resumes on the next call.
// Write always reports len(p) bytes as written and never sets err.
func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
n = len(p)
i := 0
for i < len(p) {
// Outside of an element: skip ahead to the next "<". The index is left on
// the "<" so that it becomes the first byte of the collected element.
if !w.isCollecting {
for ; i < len(p); i++ {
b := p[i]
if b == '<' {
w.startCollecting()
break
}
}
}
if w.isCollecting {
// Inside an element: buffer bytes until a ">" outside of quotes is found.
// The ">" itself is not written to the buffer.
for ; i < len(p); i++ {
b := p[i]
w.toggleIfQuote(b)
if !w.inQuote && b == '>' {
w.endCollecting()
break
}
w.buff.WriteByte(b)
}
// Only true when the closing ">" was found above; otherwise p is exhausted
// and the partial element is kept for the next call.
if !w.isCollecting {
// Inside pre/textarea/script content: discard everything and only watch
// for the end tag that closes the skipped element.
if w.inPreTag != "" {
s := w.buff.String()
if tagName, isEnd := w.parseEndTag(s); isEnd && w.inPreTag == tagName {
w.inPreTag = ""
}
w.buff.Reset()
continue
}
// First check if we have processed this element before.
w.collector.mu.RLock()
// See https://github.com/dominikh/go-tools/issues/723
//lint:ignore S1030 This construct avoids memory allocation for the string.
seen := w.collector.elementSet[string(w.buff.Bytes())]
w.collector.mu.RUnlock()
if seen {
w.buff.Reset()
continue
}
s := w.buff.String()
w.buff.Reset()
// End tags are not recorded.
if strings.HasPrefix(s, "</") {
continue
}
// key is the raw element text; s is re-assigned to the stand-in element
// that is handed to the parser, tagName keeps the original tag name.
key := s
s, tagName := w.insertStandinHTMLElement(s)
el := parseHTMLElement(s)
el.Tag = tagName
// From here on, skip everything up to the matching end tag.
if w.isPreFormatted(tagName) {
w.inPreTag = tagName
}
// Remember the raw element so it is not parsed again, and store the parsed
// result unless it has no tag name.
w.collector.mu.Lock()
w.collector.elementSet[key] = true
if el.Tag != "" {
w.collector.elements = append(w.collector.elements, el)
}
w.collector.mu.Unlock()
}
}
}
return
}
// isPreFormatted reports whether s is the name of an element (pre, textarea
// or script) whose content does not need to be searched for HTML elements.
func (c *cssClassCollectorWriter) isPreFormatted(s string) bool {
	switch s {
	case "pre", "textarea", "script":
		return true
	}
	return false
}
// insertStandinHTMLElement replaces the first occurrence of the tag name in
// el with "div" and returns the rewritten element together with the
// lower-cased original tag name.
//
// The net/html parser does not handle single table elements as input, e.g.
// tbody. Only the element/class/ids are of interest, so the original tag name
// is stored away and the element is passed off as a <div>.
func (c *cssClassCollectorWriter) insertStandinHTMLElement(el string) (string, string) {
	// Drop the first byte (expected to be the opening "<") and keep
	// everything up to the first space.
	name := el[1:]
	if end := strings.IndexByte(name, ' '); end >= 0 {
		name = name[:end]
	}
	name = strings.Trim(name, "\n ")

	return strings.Replace(el, name, "div", 1), strings.ToLower(name)
}
// parseEndTag returns the lower-cased, whitespace-trimmed tag name of s and
// true when s starts with "</"; otherwise it returns "" and false.
func (c *cssClassCollectorWriter) parseEndTag(s string) (string, bool) {
	const prefix = "</"
	if !strings.HasPrefix(s, prefix) {
		return "", false
	}

	name := strings.TrimSuffix(s[len(prefix):], ">")
	name = strings.TrimSpace(name)
	return strings.ToLower(name), true
}
// endCollecting marks the current element as finished and clears the quote
// state.
func (c *cssClassCollectorWriter) endCollecting() {
	c.isCollecting, c.inQuote = false, false
}
// startCollecting sets isCollecting, marking that an element has started.
func (c *cssClassCollectorWriter) startCollecting() {
c.isCollecting = true
}
// toggleIfQuote updates the quote state for the byte b. A quote character
// opens a quoted section when none is open, and closes the open section only
// when it is the same character that opened it. Other bytes are ignored.
func (c *cssClassCollectorWriter) toggleIfQuote(b byte) {
	if !isQuote(b) {
		return
	}

	switch {
	case !c.inQuote:
		c.inQuote = true
		c.quoteValue = b
	case b == c.quoteValue:
		c.inQuote = false
	}
}
// htmlElement holds what was collected from a single HTML element.
type htmlElement struct {
// Tag is the element's tag name.
Tag string
// Classes lists the class names found on the element.
Classes []string
// IDs lists the id values found on the element.
IDs []string
}
type htmlElementsCollector struct {
// Contains the raw HTML string. We will get the same element
// several times, and want to avoid costly reparsing when this
@ -216,6 +59,12 @@ type htmlElementsCollector struct {
mu sync.RWMutex
}
// newHTMLElementsCollector returns a collector whose elementSet map is
// allocated and ready to be written to.
func newHTMLElementsCollector() *htmlElementsCollector {
	c := new(htmlElementsCollector)
	c.elementSet = make(map[string]bool)
	return c
}
func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
var (
classes []string
@ -242,21 +91,205 @@ func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
return els
}
// htmlElementsCollectorWriter scans the bytes written to it for HTML
// elements and records them in collector. The scan state is kept in the
// fields below so that an element may be split across several Write calls.
type htmlElementsCollectorWriter struct {
// collector receives the parsed elements and remembers which raw element
// strings were already handled.
collector *htmlElementsCollector
// buff accumulates the bytes of the element currently being read.
buff bytes.Buffer
// isCollecting is true while the writer is inside an element, i.e. after a
// "<" and before the matching unquoted ">".
isCollecting bool
// inPreTag holds the tag name of the element whose content is currently
// being skipped (see isPreFormatted); it is empty otherwise.
inPreTag string
// inQuote is true while the writer is inside a quoted section of an element.
inQuote bool
// quoteValue is the quote character that opened the current quoted section.
quoteValue byte
}
// newHTMLElementsCollectorWriter returns a writer that reports the elements
// it finds to the given collector.
func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
	w := new(htmlElementsCollectorWriter)
	w.collector = collector
	return w
}
// Write splits the incoming stream into single HTML elements, i.e. the text
// from a "<" up to and including the next ">" that is not inside quotes, and
// records every start tag it has not seen before in the collector.
//
// An element may span several calls (e.g. when the minifier streams its
// output in chunks): while no closing ">" has been found the partial element
// stays in w.buff and scanning resumes on the next call.
//
// Write reports len(p) bytes as written. The error is non-nil only when an
// element could not be parsed.
func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
	n = len(p)
	i := 0

	for i < len(p) {
		// Outside of an element: skip ahead to the next "<". The index is left
		// on the "<" so that it becomes the first byte of the collected element.
		if !w.isCollecting {
			for ; i < len(p); i++ {
				if p[i] == '<' {
					w.startCollecting()
					break
				}
			}
		}

		// Inside an element: buffer bytes up to and including the closing ">",
		// disregarding any ">" within a quote.
		if w.isCollecting {
			for ; i < len(p); i++ {
				b := p[i]
				w.toggleIfQuote(b)
				w.buff.WriteByte(b)
				if !w.inQuote && b == '>' {
					w.endCollecting()
					break
				}
			}
		}

		// Still collecting means p ended in the middle of an element; keep the
		// partial element buffered for the next call.
		if w.isCollecting {
			continue
		}

		// At this point the buffer holds one complete element, or nothing when
		// p contained no further "<".
		s := w.buff.String()
		w.buff.Reset()

		// Filter out everything that is not a start tag of interest.
		switch {
		case s == "": // nothing collected
			continue
		case w.inPreTag != "": // within <pre>, <textarea>, <script> or <style>
			if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
				w.inPreTag = ""
			}
			continue
		case strings.HasPrefix(s, "<!"): // comment or doctype
			continue
		case strings.HasPrefix(s, "</"): // end tag
			continue
		}

		// Check whether a preformatted block starts here. This must happen
		// before the "seen" shortcut below: an identical start tag (e.g. a
		// second plain <script>) has been seen already, but its content still
		// has to be skipped.
		if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
			w.inPreTag = tagName
		}

		// Check if we have processed this element before.
		w.collector.mu.RLock()
		seen := w.collector.elementSet[s]
		w.collector.mu.RUnlock()
		if seen {
			continue
		}

		// Parse the collected element.
		el, err := parseHTMLElement(s)
		if err != nil {
			return n, err
		}

		// Remember the raw element and store the parsed result.
		w.collector.mu.Lock()
		w.collector.elementSet[s] = true
		w.collector.elements = append(w.collector.elements, el)
		w.collector.mu.Unlock()
	}

	return n, nil
}
// startCollecting sets isCollecting, marking that an element has started.
func (c *htmlElementsCollectorWriter) startCollecting() {
c.isCollecting = true
}
// endCollecting marks the current element as finished and clears the quote
// state.
func (c *htmlElementsCollectorWriter) endCollecting() {
	c.isCollecting, c.inQuote = false, false
}
// toggleIfQuote updates the quote state for the byte b. A quote character
// opens a quoted section when none is open, and closes the open section only
// when it is the same character that opened it. Other bytes are ignored.
func (c *htmlElementsCollectorWriter) toggleIfQuote(b byte) {
	if !isQuote(b) {
		return
	}

	switch {
	case !c.inQuote:
		c.inQuote = true
		c.quoteValue = b
	case b == c.quoteValue:
		c.inQuote = false
	}
}
// isQuote reports whether b is a double or a single quote character.
func isQuote(b byte) bool {
	switch b {
	case '"', '\'':
		return true
	default:
		return false
	}
}
// parseStartTag extracts the tag name from the start tag s, e.g. "div" from
// `<DIV class="a">`. The name is lower-cased.
//
// It returns "" and false when s is an end tag ("</...") or a comment or
// doctype ("<!..."); for any other input the second result is true.
func parseStartTag(s string) (string, bool) {
	if strings.HasPrefix(s, "</") || strings.HasPrefix(s, "<!") {
		return "", false
	}

	s = strings.TrimPrefix(s, "<")
	s = strings.TrimSuffix(s, ">")

	// The tag name ends at the first whitespace character. Attributes may be
	// separated from the name by a tab or a line break, not only by a space,
	// e.g. when each attribute is written on its own line.
	if end := strings.IndexAny(s, " \t\r\n\f"); end != -1 {
		s = s[:end]
	}

	return strings.ToLower(strings.TrimSpace(s)), true
}
// parseEndTag returns the lower-cased, whitespace-trimmed tag name of s and
// true when s starts with "</"; otherwise it returns "" and false.
func parseEndTag(s string) (string, bool) {
	const prefix = "</"
	if !strings.HasPrefix(s, prefix) {
		return "", false
	}

	name := strings.TrimSuffix(s[len(prefix):], ">")
	name = strings.TrimSpace(name)
	return strings.ToLower(name), true
}
// isPreFormatted reports whether s is the name of an element (pre, textarea,
// script or style) whose content does not need to be searched for HTML
// elements. The comparison is case-sensitive.
func isPreFormatted(s string) bool {
	switch s {
	case "pre", "textarea", "script", "style":
		return true
	}
	return false
}
// htmlElement holds what was collected from a single HTML element.
type htmlElement struct {
// Tag is the element's tag name.
Tag string
// Classes lists the class names found on the element.
Classes []string
// IDs lists the id values found on the element.
IDs []string
}
// Helpers for extracting class names from attribute values.
// NOTE(review): presumably used for object-style bindings such as
// x-bind:class="{ 'a': cond, 'b': other }" — confirm against parseHTMLElement.
var (
// htmlJsonFixer rewrites every ", " separator into a line break.
htmlJsonFixer = strings.NewReplacer(", ", "\n")
// jsonAttrRe matches a key, optionally wrapped in single quotes, followed by
// ":" and the rest of the line; the key is captured in group 1.
jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`)
// classAttrRe matches, case-insensitively, the exact name "class" or any
// name containing "transition".
classAttrRe = regexp.MustCompile(`(?i)^class$|transition`)
)
func parseHTMLElement(elStr string) (el htmlElement) {
elStr = strings.TrimSpace(elStr)
if !strings.HasSuffix(elStr, ">") {
elStr += ">"
func parseHTMLElement(elStr string) (el htmlElement, err error) {
var tagBuffer string = ""
exceptionList := map[string]bool{
"thead": true,
"tbody": true,
"tfoot": true,
"td": true,
"tr": true,
}
tagName, ok := parseStartTag(elStr)
if !ok {
return
}
// The net/html parser does not handle single table elements as input, e.g. tbody.
// We only care about the element/class/ids, so just store away the original tag name
// and pretend it's a <div>.
if exceptionList[tagName] {
tagBuffer = tagName
elStr = strings.Replace(elStr, tagName, "div", 1)
}
n, err := html.Parse(strings.NewReader(elStr))
if err != nil {
return
@ -287,7 +320,6 @@ func parseHTMLElement(elStr string) (el htmlElement) {
val = strings.Join(lines, "\n")
val = jsonAttrRe.ReplaceAllString(val, "$1")
el.Classes = append(el.Classes, strings.Fields(val)...)
}
}
}
@ -301,5 +333,10 @@ func parseHTMLElement(elStr string) (el htmlElement) {
walk(n)
// did we replaced the start tag?
if tagBuffer != "" {
el.Tag = tagBuffer
}
return
}

View file

@ -14,17 +14,17 @@
package publisher
import (
"bytes"
"fmt"
"strings"
"testing"
"github.com/gohugoio/hugo/minifiers"
"github.com/gohugoio/hugo/media"
"github.com/gohugoio/hugo/minifiers"
"github.com/gohugoio/hugo/output"
"github.com/spf13/viper"
qt "github.com/frankban/quicktest"
"github.com/spf13/viper"
)
func TestClassCollector(t *testing.T) {
@ -50,7 +50,6 @@ func TestClassCollector(t *testing.T) {
skipMinifyTest := map[string]bool{
"Script tags content should be skipped": true, // https://github.com/tdewolff/minify/issues/396
}
for _, test := range []struct {
@ -62,56 +61,57 @@ func TestClassCollector(t *testing.T) {
{"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")},
{"single quote", `<body class='b a'></body>`, f("body", "a b", "")},
{"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")},
{"thead", `
https://github.com/gohugoio/hugo/issues/7318
<table class="cl1">
// https://github.com/gohugoio/hugo/issues/7318
{"thead", `<table class="cl1">
<thead class="cl2"><tr class="cl3"><td class="cl4"></td></tr></thead>
<tbody class="cl5"><tr class="cl6"><td class="cl7"></td></tr></tbody>
</table>`, f("table tbody td thead tr", "cl1 cl2 cl3 cl4 cl5 cl6 cl7", "")},
// https://github.com/gohugoio/hugo/issues/7161
{"minified a href", `<a class="b a" href=/></a>`, f("a", "a b", "")},
{"AlpineJS bind 1", `<body>
<div x-bind:class="{
<div x-bind:class="{
'class1': data.open,
'class2 class3': data.foo == 'bar'
}">
</div>
</body>`, f("body div", "class1 class2 class3", "")},
{
"Alpine bind 2", `<div x-bind:class="{ 'bg-black': filter.checked }"
class="inline-block mr-1 mb-2 rounded bg-gray-300 px-2 py-2">FOO</div>`,
</div>
</body>`, f("body div", "class1 class2 class3", "")},
{"AlpineJS bind 2", `<div x-bind:class="{ 'bg-black': filter.checked }" class="inline-block mr-1 mb-2 rounded bg-gray-300 px-2 py-2">FOO</div>`,
f("div", "bg-black bg-gray-300 inline-block mb-2 mr-1 px-2 py-2 rounded", ""),
},
{"Alpine bind 3", `<div x-bind:class="{ 'text-gray-800': !checked, 'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")},
{"Alpine bind 4", `<div x-bind:class="{ 'text-gray-800': !checked,
{"AlpineJS bind 3", `<div x-bind:class="{ 'text-gray-800': !checked, 'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")},
{"AlpineJS bind 4", `<div x-bind:class="{ 'text-gray-800': !checked,
'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")},
{"Alpine bind 5", `<a x-bind:class="{
{"AlpineJS bind 5", `<a x-bind:class="{
'text-a': a && b,
'text-b': !a && b || c,
'pl-3': a === 1,
pl-2: b == 3,
'text-gray-600': (a > 1)
}" class="block w-36 cursor-pointer pr-3 no-underline capitalize"></a>`, f("a", "block capitalize cursor-pointer no-underline pl-2 pl-3 pr-3 text-a text-b text-gray-600 w-36", "")},
{"Alpine transition 1", `<div x-transition:enter-start="opacity-0 transform mobile:-translate-x-8 sm:-translate-y-8">`, f("div", "mobile:-translate-x-8 opacity-0 sm:-translate-y-8 transform", "")},
{"AlpineJS transition 1", `<div x-transition:enter-start="opacity-0 transform mobile:-translate-x-8 sm:-translate-y-8">`, f("div", "mobile:-translate-x-8 opacity-0 sm:-translate-y-8 transform", "")},
{"Vue bind", `<div v-bind:class="{ active: isActive }"></div>`, f("div", "active", "")},
// Issue #7746
{"Apostrophe inside attribute value", `<a class="missingclass" title="Plus d'information">my text</a><div></div>`, f("a div", "missingclass", "")},
// Issue #7567
{"Script tags content should be skipped", `<script><span>foo</span><span>bar</span></script><div class="foo"></div>`, f("div script", "foo", "")},
{"Style tags content should be skipped", `<style>p{color: red;font-size: 20px;}</style><div class="foo"></div>`, f("div style", "foo", "")},
{"Pre tags content should be skipped", `<pre class="preclass"><span>foo</span><span>bar</span></pre><div class="foo"></div>`, f("div pre", "foo preclass", "")},
{"Textare tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
{"Textarea tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
{"DOCTYPE should beskipped", `<!DOCTYPE html>`, f("", "", "")},
{"Comments should be skipped", `<!-- example comment -->`, f("", "", "")},
// Issue #8417
{"Tabs inline", `<hr id="a" class="foo"><div class="bar">d</div>`, f("div hr", "bar foo", "a")},
{"Tabs on multiple rows", `<form
id="a"
action="www.example.com"
method="post"
></form>
<div id="b" class="foo">d</div>`, f("div form", "foo", "a b")},
} {
for _, minify := range []bool{false, true} {
c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) {
w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
if minify {
if skipMinifyTest[test.name] {
c.Skip("skip minify test")
@ -152,6 +152,106 @@ func BenchmarkClassCollectorWriter(b *testing.B) {
for i := 0; i < b.N; i++ {
w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
fmt.Fprint(w, benchHTML)
}
}
// benchHTML is the HTML document used as input by the collector writer
// benchmarks. It contains a doctype, a style block, repeated identical
// elements and a table with thead, tbody and tfoot sections.
const benchHTML = `
<!DOCTYPE html>
<html>
<head>
<title>title</title>
<style>
a {color: red;}
.c {color: blue;}
</style>
</head>
<body id="i1" class="a b c d">
<a class="c d e"></a>
<hr>
<a class="c d e"></a>
<a class="c d e"></a>
<hr>
<a id="i2" class="c d e f"></a>
<a id="i3" class="c d e"></a>
<a class="c d e"></a>
<p>To force<br> line breaks<br> in a text,<br> use the br<br> element.</p>
<hr>
<a class="c d e"></a>
<a class="c d e"></a>
<a class="c d e"></a>
<a class="c d e"></a>
<table>
<thead class="ch">
<tr>
<th>Month</th>
<th>Savings</th>
</tr>
</thead>
<tbody class="cb">
<tr>
<td>January</td>
<td>$100</td>
</tr>
<tr>
<td>February</td>
<td>$200</td>
</tr>
</tbody>
<tfoot class="cf">
<tr>
<td></td>
<td>$300</td>
</tr>
</tfoot>
</table>
</body>
</html>
`
func BenchmarkElementsCollectorWriter(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
fmt.Fprint(w, benchHTML)
}
}
// BenchmarkElementsCollectorWriterMinified measures writing a minified copy
// of benchHTML to a fresh collector writer on every iteration. The document
// is minified once, before the timer starts.
func BenchmarkElementsCollectorWriterMinified(b *testing.B) {
	b.ReportAllocs()

	v := viper.New()
	m, err := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
	if err != nil {
		b.Fatal(err)
	}

	// Fail loudly instead of silently benchmarking an empty document.
	var buf bytes.Buffer
	if err := m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML)); err != nil {
		b.Fatal(err)
	}
	// Convert once so the timed loop does not pay for copying the buffer.
	minified := buf.String()

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
		fmt.Fprint(w, minified)
	}
}
// BenchmarkElementsCollectorWriterWithMinifyStream measures minifying
// benchHTML with the collector writer as the minifier's destination, so the
// writer receives the minified output as it is produced.
func BenchmarkElementsCollectorWriterWithMinifyStream(b *testing.B) {
	b.ReportAllocs()

	v := viper.New()
	m, err := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
	if err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
		// A failing minifier would make the measurement meaningless.
		if err := m.Minify(media.HTMLType, w, strings.NewReader(benchHTML)); err != nil {
			b.Fatal(err)
		}
	}
}
// BenchmarkElementsCollectorWriterWithMinifyString measures minifying
// benchHTML into a buffer and then writing the buffered result to a fresh
// collector writer. Both steps are part of every timed iteration.
func BenchmarkElementsCollectorWriterWithMinifyString(b *testing.B) {
	b.ReportAllocs()

	v := viper.New()
	m, err := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
	if err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		var buf bytes.Buffer
		// A failing minifier would make the measurement meaningless.
		if err := m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML)); err != nil {
			b.Fatal(err)
		}
		w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
		fmt.Fprint(w, buf.String())
	}
}