diff --git a/hugolib/site_test.go b/hugolib/site_test.go index cd7ce51f8..365679a32 100644 --- a/hugolib/site_test.go +++ b/hugolib/site_test.go @@ -1113,7 +1113,7 @@ ABC. els := stats.HTMLElements b.Assert(els.Classes, qt.HasLen, 3606) // (4 * 900) + 4 +2 - b.Assert(els.Tags, qt.HasLen, 9) + b.Assert(els.Tags, qt.HasLen, 8) b.Assert(els.IDs, qt.HasLen, 1) } } diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go index d9479aafa..9f4be1ff5 100644 --- a/publisher/htmlElementsCollector.go +++ b/publisher/htmlElementsCollector.go @@ -20,22 +20,11 @@ import ( "strings" "sync" - "github.com/gohugoio/hugo/helpers" "golang.org/x/net/html" + + "github.com/gohugoio/hugo/helpers" ) -func newHTMLElementsCollector() *htmlElementsCollector { - return &htmlElementsCollector{ - elementSet: make(map[string]bool), - } -} - -func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *cssClassCollectorWriter { - return &cssClassCollectorWriter{ - collector: collector, - } -} - // HTMLElements holds lists of tags and attribute values for classes and id. type HTMLElements struct { Tags []string `json:"tags"` @@ -59,152 +48,6 @@ func (h *HTMLElements) Sort() { sort.Strings(h.IDs) } -type cssClassCollectorWriter struct { - collector *htmlElementsCollector - buff bytes.Buffer - - isCollecting bool - inPreTag string - - inQuote bool - quoteValue byte -} - -func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) { - n = len(p) - i := 0 - - for i < len(p) { - if !w.isCollecting { - for ; i < len(p); i++ { - b := p[i] - if b == '<' { - w.startCollecting() - break - } - } - } - - if w.isCollecting { - for ; i < len(p); i++ { - b := p[i] - w.toggleIfQuote(b) - if !w.inQuote && b == '>' { - w.endCollecting() - break - } - w.buff.WriteByte(b) - } - - if !w.isCollecting { - if w.inPreTag != "" { - s := w.buff.String() - if tagName, isEnd := w.parseEndTag(s); isEnd && w.inPreTag == tagName { - w.inPreTag = "" - } - w.buff.Reset() - continue - } - - // First check if we have processed this element before. - w.collector.mu.RLock() - - // See https://github.com/dominikh/go-tools/issues/723 - //lint:ignore S1030 This construct avoids memory allocation for the string. - seen := w.collector.elementSet[string(w.buff.Bytes())] - w.collector.mu.RUnlock() - if seen { - w.buff.Reset() - continue - } - - s := w.buff.String() - - w.buff.Reset() - - if strings.HasPrefix(s, ". -func (c *cssClassCollectorWriter) insertStandinHTMLElement(el string) (string, string) { - tag := el[1:] - spacei := strings.Index(tag, " ") - if spacei != -1 { - tag = tag[:spacei] - } - tag = strings.Trim(tag, "\n ") - newv := strings.Replace(el, tag, "div", 1) - return newv, strings.ToLower(tag) -} - -func (c *cssClassCollectorWriter) parseEndTag(s string) (string, bool) { - if !strings.HasPrefix(s, "") - return strings.ToLower(strings.TrimSpace(s)), true -} - -func (c *cssClassCollectorWriter) endCollecting() { - c.isCollecting = false - c.inQuote = false - -} - -func (c *cssClassCollectorWriter) startCollecting() { - c.isCollecting = true - -} - -func (c *cssClassCollectorWriter) toggleIfQuote(b byte) { - if isQuote(b) { - if c.inQuote && b == c.quoteValue { - c.inQuote = false - } else if !c.inQuote { - c.inQuote = true - c.quoteValue = b - } - } -} - -type htmlElement struct { - Tag string - Classes []string - IDs []string -} - type htmlElementsCollector struct { // Contains the raw HTML string. We will get the same element // several times, and want to avoid costly reparsing when this @@ -216,6 +59,12 @@ type htmlElementsCollector struct { mu sync.RWMutex } +func newHTMLElementsCollector() *htmlElementsCollector { + return &htmlElementsCollector{ + elementSet: make(map[string]bool), + } +} + func (c *htmlElementsCollector) getHTMLElements() HTMLElements { var ( classes []string @@ -242,21 +91,205 @@ func (c *htmlElementsCollector) getHTMLElements() HTMLElements { return els } +type htmlElementsCollectorWriter struct { + collector *htmlElementsCollector + buff bytes.Buffer + + isCollecting bool + inPreTag string + + inQuote bool + quoteValue byte +} + +func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter { + return &htmlElementsCollectorWriter{ + collector: collector, + } +} + +// Write splits the incoming stream into single html element and writes these into elementSet +func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) { + n = len(p) + i := 0 + + for i < len(p) { + // if is not collecting, cycle through byte stream until start bracket "<" is found + if !w.isCollecting { + for ; i < len(p); i++ { + b := p[i] + if b == '<' { + w.startCollecting() + break + } + } + } + + if w.isCollecting { + // if is collecting, cycle through byte stream until end bracket ">" is found + // disregard any ">" if within a quote + // write bytes until found to buffer + for ; i < len(p); i++ { + b := p[i] + w.toggleIfQuote(b) + w.buff.WriteByte(b) + + if !w.inQuote && b == '>' { + w.endCollecting() + break + } + } + } + + // if no end bracket ">" is found while collecting, but the stream ended + // this could mean we received chunks of a stream from e.g. the minify functionality + // next if loop will be skipped + + // at this point we have collected an element line between angle brackets "<" and ">" + if !w.isCollecting { + s := w.buff.String() + w.buff.Reset() + + // filter out unwanted tags + // empty string, just in case + // if within preformatted code blocks
, 
`, f("div textarea", "foo textareaclass", "")}, + {"Textarea tags content should be skipped", `
`, f("div textarea", "foo textareaclass", "")}, + {"DOCTYPE should beskipped", ``, f("", "", "")}, + {"Comments should be skipped", ``, f("", "", "")}, + // Issue #8417 + {"Tabs inline", `
d
`, f("div hr", "bar foo", "a")}, + {"Tabs on multiple rows", `
+
d
`, f("div form", "foo", "a b")}, } { for _, minify := range []bool{false, true} { c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) { w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) - if minify { if skipMinifyTest[test.name] { c.Skip("skip minify test") @@ -152,6 +152,106 @@ func BenchmarkClassCollectorWriter(b *testing.B) { for i := 0; i < b.N; i++ { w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) fmt.Fprint(w, benchHTML) - + } +} + +const benchHTML = ` + + + +title + + + + +
+ + +
+ + + +

To force
line breaks
in a text,
use the br
element.

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
MonthSavings
January$100
February$200
$300
+ + +` + +func BenchmarkElementsCollectorWriter(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) + fmt.Fprint(w, benchHTML) + } +} + +func BenchmarkElementsCollectorWriterMinified(b *testing.B) { + b.ReportAllocs() + v := viper.New() + m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v) + var buf bytes.Buffer + m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML)) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) + fmt.Fprint(w, buf.String()) + } +} + +func BenchmarkElementsCollectorWriterWithMinifyStream(b *testing.B) { + b.ReportAllocs() + v := viper.New() + m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) + m.Minify(media.HTMLType, w, strings.NewReader(benchHTML)) + } +} + +func BenchmarkElementsCollectorWriterWithMinifyString(b *testing.B) { + b.ReportAllocs() + v := viper.New() + m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + var buf bytes.Buffer + m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML)) + w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) + fmt.Fprint(w, buf.String()) } }