publisher: Some performance tweaks for the HTML elements collector

This commit is contained in:
Bjørn Erik Pedersen 2021-04-20 16:50:03 +02:00
parent bc80022e03
commit ef34dd8f0e
No known key found for this signature in database
GPG key ID: 330E6E2BD4859D8F
2 changed files with 50 additions and 102 deletions

View file

@ -108,13 +108,13 @@ func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlEleme
} }
} }
// Write splits the incoming stream into single html element and writes these into elementSet // Write splits the incoming stream into single html element.
func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) { func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
n = len(p) n = len(p)
i := 0 i := 0
for i < len(p) { for i < len(p) {
// if is not collecting, cycle through byte stream until start bracket "<" is found // If we are not collecting, cycle through byte stream until start bracket "<" is found.
if !w.isCollecting { if !w.isCollecting {
for ; i < len(p); i++ { for ; i < len(p); i++ {
b := p[i] b := p[i]
@ -126,9 +126,9 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
} }
if w.isCollecting { if w.isCollecting {
// if is collecting, cycle through byte stream until end bracket ">" is found // If we are collecting, cycle through byte stream until end bracket ">" is found,
// disregard any ">" if within a quote // disregard any ">" if within a quote,
// write bytes until found to buffer // write bytes until found to buffer.
for ; i < len(p); i++ { for ; i < len(p); i++ {
b := p[i] b := p[i]
w.toggleIfQuote(b) w.toggleIfQuote(b)
@ -141,54 +141,69 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
} }
} }
// if no end bracket ">" is found while collecting, but the stream ended // If no end bracket ">" is found while collecting, but the stream ended
// this could mean we received chunks of a stream from e.g. the minify functionality // this could mean we received chunks of a stream from e.g. the minify functionality
// next if loop will be skipped // next if loop will be skipped.
// at this point we have collected an element line between angle brackets "<" and ">" // At this point we have collected an element line between angle brackets "<" and ">".
if !w.isCollecting { if !w.isCollecting {
s := w.buff.String() if w.buff.Len() == 0 {
w.buff.Reset()
// filter out unwanted tags
// empty string, just in case
// if within preformatted code blocks <pre>, <textarea>, <script>, <style>
// comments and doctype tags
// end tags
switch {
case s == "": // empty string
continue continue
case w.inPreTag != "": // within preformatted code block }
if w.inPreTag != "" { // within preformatted code block
s := w.buff.String()
w.buff.Reset()
if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName { if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
w.inPreTag = "" w.inPreTag = ""
} }
continue continue
case strings.HasPrefix(s, "<!"): // comment or doctype tag
continue
case strings.HasPrefix(s, "</"): // end tag
continue
} }
// check if we have processed this element before. // First check if we have processed this element before.
w.collector.mu.RLock() w.collector.mu.RLock()
seen := w.collector.elementSet[s]
// Work with the bytes slice as long as it's practical,
// to save memory allocations.
b := w.buff.Bytes()
// See https://github.com/dominikh/go-tools/issues/723
//lint:ignore S1030 This construct avoids memory allocation for the string.
seen := w.collector.elementSet[string(b)]
w.collector.mu.RUnlock() w.collector.mu.RUnlock()
if seen { if seen {
w.buff.Reset()
continue continue
} }
// check if a preformatted code block started // Filter out unwanted tags
// (content within preformatted code blocks <pre>, <textarea>, <script>, <style> has already been skipped above):
// comments and doctype tags,
// end tags.
switch {
case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag
w.buff.Reset()
continue
case bytes.HasPrefix(b, []byte("</")): // end tag
w.buff.Reset()
continue
}
s := w.buff.String()
w.buff.Reset()
// Check if a preformatted code block started.
if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) { if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
w.inPreTag = tagName w.inPreTag = tagName
} }
// parse each collected element // Parse each collected element.
el, err := parseHTMLElement(s) el, err := parseHTMLElement(s)
if err != nil { if err != nil {
return n, err return n, err
} }
// write this tag to the element set // Write this tag to the element set.
w.collector.mu.Lock() w.collector.mu.Lock()
w.collector.elementSet[s] = true w.collector.elementSet[s] = true
w.collector.elements = append(w.collector.elements, el) w.collector.elements = append(w.collector.elements, el)
@ -265,17 +280,18 @@ var (
htmlJsonFixer = strings.NewReplacer(", ", "\n") htmlJsonFixer = strings.NewReplacer(", ", "\n")
jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`) jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`)
classAttrRe = regexp.MustCompile(`(?i)^class$|transition`) classAttrRe = regexp.MustCompile(`(?i)^class$|transition`)
)
func parseHTMLElement(elStr string) (el htmlElement, err error) { exceptionList = map[string]bool{
var tagBuffer string = ""
exceptionList := map[string]bool{
"thead": true, "thead": true,
"tbody": true, "tbody": true,
"tfoot": true, "tfoot": true,
"td": true, "td": true,
"tr": true, "tr": true,
} }
)
func parseHTMLElement(elStr string) (el htmlElement, err error) {
var tagBuffer string = ""
tagName, ok := parseStartTag(elStr) tagName, ok := parseStartTag(elStr)
if !ok { if !ok {

View file

@ -14,7 +14,6 @@
package publisher package publisher
import ( import (
"bytes"
"fmt" "fmt"
"strings" "strings"
"testing" "testing"
@ -129,33 +128,8 @@ func TestClassCollector(t *testing.T) {
} }
} }
func BenchmarkClassCollectorWriter(b *testing.B) { func BenchmarkElementsCollectorWriter(b *testing.B) {
const benchHTML = ` const benchHTML = `
<html>
<body id="i1" class="a b c d">
<a class="c d e"></a>
<br>
<a class="c d e"></a>
<a class="c d e"></a>
<br>
<a id="i2" class="c d e f"></a>
<a id="i3" class="c d e"></a>
<a class="c d e"></a>
<br>
<a class="c d e"></a>
<a class="c d e"></a>
<a class="c d e"></a>
<a class="c d e"></a>
</body>
</html>
`
for i := 0; i < b.N; i++ {
w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
fmt.Fprint(w, benchHTML)
}
}
const benchHTML = `
<!DOCTYPE html> <!DOCTYPE html>
<html> <html>
<head> <head>
@ -207,51 +181,9 @@ const benchHTML = `
</body> </body>
</html> </html>
` `
func BenchmarkElementsCollectorWriter(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
fmt.Fprint(w, benchHTML) fmt.Fprint(w, benchHTML)
}
}
func BenchmarkElementsCollectorWriterMinified(b *testing.B) {
b.ReportAllocs()
v := viper.New()
m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
var buf bytes.Buffer
m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML))
b.ResetTimer()
for i := 0; i < b.N; i++ {
w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
fmt.Fprint(w, buf.String())
}
}
func BenchmarkElementsCollectorWriterWithMinifyStream(b *testing.B) {
b.ReportAllocs()
v := viper.New()
m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
b.ResetTimer()
for i := 0; i < b.N; i++ {
w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
m.Minify(media.HTMLType, w, strings.NewReader(benchHTML))
}
}
func BenchmarkElementsCollectorWriterWithMinifyString(b *testing.B) {
b.ReportAllocs()
v := viper.New()
m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
b.ResetTimer()
for i := 0; i < b.N; i++ {
var buf bytes.Buffer
m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML))
w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
fmt.Fprint(w, buf.String())
} }
} }