diff --git a/common/text/transform.go b/common/text/transform.go index 2d51f6c33..f59577803 100644 --- a/common/text/transform.go +++ b/common/text/transform.go @@ -45,25 +45,3 @@ func RemoveAccentsString(s string) string { accentTransformerPool.Put(t) return s } - -// Chunk splits s into strings of size. -func Chunk(s string, size int) []string { - if size >= len(s) { - return []string{s} - } - var chunks []string - chunk := make([]rune, size) - l := 0 - for _, r := range s { - chunk[l] = r - l++ - if l == size { - chunks = append(chunks, string(chunk)) - l = 0 - } - } - if l > 0 { - chunks = append(chunks, string(chunk[:l])) - } - return chunks -} diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go index 1bc1a09bc..9dc28c4c2 100644 --- a/publisher/htmlElementsCollector.go +++ b/publisher/htmlElementsCollector.go @@ -19,51 +19,12 @@ import ( "sort" "strings" "sync" - "unicode" - "unicode/utf8" "golang.org/x/net/html" "github.com/gohugoio/hugo/helpers" ) -const eof = -1 - -var ( - htmlJsonFixer = strings.NewReplacer(", ", "\n") - jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`) - classAttrRe = regexp.MustCompile(`(?i)^class$|transition`) - - skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`) - skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`) - endTagRe = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`) - - exceptionList = map[string]bool{ - "thead": true, - "tbody": true, - "tfoot": true, - "td": true, - "tr": true, - } -) - -func newHTMLElementsCollector() *htmlElementsCollector { - return &htmlElementsCollector{ - elementSet: make(map[string]bool), - } -} - -func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter { - w := &htmlElementsCollectorWriter{ - collector: collector, - state: htmlLexStart, - } - - w.defaultLexElementInside = w.lexElementInside(htmlLexStart) - - return w -} - // HTMLElements holds lists of tags and attribute values for classes and id. type HTMLElements struct { Tags []string `json:"tags"` @@ -87,12 +48,6 @@ func (h *HTMLElements) Sort() { sort.Strings(h.IDs) } -type htmlElement struct { - Tag string - Classes []string - IDs []string -} - type htmlElementsCollector struct { // Contains the raw HTML string. We will get the same element // several times, and want to avoid costly reparsing when this @@ -104,6 +59,12 @@ type htmlElementsCollector struct { mu sync.RWMutex } +func newHTMLElementsCollector() *htmlElementsCollector { + return &htmlElementsCollector{ + elementSet: make(map[string]bool), + } +} + func (c *htmlElementsCollector) getHTMLElements() HTMLElements { var ( classes []string @@ -132,118 +93,114 @@ func (c *htmlElementsCollector) getHTMLElements() HTMLElements { type htmlElementsCollectorWriter struct { collector *htmlElementsCollector + buff bytes.Buffer - r rune // Current rune - width int // The width in bytes of r - input []byte // The current slice written to Write - pos int // The current position in input + isCollecting bool + inPreTag string - err error - - inQuote rune - - buff bytes.Buffer - - // Current state - state htmlCollectorStateFunc - - // Precompiled state funcs - defaultLexElementInside htmlCollectorStateFunc + inQuote bool + quoteValue byte } -// Write collects HTML elements from p. +func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter { + return &htmlElementsCollectorWriter{ + collector: collector, + } +} + +// Write splits the incoming stream into single html element. func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) { n = len(p) - w.input = p - w.pos = 0 + i := 0 - for { - w.r = w.next() - if w.r == eof { - return - } - w.state = w.state(w) - } -} - -func (l *htmlElementsCollectorWriter) backup() { - l.pos -= l.width - l.r, _ = utf8.DecodeRune(l.input[l.pos:]) -} - -func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc { - var s htmlCollectorStateFunc - s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc { - w.buff.WriteRune(w.r) - if condition() { - w.buff.Reset() - return resolve - } - return s - } - return s -} - -func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc { - var s htmlCollectorStateFunc - s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc { - if condition(w.r) { - return resolve - } - return s - } - return s -} - -// Starts with e.g. "
" is found, + // disregard any ">" if within a quote, + // write bytes until found to buffer. + for ; i < len(p); i++ { + b := p[i] + w.toggleIfQuote(b) + w.buff.WriteByte(b) + + if !w.inQuote && b == '>' { + w.endCollecting() + break + } + } } - if w.r == '>' { + // If no end bracket ">" is found while collecting, but the stream ended + // this could mean we received chunks of a stream from e.g. the minify functionality + // next if loop will be skipped. + + // At this point we have collected an element line between angle brackets "<" and ">". + if !w.isCollecting { + if w.buff.Len() == 0 { + continue + } + + if w.inPreTag != "" { // within preformatted code block + s := w.buff.String() + w.buff.Reset() + if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName { + w.inPreTag = "" + } + continue + } + + // First check if we have processed this element before. + w.collector.mu.RLock() // Work with the bytes slice as long as it's practical, // to save memory allocations. b := w.buff.Bytes() - defer func() { - w.buff.Reset() - }() - - // First check if we have processed this element before. - w.collector.mu.RLock() - + // See https://github.com/dominikh/go-tools/issues/723 + //lint:ignore S1030 This construct avoids memory allocation for the string. seen := w.collector.elementSet[string(b)] w.collector.mu.RUnlock() if seen { - return resolve + w.buff.Reset() + continue + } + + // Filter out unwanted tags + // if within preformatted code blocks,