// Copyright 2020 The Hugo Authors. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package publisher import ( "bytes" "regexp" "sort" "strings" "sync" "unicode" "unicode/utf8" "golang.org/x/net/html" "github.com/gohugoio/hugo/helpers" ) const eof = -1 var ( htmlJsonFixer = strings.NewReplacer(", ", "\n") jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`) classAttrRe = regexp.MustCompile(`(?i)^class$|transition`) skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`) skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`) endTagRe = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`) exceptionList = map[string]bool{ "thead": true, "tbody": true, "tfoot": true, "td": true, "tr": true, } ) func newHTMLElementsCollector() *htmlElementsCollector { return &htmlElementsCollector{ elementSet: make(map[string]bool), } } func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter { w := &htmlElementsCollectorWriter{ collector: collector, state: htmlLexStart, } w.defaultLexElementInside = w.lexElementInside(htmlLexStart) return w } // HTMLElements holds lists of tags and attribute values for classes and id. type HTMLElements struct { Tags []string `json:"tags"` Classes []string `json:"classes"` IDs []string `json:"ids"` } func (h *HTMLElements) Merge(other HTMLElements) { h.Tags = append(h.Tags, other.Tags...) h.Classes = append(h.Classes, other.Classes...) h.IDs = append(h.IDs, other.IDs...) h.Tags = helpers.UniqueStringsReuse(h.Tags) h.Classes = helpers.UniqueStringsReuse(h.Classes) h.IDs = helpers.UniqueStringsReuse(h.IDs) } func (h *HTMLElements) Sort() { sort.Strings(h.Tags) sort.Strings(h.Classes) sort.Strings(h.IDs) } type htmlElement struct { Tag string Classes []string IDs []string } type htmlElementsCollector struct { // Contains the raw HTML string. We will get the same element // several times, and want to avoid costly reparsing when this // is used for aggregated data only. elementSet map[string]bool elements []htmlElement mu sync.RWMutex } func (c *htmlElementsCollector) getHTMLElements() HTMLElements { var ( classes []string ids []string tags []string ) for _, el := range c.elements { classes = append(classes, el.Classes...) ids = append(ids, el.IDs...) tags = append(tags, el.Tag) } classes = helpers.UniqueStringsSorted(classes) ids = helpers.UniqueStringsSorted(ids) tags = helpers.UniqueStringsSorted(tags) els := HTMLElements{ Classes: classes, IDs: ids, Tags: tags, } return els } type htmlElementsCollectorWriter struct { collector *htmlElementsCollector r rune // Current rune width int // The width in bytes of r input []byte // The current slice written to Write pos int // The current position in input err error inQuote rune buff bytes.Buffer // Current state state htmlCollectorStateFunc // Precompiled state funcs defaultLexElementInside htmlCollectorStateFunc } // Write collects HTML elements from p, which must contain complete runes. func (w *htmlElementsCollectorWriter) Write(p []byte) (int, error) { if p == nil { return 0, nil } w.input = p for { w.r = w.next() if w.r == eof || w.r == utf8.RuneError { break } w.state = w.state(w) } w.pos = 0 w.input = nil return len(p), nil } func (l *htmlElementsCollectorWriter) backup() { l.pos -= l.width l.r, _ = utf8.DecodeRune(l.input[l.pos:]) } func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc { var s htmlCollectorStateFunc s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc { w.buff.WriteRune(w.r) if condition() { w.buff.Reset() return resolve } return s } return s } func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc { var s htmlCollectorStateFunc s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc { if condition(w.r) { return resolve } return s } return s } // Starts with e.g. "
' { // Work with the bytes slice as long as it's practical, // to save memory allocations. b := w.buff.Bytes() defer func() { w.buff.Reset() }() // First check if we have processed this element before. w.collector.mu.RLock() seen := w.collector.elementSet[string(b)] w.collector.mu.RUnlock() if seen { return resolve } s := w.buff.String() if s == "" { return resolve } // Parse each collected element. el, err := parseHTMLElement(s) if err != nil { w.err = err return resolve } // Write this tag to the element set. w.collector.mu.Lock() w.collector.elementSet[s] = true w.collector.elements = append(w.collector.elements, el) w.collector.mu.Unlock() return resolve } return s } return s } func (l *htmlElementsCollectorWriter) next() rune { if l.pos >= len(l.input) { l.width = 0 return eof } runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:]) l.width = runeWidth l.pos += l.width return runeValue } // returns the next state in HTML element scanner. type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc // At "<", buffer empty. // Potentially starting a HTML element. func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc { if w.r == '>' || unicode.IsSpace(w.r) { if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("")) { w.buff.Reset() return htmlLexStart } tagName := w.buff.Bytes()[1:] switch { case skipInnerElementRe.Match(tagName): // pre, script etc. We collect classes etc. on the surrounding // element, but skip the inner content. w.backup() // tagName will be overwritten, so make a copy. tagNameCopy := make([]byte, len(tagName)) copy(tagNameCopy, tagName) return w.lexElementInside( w.consumeBuffUntil( func() bool { if w.r != '>' { return false } m := endTagRe.FindSubmatch(w.buff.Bytes()) if m == nil { return false } return bytes.EqualFold(m[1], tagNameCopy) }, htmlLexStart, )) case skipAllElementRe.Match(tagName): // E.g. "' }, htmlLexStart) default: w.backup() return w.defaultLexElementInside } } w.buff.WriteRune(w.r) // If it's a comment, skip to its end. if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("")) { // Done, start looking for HTML elements again. return htmlLexStart } return htmlLexToEndOfComment } func parseHTMLElement(elStr string) (el htmlElement, err error) { tagName := parseStartTag(elStr) el.Tag = strings.ToLower(tagName) tagNameToParse := el.Tag // The net/html parser does not handle single table elements as input, e.g. tbody. // We only care about the element/class/ids, so just store away the original tag name // and pretend it's a