// Copyright 2020 The Hugo Authors. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package publisher import ( "bytes" "regexp" "sort" "strings" "sync" "github.com/gohugoio/hugo/helpers" "golang.org/x/net/html" ) func newHTMLElementsCollector() *htmlElementsCollector { return &htmlElementsCollector{ elementSet: make(map[string]bool), } } func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *cssClassCollectorWriter { return &cssClassCollectorWriter{ collector: collector, } } // HTMLElements holds lists of tags and attribute values for classes and id. type HTMLElements struct { Tags []string `json:"tags"` Classes []string `json:"classes"` IDs []string `json:"ids"` } func (h *HTMLElements) Merge(other HTMLElements) { h.Tags = append(h.Tags, other.Tags...) h.Classes = append(h.Classes, other.Classes...) h.IDs = append(h.IDs, other.IDs...) h.Tags = helpers.UniqueStringsReuse(h.Tags) h.Classes = helpers.UniqueStringsReuse(h.Classes) h.IDs = helpers.UniqueStringsReuse(h.IDs) } func (h *HTMLElements) Sort() { sort.Strings(h.Tags) sort.Strings(h.Classes) sort.Strings(h.IDs) } type cssClassCollectorWriter struct { collector *htmlElementsCollector buff bytes.Buffer isCollecting bool inPreTag string inQuote bool quoteValue byte } func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) { n = len(p) i := 0 for i < len(p) { if !w.isCollecting { for ; i < len(p); i++ { b := p[i] if b == '<' { w.startCollecting() break } } } if w.isCollecting { for ; i < len(p); i++ { b := p[i] w.toggleIfQuote(b) if !w.inQuote && b == '>' { w.endCollecting() break } w.buff.WriteByte(b) } if !w.isCollecting { if w.inPreTag != "" { s := w.buff.String() if tagName, isEnd := w.parseEndTag(s); isEnd && w.inPreTag == tagName { w.inPreTag = "" } w.buff.Reset() continue } // First check if we have processed this element before. w.collector.mu.RLock() // See https://github.com/dominikh/go-tools/issues/723 //lint:ignore S1030 This construct avoids memory allocation for the string. seen := w.collector.elementSet[string(w.buff.Bytes())] w.collector.mu.RUnlock() if seen { w.buff.Reset() continue } s := w.buff.String() w.buff.Reset() if strings.HasPrefix(s, "") { continue } key := s s, tagName := w.insertStandinHTMLElement(s) el := parseHTMLElement(s) el.Tag = tagName if w.isPreFormatted(tagName) { w.inPreTag = tagName } w.collector.mu.Lock() w.collector.elementSet[key] = true if el.Tag != "" { w.collector.elements = append(w.collector.elements, el) } w.collector.mu.Unlock() } } } return } // No need to look inside these for HTML elements. func (c *cssClassCollectorWriter) isPreFormatted(s string) bool { return s == "pre" || s == "textarea" || s == "script" } // The net/html parser does not handle single table elements as input, e.g. tbody. // We only care about the element/class/ids, so just store away the original tag name // and pretend it's a