// Copyright 2020 The Hugo Authors. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package publisher import ( "regexp" "github.com/gohugoio/hugo/helpers" "golang.org/x/net/html" "bytes" "sort" "strings" "sync" ) func newHTMLElementsCollector() *htmlElementsCollector { return &htmlElementsCollector{ elementSet: make(map[string]bool), } } func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *cssClassCollectorWriter { return &cssClassCollectorWriter{ collector: collector, } } // HTMLElements holds lists of tags and attribute values for classes and id. type HTMLElements struct { Tags []string `json:"tags"` Classes []string `json:"classes"` IDs []string `json:"ids"` } func (h *HTMLElements) Merge(other HTMLElements) { h.Tags = append(h.Tags, other.Tags...) h.Classes = append(h.Classes, other.Classes...) h.IDs = append(h.IDs, other.IDs...) h.Tags = helpers.UniqueStringsReuse(h.Tags) h.Classes = helpers.UniqueStringsReuse(h.Classes) h.IDs = helpers.UniqueStringsReuse(h.IDs) } func (h *HTMLElements) Sort() { sort.Strings(h.Tags) sort.Strings(h.Classes) sort.Strings(h.IDs) } type cssClassCollectorWriter struct { collector *htmlElementsCollector buff bytes.Buffer isCollecting bool dropValue bool inQuote bool quoteValue byte } func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) { n = len(p) i := 0 for i < len(p) { if !w.isCollecting { for ; i < len(p); i++ { b := p[i] if b == '<' { w.startCollecting() break } } } if w.isCollecting { for ; i < len(p); i++ { b := p[i] w.toggleIfQuote(b) if !w.inQuote && b == '>' { w.endCollecting(false) break } w.buff.WriteByte(b) } if !w.isCollecting { if w.dropValue { w.buff.Reset() } else { // First check if we have processed this element before. w.collector.mu.RLock() // See https://github.com/dominikh/go-tools/issues/723 //lint:ignore S1030 This construct avoids memory allocation for the string. seen := w.collector.elementSet[string(w.buff.Bytes())] w.collector.mu.RUnlock() if seen { w.buff.Reset() continue } s := w.buff.String() w.buff.Reset() if strings.HasPrefix(s, "") { continue } s, tagName := w.insertStandinHTMLElement(s) el := parseHTMLElement(s) el.Tag = tagName w.collector.mu.Lock() w.collector.elementSet[s] = true if el.Tag != "" { w.collector.elements = append(w.collector.elements, el) } w.collector.mu.Unlock() } } } } return } // The net/html parser does not handle single table elemnts as input, e.g. tbody. // We only care about the element/class/ids, so just store away the original tag name // and pretend it's a