mirror of
https://github.com/gohugoio/hugo.git
synced 2024-12-27 18:51:19 +00:00
f518b4f71e
Fixes #8530
443 lines
10 KiB
Go
443 lines
10 KiB
Go
// Copyright 2020 The Hugo Authors. All rights reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package publisher
|
|
|
|
import (
|
|
"bytes"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
|
|
"golang.org/x/net/html"
|
|
|
|
"github.com/gohugoio/hugo/helpers"
|
|
)
|
|
|
|
const eof = -1
|
|
|
|
var (
|
|
htmlJsonFixer = strings.NewReplacer(", ", "\n")
|
|
jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`)
|
|
classAttrRe = regexp.MustCompile(`(?i)^class$|transition`)
|
|
|
|
skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
|
|
skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`)
|
|
endTagRe = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)
|
|
|
|
exceptionList = map[string]bool{
|
|
"thead": true,
|
|
"tbody": true,
|
|
"tfoot": true,
|
|
"td": true,
|
|
"tr": true,
|
|
}
|
|
)
|
|
|
|
func newHTMLElementsCollector() *htmlElementsCollector {
|
|
return &htmlElementsCollector{
|
|
elementSet: make(map[string]bool),
|
|
}
|
|
}
|
|
|
|
func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
|
|
w := &htmlElementsCollectorWriter{
|
|
collector: collector,
|
|
state: htmlLexStart,
|
|
}
|
|
|
|
w.defaultLexElementInside = w.lexElementInside(htmlLexStart)
|
|
|
|
return w
|
|
}
|
|
|
|
// HTMLElements holds lists of tags and attribute values for classes and id.
|
|
type HTMLElements struct {
|
|
Tags []string `json:"tags"`
|
|
Classes []string `json:"classes"`
|
|
IDs []string `json:"ids"`
|
|
}
|
|
|
|
func (h *HTMLElements) Merge(other HTMLElements) {
|
|
h.Tags = append(h.Tags, other.Tags...)
|
|
h.Classes = append(h.Classes, other.Classes...)
|
|
h.IDs = append(h.IDs, other.IDs...)
|
|
|
|
h.Tags = helpers.UniqueStringsReuse(h.Tags)
|
|
h.Classes = helpers.UniqueStringsReuse(h.Classes)
|
|
h.IDs = helpers.UniqueStringsReuse(h.IDs)
|
|
}
|
|
|
|
func (h *HTMLElements) Sort() {
|
|
sort.Strings(h.Tags)
|
|
sort.Strings(h.Classes)
|
|
sort.Strings(h.IDs)
|
|
}
|
|
|
|
type htmlElement struct {
|
|
Tag string
|
|
Classes []string
|
|
IDs []string
|
|
}
|
|
|
|
type htmlElementsCollector struct {
|
|
// Contains the raw HTML string. We will get the same element
|
|
// several times, and want to avoid costly reparsing when this
|
|
// is used for aggregated data only.
|
|
elementSet map[string]bool
|
|
|
|
elements []htmlElement
|
|
|
|
mu sync.RWMutex
|
|
}
|
|
|
|
func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
|
|
var (
|
|
classes []string
|
|
ids []string
|
|
tags []string
|
|
)
|
|
|
|
for _, el := range c.elements {
|
|
classes = append(classes, el.Classes...)
|
|
ids = append(ids, el.IDs...)
|
|
tags = append(tags, el.Tag)
|
|
}
|
|
|
|
classes = helpers.UniqueStringsSorted(classes)
|
|
ids = helpers.UniqueStringsSorted(ids)
|
|
tags = helpers.UniqueStringsSorted(tags)
|
|
|
|
els := HTMLElements{
|
|
Classes: classes,
|
|
IDs: ids,
|
|
Tags: tags,
|
|
}
|
|
|
|
return els
|
|
}
|
|
|
|
type htmlElementsCollectorWriter struct {
|
|
collector *htmlElementsCollector
|
|
|
|
r rune // Current rune
|
|
width int // The width in bytes of r
|
|
input []byte // The current slice written to Write
|
|
pos int // The current position in input
|
|
|
|
err error
|
|
|
|
inQuote rune
|
|
|
|
buff bytes.Buffer
|
|
|
|
// Current state
|
|
state htmlCollectorStateFunc
|
|
|
|
// Precompiled state funcs
|
|
defaultLexElementInside htmlCollectorStateFunc
|
|
}
|
|
|
|
// Write collects HTML elements from p, which must contain complete runes.
|
|
func (w *htmlElementsCollectorWriter) Write(p []byte) (int, error) {
|
|
if p == nil {
|
|
return 0, nil
|
|
}
|
|
|
|
w.input = p
|
|
|
|
for {
|
|
w.r = w.next()
|
|
if w.r == eof || w.r == utf8.RuneError {
|
|
break
|
|
}
|
|
w.state = w.state(w)
|
|
}
|
|
|
|
w.pos = 0
|
|
w.input = nil
|
|
|
|
return len(p), nil
|
|
}
|
|
|
|
func (l *htmlElementsCollectorWriter) backup() {
|
|
l.pos -= l.width
|
|
l.r, _ = utf8.DecodeRune(l.input[l.pos:])
|
|
}
|
|
|
|
func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
|
|
var s htmlCollectorStateFunc
|
|
s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
|
|
w.buff.WriteRune(w.r)
|
|
if condition() {
|
|
w.buff.Reset()
|
|
return resolve
|
|
}
|
|
return s
|
|
}
|
|
return s
|
|
}
|
|
|
|
func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
|
|
var s htmlCollectorStateFunc
|
|
s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
|
|
if condition(w.r) {
|
|
return resolve
|
|
}
|
|
return s
|
|
}
|
|
return s
|
|
}
|
|
|
|
// Starts with e.g. "<body " or "<div"
|
|
func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
|
|
var s htmlCollectorStateFunc
|
|
s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
|
|
w.buff.WriteRune(w.r)
|
|
|
|
// Skip any text inside a quote.
|
|
if w.r == '\'' || w.r == '"' {
|
|
if w.inQuote == w.r {
|
|
w.inQuote = 0
|
|
} else if w.inQuote == 0 {
|
|
w.inQuote = w.r
|
|
}
|
|
}
|
|
|
|
if w.inQuote != 0 {
|
|
return s
|
|
}
|
|
|
|
if w.r == '>' {
|
|
|
|
// Work with the bytes slice as long as it's practical,
|
|
// to save memory allocations.
|
|
b := w.buff.Bytes()
|
|
|
|
defer func() {
|
|
w.buff.Reset()
|
|
}()
|
|
|
|
// First check if we have processed this element before.
|
|
w.collector.mu.RLock()
|
|
|
|
seen := w.collector.elementSet[string(b)]
|
|
w.collector.mu.RUnlock()
|
|
if seen {
|
|
return resolve
|
|
}
|
|
|
|
s := w.buff.String()
|
|
|
|
if s == "" {
|
|
return resolve
|
|
}
|
|
|
|
// Parse each collected element.
|
|
el, err := parseHTMLElement(s)
|
|
if err != nil {
|
|
w.err = err
|
|
return resolve
|
|
}
|
|
|
|
// Write this tag to the element set.
|
|
w.collector.mu.Lock()
|
|
w.collector.elementSet[s] = true
|
|
w.collector.elements = append(w.collector.elements, el)
|
|
w.collector.mu.Unlock()
|
|
|
|
return resolve
|
|
|
|
}
|
|
|
|
return s
|
|
}
|
|
|
|
return s
|
|
}
|
|
|
|
func (l *htmlElementsCollectorWriter) next() rune {
|
|
if l.pos >= len(l.input) {
|
|
l.width = 0
|
|
return eof
|
|
}
|
|
|
|
runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
|
|
|
|
l.width = runeWidth
|
|
l.pos += l.width
|
|
return runeValue
|
|
}
|
|
|
|
// returns the next state in HTML element scanner.
|
|
type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc
|
|
|
|
// At "<", buffer empty.
|
|
// Potentially starting a HTML element.
|
|
func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
|
|
if w.r == '>' || unicode.IsSpace(w.r) {
|
|
if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) {
|
|
w.buff.Reset()
|
|
return htmlLexStart
|
|
}
|
|
|
|
tagName := w.buff.Bytes()[1:]
|
|
|
|
switch {
|
|
case skipInnerElementRe.Match(tagName):
|
|
// pre, script etc. We collect classes etc. on the surrounding
|
|
// element, but skip the inner content.
|
|
w.backup()
|
|
|
|
// tagName will be overwritten, so make a copy.
|
|
tagNameCopy := make([]byte, len(tagName))
|
|
copy(tagNameCopy, tagName)
|
|
|
|
return w.lexElementInside(
|
|
w.consumeBuffUntil(
|
|
func() bool {
|
|
if w.r != '>' {
|
|
return false
|
|
}
|
|
m := endTagRe.FindSubmatch(w.buff.Bytes())
|
|
if m == nil {
|
|
return false
|
|
}
|
|
return bytes.EqualFold(m[1], tagNameCopy)
|
|
},
|
|
htmlLexStart,
|
|
))
|
|
case skipAllElementRe.Match(tagName):
|
|
// E.g. "<!DOCTYPE ..."
|
|
w.buff.Reset()
|
|
return w.consumeRuneUntil(func(r rune) bool {
|
|
return r == '>'
|
|
}, htmlLexStart)
|
|
default:
|
|
w.backup()
|
|
return w.defaultLexElementInside
|
|
}
|
|
}
|
|
|
|
w.buff.WriteRune(w.r)
|
|
|
|
// If it's a comment, skip to its end.
|
|
if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) {
|
|
w.buff.Reset()
|
|
return htmlLexToEndOfComment
|
|
}
|
|
|
|
return htmlLexElementStart
|
|
}
|
|
|
|
// Entry state func.
|
|
// Looks for a opening bracket, '<'.
|
|
func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
|
|
if w.r == '<' {
|
|
w.backup()
|
|
w.buff.Reset()
|
|
return htmlLexElementStart
|
|
}
|
|
|
|
return htmlLexStart
|
|
}
|
|
|
|
// After "<!--", buff empty.
|
|
func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
|
|
w.buff.WriteRune(w.r)
|
|
|
|
if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) {
|
|
// Done, start looking for HTML elements again.
|
|
return htmlLexStart
|
|
}
|
|
|
|
return htmlLexToEndOfComment
|
|
}
|
|
|
|
func parseHTMLElement(elStr string) (el htmlElement, err error) {
|
|
|
|
tagName := parseStartTag(elStr)
|
|
|
|
el.Tag = strings.ToLower(tagName)
|
|
tagNameToParse := el.Tag
|
|
|
|
// The net/html parser does not handle single table elements as input, e.g. tbody.
|
|
// We only care about the element/class/ids, so just store away the original tag name
|
|
// and pretend it's a <div>.
|
|
if exceptionList[el.Tag] {
|
|
elStr = strings.Replace(elStr, tagName, "div", 1)
|
|
tagNameToParse = "div"
|
|
}
|
|
|
|
n, err := html.Parse(strings.NewReader(elStr))
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
var walk func(*html.Node)
|
|
walk = func(n *html.Node) {
|
|
if n.Type == html.ElementNode && n.Data == tagNameToParse {
|
|
for _, a := range n.Attr {
|
|
switch {
|
|
case strings.EqualFold(a.Key, "id"):
|
|
// There should be only one, but one never knows...
|
|
el.IDs = append(el.IDs, a.Val)
|
|
default:
|
|
if classAttrRe.MatchString(a.Key) {
|
|
el.Classes = append(el.Classes, strings.Fields(a.Val)...)
|
|
} else {
|
|
key := strings.ToLower(a.Key)
|
|
val := strings.TrimSpace(a.Val)
|
|
if strings.Contains(key, "class") && strings.HasPrefix(val, "{") {
|
|
// This looks like a Vue or AlpineJS class binding.
|
|
val = htmlJsonFixer.Replace(strings.Trim(val, "{}"))
|
|
lines := strings.Split(val, "\n")
|
|
for i, l := range lines {
|
|
lines[i] = strings.TrimSpace(l)
|
|
}
|
|
val = strings.Join(lines, "\n")
|
|
val = jsonAttrRe.ReplaceAllString(val, "$1")
|
|
el.Classes = append(el.Classes, strings.Fields(val)...)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
walk(c)
|
|
}
|
|
}
|
|
|
|
walk(n)
|
|
|
|
return
|
|
}
|
|
|
|
// Variants of s
|
|
// <body class="b a">
|
|
// <div>
|
|
func parseStartTag(s string) string {
|
|
spaceIndex := strings.IndexFunc(s, func(r rune) bool {
|
|
return unicode.IsSpace(r)
|
|
})
|
|
|
|
if spaceIndex == -1 {
|
|
return s[1 : len(s)-1]
|
|
}
|
|
|
|
return s[1:spaceIndex]
|
|
}
|