hugo/tpl/internal/go_templates/htmltemplate/transition.go
Bjørn Erik Pedersen 2168c5b125 Upgrade to Go 1.23
Fixes #12763
2024-08-15 10:18:19 +02:00

686 lines
18 KiB
Go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package template
import (
"bytes"
"strings"
)
// transitionFunc is the array of context transition functions for text nodes.
// A transition function takes a context and template text input, and returns
// the updated context and the number of bytes consumed from the front of the
// input.
var transitionFunc = [...]func(context, []byte) (context, int){
stateText: tText,
stateTag: tTag,
stateAttrName: tAttrName,
stateAfterName: tAfterName,
stateBeforeValue: tBeforeValue,
stateHTMLCmt: tHTMLCmt,
stateRCDATA: tSpecialTagEnd,
stateAttr: tAttr,
stateURL: tURL,
stateSrcset: tURL,
stateJS: tJS,
stateJSDqStr: tJSDelimited,
stateJSSqStr: tJSDelimited,
stateJSRegexp: tJSDelimited,
stateJSTmplLit: tJSTmpl,
stateJSBlockCmt: tBlockCmt,
stateJSLineCmt: tLineCmt,
stateJSHTMLOpenCmt: tLineCmt,
stateJSHTMLCloseCmt: tLineCmt,
stateCSS: tCSS,
stateCSSDqStr: tCSSStr,
stateCSSSqStr: tCSSStr,
stateCSSDqURL: tCSSStr,
stateCSSSqURL: tCSSStr,
stateCSSURL: tCSSStr,
stateCSSBlockCmt: tBlockCmt,
stateCSSLineCmt: tLineCmt,
stateError: tError,
}
var commentStart = []byte("<!--")
var commentEnd = []byte("-->")
// tText is the context transition function for the text state.
func tText(c context, s []byte) (context, int) {
k := 0
for {
i := k + bytes.IndexByte(s[k:], '<')
if i < k || i+1 == len(s) {
return c, len(s)
} else if i+4 <= len(s) && bytes.Equal(commentStart, s[i:i+4]) {
return context{state: stateHTMLCmt}, i + 4
}
i++
end := false
if s[i] == '/' {
if i+1 == len(s) {
return c, len(s)
}
end, i = true, i+1
}
j, e := eatTagName(s, i)
if j != i {
if end {
e = elementNone
}
// We've found an HTML tag.
return context{state: stateTag, element: e}, j
}
k = j
}
}
var elementContentType = [...]state{
elementNone: stateText,
elementScript: stateJS,
elementStyle: stateCSS,
elementTextarea: stateRCDATA,
elementTitle: stateRCDATA,
}
// tTag is the context transition function for the tag state.
func tTag(c context, s []byte) (context, int) {
// Find the attribute name.
i := eatWhiteSpace(s, 0)
if i == len(s) {
return c, len(s)
}
if s[i] == '>' {
return context{
state: elementContentType[c.element],
element: c.element,
}, i + 1
}
j, err := eatAttrName(s, i)
if err != nil {
return context{state: stateError, err: err}, len(s)
}
state, attr := stateTag, attrNone
if i == j {
return context{
state: stateError,
err: errorf(ErrBadHTML, nil, 0, "expected space, attr name, or end of tag, but got %q", s[i:]),
}, len(s)
}
attrName := strings.ToLower(string(s[i:j]))
if c.element == elementScript && attrName == "type" {
attr = attrScriptType
} else {
switch attrType(attrName) {
case contentTypeURL:
attr = attrURL
case contentTypeCSS:
attr = attrStyle
case contentTypeJS:
attr = attrScript
case contentTypeSrcset:
attr = attrSrcset
}
}
if j == len(s) {
state = stateAttrName
} else {
state = stateAfterName
}
return context{state: state, element: c.element, attr: attr}, j
}
// tAttrName is the context transition function for stateAttrName.
func tAttrName(c context, s []byte) (context, int) {
i, err := eatAttrName(s, 0)
if err != nil {
return context{state: stateError, err: err}, len(s)
} else if i != len(s) {
c.state = stateAfterName
}
return c, i
}
// tAfterName is the context transition function for stateAfterName.
func tAfterName(c context, s []byte) (context, int) {
// Look for the start of the value.
i := eatWhiteSpace(s, 0)
if i == len(s) {
return c, len(s)
} else if s[i] != '=' {
// Occurs due to tag ending '>', and valueless attribute.
c.state = stateTag
return c, i
}
c.state = stateBeforeValue
// Consume the "=".
return c, i + 1
}
var attrStartStates = [...]state{
attrNone: stateAttr,
attrScript: stateJS,
attrScriptType: stateAttr,
attrStyle: stateCSS,
attrURL: stateURL,
attrSrcset: stateSrcset,
}
// tBeforeValue is the context transition function for stateBeforeValue.
func tBeforeValue(c context, s []byte) (context, int) {
i := eatWhiteSpace(s, 0)
if i == len(s) {
return c, len(s)
}
// Find the attribute delimiter.
delim := delimSpaceOrTagEnd
switch s[i] {
case '\'':
delim, i = delimSingleQuote, i+1
case '"':
delim, i = delimDoubleQuote, i+1
}
c.state, c.delim = attrStartStates[c.attr], delim
return c, i
}
// tHTMLCmt is the context transition function for stateHTMLCmt.
func tHTMLCmt(c context, s []byte) (context, int) {
if i := bytes.Index(s, commentEnd); i != -1 {
return context{}, i + 3
}
return c, len(s)
}
// specialTagEndMarkers maps element types to the character sequence that
// case-insensitively signals the end of the special tag body.
var specialTagEndMarkers = [...][]byte{
elementScript: []byte("script"),
elementStyle: []byte("style"),
elementTextarea: []byte("textarea"),
elementTitle: []byte("title"),
}
var (
specialTagEndPrefix = []byte("</")
tagEndSeparators = []byte("> \t\n\f/")
)
// tSpecialTagEnd is the context transition function for raw text and RCDATA
// element states.
func tSpecialTagEnd(c context, s []byte) (context, int) {
if c.element != elementNone {
// script end tags ("</script") within script literals are ignored, so that
// we can properly escape them.
if c.element == elementScript && (isInScriptLiteral(c.state) || isComment(c.state)) {
return c, len(s)
}
if i := indexTagEnd(s, specialTagEndMarkers[c.element]); i != -1 {
return context{}, i
}
}
return c, len(s)
}
// indexTagEnd finds the index of a special tag end in a case insensitive way, or returns -1
func indexTagEnd(s []byte, tag []byte) int {
res := 0
plen := len(specialTagEndPrefix)
for len(s) > 0 {
// Try to find the tag end prefix first
i := bytes.Index(s, specialTagEndPrefix)
if i == -1 {
return i
}
s = s[i+plen:]
// Try to match the actual tag if there is still space for it
if len(tag) <= len(s) && bytes.EqualFold(tag, s[:len(tag)]) {
s = s[len(tag):]
// Check the tag is followed by a proper separator
if len(s) > 0 && bytes.IndexByte(tagEndSeparators, s[0]) != -1 {
return res + i
}
res += len(tag)
}
res += i + plen
}
return -1
}
// tAttr is the context transition function for the attribute state.
func tAttr(c context, s []byte) (context, int) {
return c, len(s)
}
// tURL is the context transition function for the URL state.
func tURL(c context, s []byte) (context, int) {
if bytes.ContainsAny(s, "#?") {
c.urlPart = urlPartQueryOrFrag
} else if len(s) != eatWhiteSpace(s, 0) && c.urlPart == urlPartNone {
// HTML5 uses "Valid URL potentially surrounded by spaces" for
// attrs: https://www.w3.org/TR/html5/index.html#attributes-1
c.urlPart = urlPartPreQuery
}
return c, len(s)
}
// tJS is the context transition function for the JS state.
func tJS(c context, s []byte) (context, int) {
i := bytes.IndexAny(s, "\"`'/{}<-#")
if i == -1 {
// Entire input is non string, comment, regexp tokens.
c.jsCtx = nextJSCtx(s, c.jsCtx)
return c, len(s)
}
c.jsCtx = nextJSCtx(s[:i], c.jsCtx)
switch s[i] {
case '"':
c.state, c.jsCtx = stateJSDqStr, jsCtxRegexp
case '\'':
c.state, c.jsCtx = stateJSSqStr, jsCtxRegexp
case '`':
c.state, c.jsCtx = stateJSTmplLit, jsCtxRegexp
case '/':
switch {
case i+1 < len(s) && s[i+1] == '/':
c.state, i = stateJSLineCmt, i+1
case i+1 < len(s) && s[i+1] == '*':
c.state, i = stateJSBlockCmt, i+1
case c.jsCtx == jsCtxRegexp:
c.state = stateJSRegexp
case c.jsCtx == jsCtxDivOp:
c.jsCtx = jsCtxRegexp
default:
return context{
state: stateError,
err: errorf(ErrSlashAmbig, nil, 0, "'/' could start a division or regexp: %.32q", s[i:]),
}, len(s)
}
// ECMAScript supports HTML style comments for legacy reasons, see Appendix
// B.1.1 "HTML-like Comments". The handling of these comments is somewhat
// confusing. Multi-line comments are not supported, i.e. anything on lines
// between the opening and closing tokens is not considered a comment, but
// anything following the opening or closing token, on the same line, is
// ignored. As such we simply treat any line prefixed with "<!--" or "-->"
// as if it were actually prefixed with "//" and move on.
case '<':
if i+3 < len(s) && bytes.Equal(commentStart, s[i:i+4]) {
c.state, i = stateJSHTMLOpenCmt, i+3
}
case '-':
if i+2 < len(s) && bytes.Equal(commentEnd, s[i:i+3]) {
c.state, i = stateJSHTMLCloseCmt, i+2
}
// ECMAScript also supports "hashbang" comment lines, see Section 12.5.
case '#':
if i+1 < len(s) && s[i+1] == '!' {
c.state, i = stateJSLineCmt, i+1
}
case '{':
// We only care about tracking brace depth if we are inside of a
// template literal.
if len(c.jsBraceDepth) == 0 {
return c, i + 1
}
c.jsBraceDepth[len(c.jsBraceDepth)-1]++
case '}':
if len(c.jsBraceDepth) == 0 {
return c, i + 1
}
// There are no cases where a brace can be escaped in the JS context
// that are not syntax errors, it seems. Because of this we can just
// count "\}" as "}" and move on, the script is already broken as
// fully fledged parsers will just fail anyway.
c.jsBraceDepth[len(c.jsBraceDepth)-1]--
if c.jsBraceDepth[len(c.jsBraceDepth)-1] >= 0 {
return c, i + 1
}
c.jsBraceDepth = c.jsBraceDepth[:len(c.jsBraceDepth)-1]
c.state = stateJSTmplLit
default:
panic("unreachable")
}
return c, i + 1
}
func tJSTmpl(c context, s []byte) (context, int) {
var k int
for {
i := k + bytes.IndexAny(s[k:], "`\\$")
if i < k {
break
}
switch s[i] {
case '\\':
i++
if i == len(s) {
return context{
state: stateError,
err: errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in JS string: %q", s),
}, len(s)
}
case '$':
if len(s) >= i+2 && s[i+1] == '{' {
c.jsBraceDepth = append(c.jsBraceDepth, 0)
c.state = stateJS
return c, i + 2
}
case '`':
// end
c.state = stateJS
return c, i + 1
}
k = i + 1
}
return c, len(s)
}
// tJSDelimited is the context transition function for the JS string and regexp
// states.
func tJSDelimited(c context, s []byte) (context, int) {
specials := `\"`
switch c.state {
case stateJSSqStr:
specials = `\'`
case stateJSRegexp:
specials = `\/[]`
}
k, inCharset := 0, false
for {
i := k + bytes.IndexAny(s[k:], specials)
if i < k {
break
}
switch s[i] {
case '\\':
i++
if i == len(s) {
return context{
state: stateError,
err: errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in JS string: %q", s),
}, len(s)
}
case '[':
inCharset = true
case ']':
inCharset = false
case '/':
// If "</script" appears in a regex literal, the '/' should not
// close the regex literal, and it will later be escaped to
// "\x3C/script" in escapeText.
if i > 0 && i+7 <= len(s) && bytes.Equal(bytes.ToLower(s[i-1:i+7]), []byte("</script")) {
i++
} else if !inCharset {
c.state, c.jsCtx = stateJS, jsCtxDivOp
return c, i + 1
}
default:
// end delimiter
if !inCharset {
c.state, c.jsCtx = stateJS, jsCtxDivOp
return c, i + 1
}
}
k = i + 1
}
if inCharset {
// This can be fixed by making context richer if interpolation
// into charsets is desired.
return context{
state: stateError,
err: errorf(ErrPartialCharset, nil, 0, "unfinished JS regexp charset: %q", s),
}, len(s)
}
return c, len(s)
}
var blockCommentEnd = []byte("*/")
// tBlockCmt is the context transition function for /*comment*/ states.
func tBlockCmt(c context, s []byte) (context, int) {
i := bytes.Index(s, blockCommentEnd)
if i == -1 {
return c, len(s)
}
switch c.state {
case stateJSBlockCmt:
c.state = stateJS
case stateCSSBlockCmt:
c.state = stateCSS
default:
panic(c.state.String())
}
return c, i + 2
}
// tLineCmt is the context transition function for //comment states, and the JS HTML-like comment state.
func tLineCmt(c context, s []byte) (context, int) {
var lineTerminators string
var endState state
switch c.state {
case stateJSLineCmt, stateJSHTMLOpenCmt, stateJSHTMLCloseCmt:
lineTerminators, endState = "\n\r\u2028\u2029", stateJS
case stateCSSLineCmt:
lineTerminators, endState = "\n\f\r", stateCSS
// Line comments are not part of any published CSS standard but
// are supported by the 4 major browsers.
// This defines line comments as
// LINECOMMENT ::= "//" [^\n\f\d]*
// since https://www.w3.org/TR/css3-syntax/#SUBTOK-nl defines
// newlines:
// nl ::= #xA | #xD #xA | #xD | #xC
default:
panic(c.state.String())
}
i := bytes.IndexAny(s, lineTerminators)
if i == -1 {
return c, len(s)
}
c.state = endState
// Per section 7.4 of EcmaScript 5 : https://es5.github.io/#x7.4
// "However, the LineTerminator at the end of the line is not
// considered to be part of the single-line comment; it is
// recognized separately by the lexical grammar and becomes part
// of the stream of input elements for the syntactic grammar."
return c, i
}
// tCSS is the context transition function for the CSS state.
func tCSS(c context, s []byte) (context, int) {
// CSS quoted strings are almost never used except for:
// (1) URLs as in background: "/foo.png"
// (2) Multiword font-names as in font-family: "Times New Roman"
// (3) List separators in content values as in inline-lists:
// <style>
// ul.inlineList { list-style: none; padding:0 }
// ul.inlineList > li { display: inline }
// ul.inlineList > li:before { content: ", " }
// ul.inlineList > li:first-child:before { content: "" }
// </style>
// <ul class=inlineList><li>One<li>Two<li>Three</ul>
// (4) Attribute value selectors as in a[href="http://example.com/"]
//
// We conservatively treat all strings as URLs, but make some
// allowances to avoid confusion.
//
// In (1), our conservative assumption is justified.
// In (2), valid font names do not contain ':', '?', or '#', so our
// conservative assumption is fine since we will never transition past
// urlPartPreQuery.
// In (3), our protocol heuristic should not be tripped, and there
// should not be non-space content after a '?' or '#', so as long as
// we only %-encode RFC 3986 reserved characters we are ok.
// In (4), we should URL escape for URL attributes, and for others we
// have the attribute name available if our conservative assumption
// proves problematic for real code.
k := 0
for {
i := k + bytes.IndexAny(s[k:], `("'/`)
if i < k {
return c, len(s)
}
switch s[i] {
case '(':
// Look for url to the left.
p := bytes.TrimRight(s[:i], "\t\n\f\r ")
if endsWithCSSKeyword(p, "url") {
j := len(s) - len(bytes.TrimLeft(s[i+1:], "\t\n\f\r "))
switch {
case j != len(s) && s[j] == '"':
c.state, j = stateCSSDqURL, j+1
case j != len(s) && s[j] == '\'':
c.state, j = stateCSSSqURL, j+1
default:
c.state = stateCSSURL
}
return c, j
}
case '/':
if i+1 < len(s) {
switch s[i+1] {
case '/':
c.state = stateCSSLineCmt
return c, i + 2
case '*':
c.state = stateCSSBlockCmt
return c, i + 2
}
}
case '"':
c.state = stateCSSDqStr
return c, i + 1
case '\'':
c.state = stateCSSSqStr
return c, i + 1
}
k = i + 1
}
}
// tCSSStr is the context transition function for the CSS string and URL states.
func tCSSStr(c context, s []byte) (context, int) {
var endAndEsc string
switch c.state {
case stateCSSDqStr, stateCSSDqURL:
endAndEsc = `\"`
case stateCSSSqStr, stateCSSSqURL:
endAndEsc = `\'`
case stateCSSURL:
// Unquoted URLs end with a newline or close parenthesis.
// The below includes the wc (whitespace character) and nl.
endAndEsc = "\\\t\n\f\r )"
default:
panic(c.state.String())
}
k := 0
for {
i := k + bytes.IndexAny(s[k:], endAndEsc)
if i < k {
c, nread := tURL(c, decodeCSS(s[k:]))
return c, k + nread
}
if s[i] == '\\' {
i++
if i == len(s) {
return context{
state: stateError,
err: errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in CSS string: %q", s),
}, len(s)
}
} else {
c.state = stateCSS
return c, i + 1
}
c, _ = tURL(c, decodeCSS(s[:i+1]))
k = i + 1
}
}
// tError is the context transition function for the error state.
func tError(c context, s []byte) (context, int) {
return c, len(s)
}
// eatAttrName returns the largest j such that s[i:j] is an attribute name.
// It returns an error if s[i:] does not look like it begins with an
// attribute name, such as encountering a quote mark without a preceding
// equals sign.
func eatAttrName(s []byte, i int) (int, *Error) {
for j := i; j < len(s); j++ {
switch s[j] {
case ' ', '\t', '\n', '\f', '\r', '=', '>':
return j, nil
case '\'', '"', '<':
// These result in a parse warning in HTML5 and are
// indicative of serious problems if seen in an attr
// name in a template.
return -1, errorf(ErrBadHTML, nil, 0, "%q in attribute name: %.32q", s[j:j+1], s)
default:
// No-op.
}
}
return len(s), nil
}
var elementNameMap = map[string]element{
"script": elementScript,
"style": elementStyle,
"textarea": elementTextarea,
"title": elementTitle,
}
// asciiAlpha reports whether c is an ASCII letter.
func asciiAlpha(c byte) bool {
return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
}
// asciiAlphaNum reports whether c is an ASCII letter or digit.
func asciiAlphaNum(c byte) bool {
return asciiAlpha(c) || '0' <= c && c <= '9'
}
// eatTagName returns the largest j such that s[i:j] is a tag name and the tag type.
func eatTagName(s []byte, i int) (int, element) {
if i == len(s) || !asciiAlpha(s[i]) {
return i, elementNone
}
j := i + 1
for j < len(s) {
x := s[j]
if asciiAlphaNum(x) {
j++
continue
}
// Allow "x-y" or "x:y" but not "x-", "-y", or "x--y".
if (x == ':' || x == '-') && j+1 < len(s) && asciiAlphaNum(s[j+1]) {
j += 2
continue
}
break
}
return j, elementNameMap[strings.ToLower(string(s[i:j]))]
}
// eatWhiteSpace returns the largest j such that s[i:j] is white space.
func eatWhiteSpace(s []byte, i int) int {
for j := i; j < len(s); j++ {
switch s[j] {
case ' ', '\t', '\n', '\f', '\r':
// No-op.
default:
return j
}
}
return len(s)
}