mirror of
https://github.com/gohugoio/hugo.git
synced 2025-01-08 20:12:01 +00:00
8e5044d7f5
This issue was introduced in `v0.102.0`.
In 223bf28004
we removed the byte source from the parsed page result, which
meant we had to preserve exact positioning for all elements. This introduced some new `TypeIgnore` tokens
whose position in the result slice we wrongly assumed didn't matter (they should be ignored anyway).
But it seems that this broke the logic where we determine if it's positional or named params in the case
where the parameter value contains escaped quotes.
This commit makes sure that these ignore tokens (the back slashes) are never sent back to the client, which is how it was before `v0.102.0`.
This commit also fixes some lost error information in that same commit.
Fixes #10236
584 lines
12 KiB
Go
584 lines
12 KiB
Go
// Copyright 2018 The Hugo Authors. All rights reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package pageparser
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
const eof = -1
|
|
|
|
// returns the next state in scanner.
|
|
type stateFunc func(*pageLexer) stateFunc
|
|
|
|
type pageLexer struct {
|
|
input []byte
|
|
stateStart stateFunc
|
|
state stateFunc
|
|
pos int // input position
|
|
start int // item start position
|
|
width int // width of last element
|
|
|
|
// Contains lexers for shortcodes and other main section
|
|
// elements.
|
|
sectionHandlers *sectionHandlers
|
|
|
|
cfg Config
|
|
|
|
// The summary divider to look for.
|
|
summaryDivider []byte
|
|
// Set when we have parsed any summary divider
|
|
summaryDividerChecked bool
|
|
// Whether we're in a HTML comment.
|
|
isInHTMLComment bool
|
|
|
|
lexerShortcodeState
|
|
|
|
// items delivered to client
|
|
items Items
|
|
}
|
|
|
|
// Implement the Result interface
|
|
func (l *pageLexer) Iterator() *Iterator {
|
|
return NewIterator(l.items)
|
|
}
|
|
|
|
func (l *pageLexer) Input() []byte {
|
|
return l.input
|
|
}
|
|
|
|
// Config carries the options that influence how a page is lexed.
type Config struct {
	// EnableEmoji registers the emoji section handler, so that
	// ":name:" sequences in the main section are lexed as emoji items.
	EnableEmoji bool
}
|
|
|
|
// note: the input position here is normally 0 (start), but
|
|
// can be set if position of first shortcode is known
|
|
func newPageLexer(input []byte, stateStart stateFunc, cfg Config) *pageLexer {
|
|
lexer := &pageLexer{
|
|
input: input,
|
|
stateStart: stateStart,
|
|
cfg: cfg,
|
|
lexerShortcodeState: lexerShortcodeState{
|
|
currLeftDelimItem: tLeftDelimScNoMarkup,
|
|
currRightDelimItem: tRightDelimScNoMarkup,
|
|
openShortcodes: make(map[string]bool),
|
|
},
|
|
items: make([]Item, 0, 5),
|
|
}
|
|
|
|
lexer.sectionHandlers = createSectionHandlers(lexer)
|
|
|
|
return lexer
|
|
}
|
|
|
|
// main loop
|
|
func (l *pageLexer) run() *pageLexer {
|
|
for l.state = l.stateStart; l.state != nil; {
|
|
l.state = l.state(l)
|
|
}
|
|
return l
|
|
}
|
|
|
|
// Page syntax
var (
	// byteOrderMark is the Unicode byte order mark rune.
	byteOrderMark = '\ufeff'

	// Summary divider markers.
	summaryDivider    = []byte("<!--more-->")
	summaryDividerOrg = []byte("# more")

	// Delimiter markers for the TOML, YAML and Org formats.
	delimTOML = []byte("+++")
	delimYAML = []byte("---")
	delimOrg  = []byte("#+")

	// HTML comment open and close markers.
	htmlCommentStart = []byte("<!--")
	htmlCommentEnd   = []byte("-->")

	// emojiDelim is the byte that both opens and closes an emoji code.
	emojiDelim = byte(':')
)
|
|
|
|
func (l *pageLexer) next() rune {
|
|
if l.pos >= len(l.input) {
|
|
l.width = 0
|
|
return eof
|
|
}
|
|
|
|
runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
|
|
l.width = runeWidth
|
|
l.pos += l.width
|
|
|
|
return runeValue
|
|
}
|
|
|
|
// peek, but no consume
|
|
func (l *pageLexer) peek() rune {
|
|
r := l.next()
|
|
l.backup()
|
|
return r
|
|
}
|
|
|
|
// steps back one
|
|
func (l *pageLexer) backup() {
|
|
l.pos -= l.width
|
|
}
|
|
|
|
func (l *pageLexer) append(item Item) {
|
|
if item.Pos() < len(l.input) {
|
|
item.firstByte = l.input[item.Pos()]
|
|
}
|
|
l.items = append(l.items, item)
|
|
}
|
|
|
|
// sends an item back to the client.
|
|
func (l *pageLexer) emit(t ItemType) {
|
|
defer func() {
|
|
l.start = l.pos
|
|
}()
|
|
|
|
if t == tText {
|
|
// Identify any trailing whitespace/intendation.
|
|
// We currently only care about the last one.
|
|
for i := l.pos - 1; i >= l.start; i-- {
|
|
b := l.input[i]
|
|
if b != ' ' && b != '\t' && b != '\r' && b != '\n' {
|
|
break
|
|
}
|
|
if i == l.start && b != '\n' {
|
|
l.append(Item{Type: tIndentation, low: l.start, high: l.pos})
|
|
return
|
|
} else if b == '\n' && i < l.pos-1 {
|
|
l.append(Item{Type: t, low: l.start, high: i + 1})
|
|
l.append(Item{Type: tIndentation, low: i + 1, high: l.pos})
|
|
return
|
|
} else if b == '\n' && i == l.pos-1 {
|
|
break
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
l.append(Item{Type: t, low: l.start, high: l.pos})
|
|
|
|
}
|
|
|
|
// sends a string item back to the client.
|
|
func (l *pageLexer) emitString(t ItemType) {
|
|
l.append(Item{Type: t, low: l.start, high: l.pos, isString: true})
|
|
l.start = l.pos
|
|
}
|
|
|
|
func (l *pageLexer) isEOF() bool {
|
|
return l.pos >= len(l.input)
|
|
}
|
|
|
|
// special case, do not send '\\' back to client
|
|
func (l *pageLexer) ignoreEscapesAndEmit(t ItemType, isString bool) {
|
|
i := l.start
|
|
k := i
|
|
|
|
var segments []lowHigh
|
|
|
|
for i < l.pos {
|
|
r, w := utf8.DecodeRune(l.input[i:l.pos])
|
|
if r == '\\' {
|
|
if i > k {
|
|
segments = append(segments, lowHigh{k, i})
|
|
}
|
|
// See issue #10236.
|
|
// We don't send the backslash back to the client,
|
|
// which makes the end parsing simpler.
|
|
// This means that we cannot render the AST back to be
|
|
// exactly the same as the input,
|
|
// but that was also the situation before we introduced the issue in #10236.
|
|
k = i + w
|
|
}
|
|
i += w
|
|
}
|
|
|
|
if k < l.pos {
|
|
segments = append(segments, lowHigh{k, l.pos})
|
|
}
|
|
|
|
if len(segments) > 0 {
|
|
l.append(Item{Type: t, segments: segments})
|
|
}
|
|
|
|
l.start = l.pos
|
|
|
|
}
|
|
|
|
// gets the current value (for debugging and error handling)
|
|
func (l *pageLexer) current() []byte {
|
|
return l.input[l.start:l.pos]
|
|
}
|
|
|
|
// ignore current element
|
|
func (l *pageLexer) ignore() {
|
|
l.start = l.pos
|
|
}
|
|
|
|
// lf is a single line feed as a byte slice.
var lf = []byte{'\n'}
|
|
|
|
// nil terminates the parser
|
|
func (l *pageLexer) errorf(format string, args ...any) stateFunc {
|
|
l.append(Item{Type: tError, Err: fmt.Errorf(format, args...)})
|
|
return nil
|
|
}
|
|
|
|
func (l *pageLexer) consumeCRLF() bool {
|
|
var consumed bool
|
|
for _, r := range crLf {
|
|
if l.next() != r {
|
|
l.backup()
|
|
} else {
|
|
consumed = true
|
|
}
|
|
}
|
|
return consumed
|
|
}
|
|
|
|
func (l *pageLexer) consumeToNextLine() {
|
|
for {
|
|
r := l.next()
|
|
if r == eof || isEndOfLine(r) {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
func (l *pageLexer) consumeToSpace() {
|
|
for {
|
|
r := l.next()
|
|
if r == eof || unicode.IsSpace(r) {
|
|
l.backup()
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
func (l *pageLexer) consumeSpace() {
|
|
for {
|
|
r := l.next()
|
|
if r == eof || !unicode.IsSpace(r) {
|
|
l.backup()
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// lex a string starting at ":"
|
|
func lexEmoji(l *pageLexer) stateFunc {
|
|
pos := l.pos + 1
|
|
valid := false
|
|
|
|
for i := pos; i < len(l.input); i++ {
|
|
if i > pos && l.input[i] == emojiDelim {
|
|
pos = i + 1
|
|
valid = true
|
|
break
|
|
}
|
|
r, _ := utf8.DecodeRune(l.input[i:])
|
|
if !(isAlphaNumericOrHyphen(r) || r == '+') {
|
|
break
|
|
}
|
|
}
|
|
|
|
if valid {
|
|
l.pos = pos
|
|
l.emit(TypeEmoji)
|
|
} else {
|
|
l.pos++
|
|
l.emit(tText)
|
|
}
|
|
|
|
return lexMainSection
|
|
}
|
|
|
|
type sectionHandlers struct {
|
|
l *pageLexer
|
|
|
|
// Set when none of the sections are found so we
|
|
// can safely stop looking and skip to the end.
|
|
skipAll bool
|
|
|
|
handlers []*sectionHandler
|
|
skipIndexes []int
|
|
}
|
|
|
|
func (s *sectionHandlers) skip() int {
|
|
if s.skipAll {
|
|
return -1
|
|
}
|
|
|
|
s.skipIndexes = s.skipIndexes[:0]
|
|
var shouldSkip bool
|
|
for _, skipper := range s.handlers {
|
|
idx := skipper.skip()
|
|
if idx != -1 {
|
|
shouldSkip = true
|
|
s.skipIndexes = append(s.skipIndexes, idx)
|
|
}
|
|
}
|
|
|
|
if !shouldSkip {
|
|
s.skipAll = true
|
|
return -1
|
|
}
|
|
|
|
return minIndex(s.skipIndexes...)
|
|
}
|
|
|
|
func createSectionHandlers(l *pageLexer) *sectionHandlers {
|
|
shortCodeHandler := §ionHandler{
|
|
l: l,
|
|
skipFunc: func(l *pageLexer) int {
|
|
return l.index(leftDelimSc)
|
|
},
|
|
lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
|
|
if !l.isShortCodeStart() {
|
|
return origin, false
|
|
}
|
|
|
|
if l.isInline {
|
|
// If we're inside an inline shortcode, the only valid shortcode markup is
|
|
// the markup which closes it.
|
|
b := l.input[l.pos+3:]
|
|
end := indexNonWhiteSpace(b, '/')
|
|
if end != len(l.input)-1 {
|
|
b = bytes.TrimSpace(b[end+1:])
|
|
if end == -1 || !bytes.HasPrefix(b, []byte(l.currShortcodeName+" ")) {
|
|
return l.errorf("inline shortcodes do not support nesting"), true
|
|
}
|
|
}
|
|
}
|
|
|
|
if l.hasPrefix(leftDelimScWithMarkup) {
|
|
l.currLeftDelimItem = tLeftDelimScWithMarkup
|
|
l.currRightDelimItem = tRightDelimScWithMarkup
|
|
} else {
|
|
l.currLeftDelimItem = tLeftDelimScNoMarkup
|
|
l.currRightDelimItem = tRightDelimScNoMarkup
|
|
}
|
|
|
|
return lexShortcodeLeftDelim, true
|
|
},
|
|
}
|
|
|
|
summaryDividerHandler := §ionHandler{
|
|
l: l,
|
|
skipFunc: func(l *pageLexer) int {
|
|
if l.summaryDividerChecked || l.summaryDivider == nil {
|
|
return -1
|
|
}
|
|
return l.index(l.summaryDivider)
|
|
},
|
|
lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
|
|
if !l.hasPrefix(l.summaryDivider) {
|
|
return origin, false
|
|
}
|
|
|
|
l.summaryDividerChecked = true
|
|
l.pos += len(l.summaryDivider)
|
|
// This makes it a little easier to reason about later.
|
|
l.consumeSpace()
|
|
l.emit(TypeLeadSummaryDivider)
|
|
|
|
return origin, true
|
|
},
|
|
}
|
|
|
|
handlers := []*sectionHandler{shortCodeHandler, summaryDividerHandler}
|
|
|
|
if l.cfg.EnableEmoji {
|
|
emojiHandler := §ionHandler{
|
|
l: l,
|
|
skipFunc: func(l *pageLexer) int {
|
|
return l.indexByte(emojiDelim)
|
|
},
|
|
lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
|
|
return lexEmoji, true
|
|
},
|
|
}
|
|
|
|
handlers = append(handlers, emojiHandler)
|
|
}
|
|
|
|
return §ionHandlers{
|
|
l: l,
|
|
handlers: handlers,
|
|
skipIndexes: make([]int, len(handlers)),
|
|
}
|
|
}
|
|
|
|
func (s *sectionHandlers) lex(origin stateFunc) stateFunc {
|
|
if s.skipAll {
|
|
return nil
|
|
}
|
|
|
|
if s.l.pos > s.l.start {
|
|
s.l.emit(tText)
|
|
}
|
|
|
|
for _, handler := range s.handlers {
|
|
if handler.skipAll {
|
|
continue
|
|
}
|
|
|
|
next, handled := handler.lexFunc(origin, handler.l)
|
|
if next == nil || handled {
|
|
return next
|
|
}
|
|
}
|
|
|
|
// Not handled by the above.
|
|
s.l.pos++
|
|
|
|
return origin
|
|
}
|
|
|
|
type sectionHandler struct {
|
|
l *pageLexer
|
|
|
|
// No more sections of this type.
|
|
skipAll bool
|
|
|
|
// Returns the index of the next match, -1 if none found.
|
|
skipFunc func(l *pageLexer) int
|
|
|
|
// Lex lexes the current section and returns the next state func and
|
|
// a bool telling if this section was handled.
|
|
// Note that returning nil as the next state will terminate the
|
|
// lexer.
|
|
lexFunc func(origin stateFunc, l *pageLexer) (stateFunc, bool)
|
|
}
|
|
|
|
func (s *sectionHandler) skip() int {
|
|
if s.skipAll {
|
|
return -1
|
|
}
|
|
|
|
idx := s.skipFunc(s.l)
|
|
if idx == -1 {
|
|
s.skipAll = true
|
|
}
|
|
return idx
|
|
}
|
|
|
|
func lexMainSection(l *pageLexer) stateFunc {
|
|
if l.isEOF() {
|
|
return lexDone
|
|
}
|
|
|
|
if l.isInHTMLComment {
|
|
return lexEndFrontMatterHTMLComment
|
|
}
|
|
|
|
// Fast forward as far as possible.
|
|
skip := l.sectionHandlers.skip()
|
|
|
|
if skip == -1 {
|
|
l.pos = len(l.input)
|
|
return lexDone
|
|
} else if skip > 0 {
|
|
l.pos += skip
|
|
}
|
|
|
|
next := l.sectionHandlers.lex(lexMainSection)
|
|
if next != nil {
|
|
return next
|
|
}
|
|
|
|
l.pos = len(l.input)
|
|
return lexDone
|
|
}
|
|
|
|
func lexDone(l *pageLexer) stateFunc {
|
|
// Done!
|
|
if l.pos > l.start {
|
|
l.emit(tText)
|
|
}
|
|
l.emit(tEOF)
|
|
return nil
|
|
}
|
|
|
|
func (l *pageLexer) printCurrentInput() {
|
|
fmt.Printf("input[%d:]: %q", l.pos, string(l.input[l.pos:]))
|
|
}
|
|
|
|
// state helpers
|
|
|
|
func (l *pageLexer) index(sep []byte) int {
|
|
return bytes.Index(l.input[l.pos:], sep)
|
|
}
|
|
|
|
func (l *pageLexer) indexByte(sep byte) int {
|
|
return bytes.IndexByte(l.input[l.pos:], sep)
|
|
}
|
|
|
|
func (l *pageLexer) hasPrefix(prefix []byte) bool {
|
|
return bytes.HasPrefix(l.input[l.pos:], prefix)
|
|
}
|
|
|
|
// helper functions
|
|
|
|
// minIndex returns the smallest non-negative value among indices. Negative
// values are ignored; if there is no non-negative value at all, including
// when called without arguments, it returns -1.
func minIndex(indices ...int) int {
	lowest := -1

	for _, idx := range indices {
		if idx >= 0 && (lowest == -1 || idx < lowest) {
			lowest = idx
		}
	}

	return lowest
}
|
|
|
|
// indexNonWhiteSpace locates the first non-whitespace rune in s and returns
// its byte offset if that rune equals in. It returns -1 when s contains only
// whitespace (or is empty) or when the first non-whitespace rune is a
// different one.
func indexNonWhiteSpace(s []byte, in rune) int {
	notSpace := func(r rune) bool { return !unicode.IsSpace(r) }

	idx := bytes.IndexFunc(s, notSpace)
	if idx < 0 {
		return -1
	}

	if r, _ := utf8.DecodeRune(s[idx:]); r != in {
		return -1
	}
	return idx
}
|
|
|
|
// isSpace reports whether r is a space or a tab. Line breaks and other
// Unicode whitespace do not count.
func isSpace(r rune) bool {
	switch r {
	case ' ', '\t':
		return true
	}
	return false
}
|
|
|
|
func isAlphaNumericOrHyphen(r rune) bool {
|
|
// let unquoted YouTube ids as positional params slip through (they contain hyphens)
|
|
return isAlphaNumeric(r) || r == '-'
|
|
}
|
|
|
|
// crLf holds carriage return and line feed, in the order consumeCRLF tries
// to consume them.
var crLf = []rune("\r\n")
|
|
|
|
// isEndOfLine reports whether r is a carriage return or a line feed.
func isEndOfLine(r rune) bool {
	switch r {
	case '\r', '\n':
		return true
	}
	return false
}
|
|
|
|
// isAlphaNumeric reports whether r is an underscore, a Unicode letter or a
// Unicode digit.
func isAlphaNumeric(r rune) bool {
	if r == '_' {
		return true
	}
	return unicode.IsLetter(r) || unicode.IsDigit(r)
}
|