parser/pageparser: File renames and splitting

See #5324
This commit is contained in:
Bjørn Erik Pedersen 2018-10-17 13:16:45 +02:00
parent d6c16afde0
commit f6863e1ef7
No known key found for this signature in database
GPG key ID: 330E6E2BD4859D8F
4 changed files with 249 additions and 209 deletions

103
parser/pageparser/item.go Normal file
View file

@ -0,0 +1,103 @@
// Copyright 2018 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package pageparser
import "fmt"
// Item represents a single token produced by the page lexer.
type Item struct {
	typ itemType // the kind of token
	pos pos      // position (in bytes) of the token in the input
	Val string   // the literal text of the token
}
// IsText reports whether the item is plain text outside of shortcodes.
func (i Item) IsText() bool {
	return i.typ == tText
}
// IsShortcodeName reports whether the item is a shortcode name.
func (i Item) IsShortcodeName() bool {
	return i.typ == tScName
}
// IsLeftShortcodeDelim reports whether the item is a left shortcode
// delimiter, with or without markup.
func (i Item) IsLeftShortcodeDelim() bool {
	return i.typ == tLeftDelimScWithMarkup || i.typ == tLeftDelimScNoMarkup
}
// IsRightShortcodeDelim reports whether the item is a right shortcode
// delimiter, with or without markup.
func (i Item) IsRightShortcodeDelim() bool {
	return i.typ == tRightDelimScWithMarkup || i.typ == tRightDelimScNoMarkup
}
// IsShortcodeClose reports whether the item is a shortcode closing tag.
func (i Item) IsShortcodeClose() bool {
	return i.typ == tScClose
}
// IsShortcodeParam reports whether the item is a shortcode parameter.
func (i Item) IsShortcodeParam() bool {
	return i.typ == tScParam
}
// IsShortcodeParamVal reports whether the item is a shortcode parameter value.
func (i Item) IsShortcodeParamVal() bool {
	return i.typ == tScParamVal
}
// IsShortcodeMarkupDelimiter reports whether the item is a shortcode
// delimiter of the with-markup variant (left or right).
func (i Item) IsShortcodeMarkupDelimiter() bool {
	return i.typ == tLeftDelimScWithMarkup || i.typ == tRightDelimScWithMarkup
}
// IsDone reports whether the item terminates the token stream,
// i.e. it is either an error or EOF.
func (i Item) IsDone() bool {
	return i.typ == tError || i.typ == tEOF
}
// IsEOF reports whether the item marks the end of the input.
func (i Item) IsEOF() bool {
	return i.typ == tEOF
}
// IsError reports whether the item is a lexing error; if so, Val holds
// the error text.
func (i Item) IsError() bool {
	return i.typ == tError
}
// String returns a compact, human-readable description of the item,
// truncating long values; intended for debugging and logging.
func (i Item) String() string {
	switch i.typ {
	case tEOF:
		return "EOF"
	case tError:
		return i.Val
	}
	if i.typ > tKeywordMarker {
		return fmt.Sprintf("<%s>", i.Val)
	}
	if len(i.Val) > 20 {
		// Quote and cut long values down to 20 runes.
		return fmt.Sprintf("%.20q...", i.Val)
	}
	return fmt.Sprintf("[%s]", i.Val)
}
// itemType identifies the kind of token produced by the lexer.
type itemType int
const (
	tError itemType = iota // an error occurred; Val holds the error text
	tEOF                   // end of input
	// shortcode items
	tLeftDelimScNoMarkup
	tRightDelimScNoMarkup
	tLeftDelimScWithMarkup
	tRightDelimScWithMarkup
	tScClose
	tScName
	tScParam
	tScParamVal
	//itemIdentifier
	tText // plain text, used for everything outside the shortcodes
	// preserved for later - keywords come after this
	tKeywordMarker
)

View file

@ -11,6 +11,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// Package pageparser provides a parser for Hugo content files (Markdown, HTML etc.) in Hugo.
// This implementation is highly inspired by the great talk given by Rob Pike called "Lexical Scanning in Go"
// It's on YouTube, Google it!
// See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
package pageparser package pageparser
import ( import (
@ -20,177 +24,26 @@ import (
"unicode/utf8" "unicode/utf8"
) )
// The lexical scanning below is highly inspired by the great talk given by
// Rob Pike called "Lexical Scanning in Go" (it's on YouTube, Google it!).
// See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
// parsing
type Tokens struct {
lexer *pagelexer
token [3]Item // 3-item look-ahead is what we currently need
peekCount int
}
func (t *Tokens) Next() Item {
if t.peekCount > 0 {
t.peekCount--
} else {
t.token[0] = t.lexer.nextItem()
}
return t.token[t.peekCount]
}
// backs up one token.
func (t *Tokens) Backup() {
t.peekCount++
}
// backs up two tokens.
func (t *Tokens) Backup2(t1 Item) {
t.token[1] = t1
t.peekCount = 2
}
// backs up three tokens.
func (t *Tokens) Backup3(t2, t1 Item) {
t.token[1] = t1
t.token[2] = t2
t.peekCount = 3
}
// check for non-error and non-EOF types coming next
func (t *Tokens) IsValueNext() bool {
i := t.Peek()
return i.typ != tError && i.typ != tEOF
}
// look at, but do not consume, the next item
// repeated, sequential calls will return the same item
func (t *Tokens) Peek() Item {
if t.peekCount > 0 {
return t.token[t.peekCount-1]
}
t.peekCount = 1
t.token[0] = t.lexer.nextItem()
return t.token[0]
}
// Consume is a convenience method to consume the next n tokens,
// but back off Errors and EOF.
func (t *Tokens) Consume(cnt int) {
for i := 0; i < cnt; i++ {
token := t.Next()
if token.typ == tError || token.typ == tEOF {
t.Backup()
break
}
}
}
// LineNumber returns the current line number. Used for logging.
func (t *Tokens) LineNumber() int {
return t.lexer.lineNum()
}
// lexical scanning
// position (in bytes) // position (in bytes)
type pos int type pos int
type Item struct {
typ itemType
pos pos
Val string
}
func (i Item) IsText() bool {
return i.typ == tText
}
func (i Item) IsShortcodeName() bool {
return i.typ == tScName
}
func (i Item) IsLeftShortcodeDelim() bool {
return i.typ == tLeftDelimScWithMarkup || i.typ == tLeftDelimScNoMarkup
}
func (i Item) IsRightShortcodeDelim() bool {
return i.typ == tRightDelimScWithMarkup || i.typ == tRightDelimScNoMarkup
}
func (i Item) IsShortcodeClose() bool {
return i.typ == tScClose
}
func (i Item) IsShortcodeParam() bool {
return i.typ == tScParam
}
func (i Item) IsShortcodeParamVal() bool {
return i.typ == tScParamVal
}
func (i Item) IsShortcodeMarkupDelimiter() bool {
return i.typ == tLeftDelimScWithMarkup || i.typ == tRightDelimScWithMarkup
}
func (i Item) IsDone() bool {
return i.typ == tError || i.typ == tEOF
}
func (i Item) IsEOF() bool {
return i.typ == tEOF
}
func (i Item) IsError() bool {
return i.typ == tError
}
func (i Item) String() string {
switch {
case i.typ == tEOF:
return "EOF"
case i.typ == tError:
return i.Val
case i.typ > tKeywordMarker:
return fmt.Sprintf("<%s>", i.Val)
case len(i.Val) > 20:
return fmt.Sprintf("%.20q...", i.Val)
}
return fmt.Sprintf("[%s]", i.Val)
}
type itemType int
const (
tError itemType = iota
tEOF
// shortcode items
tLeftDelimScNoMarkup
tRightDelimScNoMarkup
tLeftDelimScWithMarkup
tRightDelimScWithMarkup
tScClose
tScName
tScParam
tScParamVal
//itemIdentifier
tText // plain text, used for everything outside the shortcodes
// preserved for later - keywords come after this
tKeywordMarker
)
const eof = -1 const eof = -1
// returns the next state in scanner. // returns the next state in scanner.
type stateFunc func(*pagelexer) stateFunc type stateFunc func(*pageLexer) stateFunc
type pagelexer struct { type lexerShortcodeState struct {
currLeftDelimItem itemType
currRightDelimItem itemType
currShortcodeName string // is only set when a shortcode is in opened state
closingState int // > 0 = on its way to be closed
elementStepNum int // step number in element
paramElements int // number of elements (name + value = 2) found first
openShortcodes map[string]bool // set of shortcodes in open state
}
type pageLexer struct {
name string name string
input string input string
state stateFunc state stateFunc
@ -199,14 +52,7 @@ type pagelexer struct {
width pos // width of last element width pos // width of last element
lastPos pos // position of the last item returned by nextItem lastPos pos // position of the last item returned by nextItem
// shortcode state lexerShortcodeState
currLeftDelimItem itemType
currRightDelimItem itemType
currShortcodeName string // is only set when a shortcode is in opened state
closingState int // > 0 = on its way to be closed
elementStepNum int // step number in element
paramElements int // number of elements (name + value = 2) found first
openShortcodes map[string]bool // set of shortcodes in open state
// items delivered to client // items delivered to client
items []Item items []Item
@ -217,31 +63,35 @@ func Parse(s string) *Tokens {
} }
func ParseFrom(s string, from int) *Tokens { func ParseFrom(s string, from int) *Tokens {
return &Tokens{lexer: newShortcodeLexer("default", s, pos(from))} lexer := newPageLexer("default", s, pos(from))
lexer.run()
return &Tokens{lexer: lexer}
} }
// note: the input position here is normally 0 (start), but // note: the input position here is normally 0 (start), but
// can be set if position of first shortcode is known // can be set if position of first shortcode is known
func newShortcodeLexer(name, input string, inputPosition pos) *pagelexer { func newPageLexer(name, input string, inputPosition pos) *pageLexer {
lexer := &pagelexer{ lexer := &pageLexer{
name: name, name: name,
input: input, input: input,
currLeftDelimItem: tLeftDelimScNoMarkup, pos: inputPosition,
currRightDelimItem: tRightDelimScNoMarkup, lexerShortcodeState: lexerShortcodeState{
pos: inputPosition, currLeftDelimItem: tLeftDelimScNoMarkup,
openShortcodes: make(map[string]bool), currRightDelimItem: tRightDelimScNoMarkup,
items: make([]Item, 0, 5), openShortcodes: make(map[string]bool),
},
items: make([]Item, 0, 5),
} }
lexer.runShortcodeLexer()
return lexer return lexer
} }
// main loop // main loop
// this looks kind of funky, but it works func (l *pageLexer) run() *pageLexer {
func (l *pagelexer) runShortcodeLexer() {
for l.state = lexTextOutsideShortcodes; l.state != nil; { for l.state = lexTextOutsideShortcodes; l.state != nil; {
l.state = l.state(l) l.state = l.state(l)
} }
return l
} }
// state functions // state functions
@ -255,7 +105,7 @@ const (
rightComment = "*/" rightComment = "*/"
) )
func (l *pagelexer) next() rune { func (l *pageLexer) next() rune {
if int(l.pos) >= len(l.input) { if int(l.pos) >= len(l.input) {
l.width = 0 l.width = 0
return eof return eof
@ -270,25 +120,25 @@ func (l *pagelexer) next() rune {
} }
// peek, but no consume // peek, but no consume
func (l *pagelexer) peek() rune { func (l *pageLexer) peek() rune {
r := l.next() r := l.next()
l.backup() l.backup()
return r return r
} }
// steps back one // steps back one
func (l *pagelexer) backup() { func (l *pageLexer) backup() {
l.pos -= l.width l.pos -= l.width
} }
// sends an item back to the client. // sends an item back to the client.
func (l *pagelexer) emit(t itemType) { func (l *pageLexer) emit(t itemType) {
l.items = append(l.items, Item{t, l.start, l.input[l.start:l.pos]}) l.items = append(l.items, Item{t, l.start, l.input[l.start:l.pos]})
l.start = l.pos l.start = l.pos
} }
// special case, do not send '\\' back to client // special case, do not send '\\' back to client
func (l *pagelexer) ignoreEscapesAndEmit(t itemType) { func (l *pageLexer) ignoreEscapesAndEmit(t itemType) {
val := strings.Map(func(r rune) rune { val := strings.Map(func(r rune) rune {
if r == '\\' { if r == '\\' {
return -1 return -1
@ -300,28 +150,28 @@ func (l *pagelexer) ignoreEscapesAndEmit(t itemType) {
} }
// gets the current value (for debugging and error handling) // gets the current value (for debugging and error handling)
func (l *pagelexer) current() string { func (l *pageLexer) current() string {
return l.input[l.start:l.pos] return l.input[l.start:l.pos]
} }
// ignore current element // ignore current element
func (l *pagelexer) ignore() { func (l *pageLexer) ignore() {
l.start = l.pos l.start = l.pos
} }
// nice to have in error logs // nice to have in error logs
func (l *pagelexer) lineNum() int { func (l *pageLexer) lineNum() int {
return strings.Count(l.input[:l.lastPos], "\n") + 1 return strings.Count(l.input[:l.lastPos], "\n") + 1
} }
// nil terminates the parser // nil terminates the parser
func (l *pagelexer) errorf(format string, args ...interface{}) stateFunc { func (l *pageLexer) errorf(format string, args ...interface{}) stateFunc {
l.items = append(l.items, Item{tError, l.start, fmt.Sprintf(format, args...)}) l.items = append(l.items, Item{tError, l.start, fmt.Sprintf(format, args...)})
return nil return nil
} }
// consumes and returns the next item // consumes and returns the next item
func (l *pagelexer) nextItem() Item { func (l *pageLexer) nextItem() Item {
item := l.items[0] item := l.items[0]
l.items = l.items[1:] l.items = l.items[1:]
l.lastPos = item.pos l.lastPos = item.pos
@ -330,7 +180,7 @@ func (l *pagelexer) nextItem() Item {
// scans until an opening shortcode opening bracket. // scans until an opening shortcode opening bracket.
// if no shortcodes, it will keep on scanning until EOF // if no shortcodes, it will keep on scanning until EOF
func lexTextOutsideShortcodes(l *pagelexer) stateFunc { func lexTextOutsideShortcodes(l *pageLexer) stateFunc {
for { for {
if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || strings.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup) { if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || strings.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup) {
if l.pos > l.start { if l.pos > l.start {
@ -358,7 +208,7 @@ func lexTextOutsideShortcodes(l *pagelexer) stateFunc {
return nil return nil
} }
func lexShortcodeLeftDelim(l *pagelexer) stateFunc { func lexShortcodeLeftDelim(l *pageLexer) stateFunc {
l.pos += pos(len(l.currentLeftShortcodeDelim())) l.pos += pos(len(l.currentLeftShortcodeDelim()))
if strings.HasPrefix(l.input[l.pos:], leftComment) { if strings.HasPrefix(l.input[l.pos:], leftComment) {
return lexShortcodeComment return lexShortcodeComment
@ -369,7 +219,7 @@ func lexShortcodeLeftDelim(l *pagelexer) stateFunc {
return lexInsideShortcode return lexInsideShortcode
} }
func lexShortcodeComment(l *pagelexer) stateFunc { func lexShortcodeComment(l *pageLexer) stateFunc {
posRightComment := strings.Index(l.input[l.pos:], rightComment+l.currentRightShortcodeDelim()) posRightComment := strings.Index(l.input[l.pos:], rightComment+l.currentRightShortcodeDelim())
if posRightComment <= 1 { if posRightComment <= 1 {
return l.errorf("comment must be closed") return l.errorf("comment must be closed")
@ -387,7 +237,7 @@ func lexShortcodeComment(l *pagelexer) stateFunc {
return lexTextOutsideShortcodes return lexTextOutsideShortcodes
} }
func lexShortcodeRightDelim(l *pagelexer) stateFunc { func lexShortcodeRightDelim(l *pageLexer) stateFunc {
l.closingState = 0 l.closingState = 0
l.pos += pos(len(l.currentRightShortcodeDelim())) l.pos += pos(len(l.currentRightShortcodeDelim()))
l.emit(l.currentRightShortcodeDelimItem()) l.emit(l.currentRightShortcodeDelimItem())
@ -399,7 +249,7 @@ func lexShortcodeRightDelim(l *pagelexer) stateFunc {
// 2. "param" or "param\" // 2. "param" or "param\"
// 3. param="123" or param="123\" // 3. param="123" or param="123\"
// 4. param="Some \"escaped\" text" // 4. param="Some \"escaped\" text"
func lexShortcodeParam(l *pagelexer, escapedQuoteStart bool) stateFunc { func lexShortcodeParam(l *pageLexer, escapedQuoteStart bool) stateFunc {
first := true first := true
nextEq := false nextEq := false
@ -451,7 +301,7 @@ func lexShortcodeParam(l *pagelexer, escapedQuoteStart bool) stateFunc {
} }
func lexShortcodeQuotedParamVal(l *pagelexer, escapedQuotedValuesAllowed bool, typ itemType) stateFunc { func lexShortcodeQuotedParamVal(l *pageLexer, escapedQuotedValuesAllowed bool, typ itemType) stateFunc {
openQuoteFound := false openQuoteFound := false
escapedInnerQuoteFound := false escapedInnerQuoteFound := false
escapedQuoteState := 0 escapedQuoteState := 0
@ -516,7 +366,7 @@ Loop:
} }
// scans an alphanumeric inside shortcode // scans an alphanumeric inside shortcode
func lexIdentifierInShortcode(l *pagelexer) stateFunc { func lexIdentifierInShortcode(l *pageLexer) stateFunc {
lookForEnd := false lookForEnd := false
Loop: Loop:
for { for {
@ -549,7 +399,7 @@ Loop:
return lexInsideShortcode return lexInsideShortcode
} }
func lexEndOfShortcode(l *pagelexer) stateFunc { func lexEndOfShortcode(l *pageLexer) stateFunc {
if strings.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) { if strings.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) {
return lexShortcodeRightDelim return lexShortcodeRightDelim
} }
@ -563,7 +413,7 @@ func lexEndOfShortcode(l *pagelexer) stateFunc {
} }
// scans the elements inside shortcode tags // scans the elements inside shortcode tags
func lexInsideShortcode(l *pagelexer) stateFunc { func lexInsideShortcode(l *pageLexer) stateFunc {
if strings.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) { if strings.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) {
return lexShortcodeRightDelim return lexShortcodeRightDelim
} }
@ -601,15 +451,15 @@ func lexInsideShortcode(l *pagelexer) stateFunc {
// state helpers // state helpers
func (l *pagelexer) currentLeftShortcodeDelimItem() itemType { func (l *pageLexer) currentLeftShortcodeDelimItem() itemType {
return l.currLeftDelimItem return l.currLeftDelimItem
} }
func (l *pagelexer) currentRightShortcodeDelimItem() itemType { func (l *pageLexer) currentRightShortcodeDelimItem() itemType {
return l.currRightDelimItem return l.currRightDelimItem
} }
func (l *pagelexer) currentLeftShortcodeDelim() string { func (l *pageLexer) currentLeftShortcodeDelim() string {
if l.currLeftDelimItem == tLeftDelimScWithMarkup { if l.currLeftDelimItem == tLeftDelimScWithMarkup {
return leftDelimScWithMarkup return leftDelimScWithMarkup
} }
@ -617,7 +467,7 @@ func (l *pagelexer) currentLeftShortcodeDelim() string {
} }
func (l *pagelexer) currentRightShortcodeDelim() string { func (l *pageLexer) currentRightShortcodeDelim() string {
if l.currRightDelimItem == tRightDelimScWithMarkup { if l.currRightDelimItem == tRightDelimScWithMarkup {
return rightDelimScWithMarkup return rightDelimScWithMarkup
} }

View file

@ -0,0 +1,87 @@
// Copyright 2018 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package pageparser provides a parser for Hugo content files (Markdown, HTML etc.) in Hugo.
// This implementation is highly inspired by the great talk given by Rob Pike called "Lexical Scanning in Go"
// It's on YouTube, Google it!
// See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
package pageparser
// Tokens provides iteration over the items produced by the page lexer,
// with support for peeking and backing up.
type Tokens struct {
	lexer     *pageLexer
	token     [3]Item // 3-item look-ahead is what we currently need
	peekCount int     // number of buffered tokens to replay before pulling from the lexer
}
// Next returns the next token, replaying any backed-up tokens before
// pulling fresh ones from the lexer.
func (t *Tokens) Next() Item {
	if t.peekCount == 0 {
		t.token[0] = t.lexer.nextItem()
	} else {
		t.peekCount--
	}
	return t.token[t.peekCount]
}
// Backup backs up the token stream by one token; the most recently
// returned token will be returned again by Next.
func (t *Tokens) Backup() {
	t.peekCount++
}
// Backup2 backs up the token stream by two tokens: t1 and then the most
// recently returned token will be delivered again, in that order.
func (t *Tokens) Backup2(t1 Item) {
	t.token[1] = t1
	t.peekCount = 2
}
// Backup3 backs up the token stream by three tokens: t2, t1 and then the
// most recently returned token will be delivered again, in that order.
func (t *Tokens) Backup3(t2, t1 Item) {
	t.token[1] = t1
	t.token[2] = t2
	t.peekCount = 3
}
// IsValueNext reports whether the upcoming token is a value, i.e.
// neither an error nor EOF. The token is not consumed.
func (t *Tokens) IsValueNext() bool {
	switch t.Peek().typ {
	case tError, tEOF:
		return false
	default:
		return true
	}
}
// Peek returns, but does not consume, the next token. Repeated,
// sequential calls return the same token.
func (t *Tokens) Peek() Item {
	if t.peekCount == 0 {
		t.token[0] = t.lexer.nextItem()
		t.peekCount = 1
	}
	return t.token[t.peekCount-1]
}
// Consume is a convenience method that reads and discards the next cnt
// tokens, stopping early — and backing up — when an error or EOF token
// is encountered, so the terminal token remains available to the caller.
func (t *Tokens) Consume(cnt int) {
	for i := 0; i < cnt; i++ {
		next := t.Next()
		if next.typ != tError && next.typ != tEOF {
			continue
		}
		t.Backup()
		return
	}
}
// LineNumber returns the current line number, i.e. the 1-based count of
// newlines up to the position of the last returned item. Used for logging.
func (t *Tokens) LineNumber() int {
	return t.lexer.lineNum()
}

View file

@ -179,7 +179,7 @@ func BenchmarkShortcodeLexer(b *testing.B) {
} }
func collect(t *shortCodeLexerTest) (items []Item) { func collect(t *shortCodeLexerTest) (items []Item) {
l := newShortcodeLexer(t.name, t.input, 0) l := newPageLexer(t.name, t.input, 0).run()
for { for {
item := l.nextItem() item := l.nextItem()
items = append(items, item) items = append(items, item)