hugo/parser/pageparser/pageparser.go

// Copyright 2018 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package pageparser provides a parser for Hugo content files (Markdown, HTML etc.) in Hugo.
// This implementation is highly inspired by the great talk given by Rob Pike called "Lexical Scanning in Go"
// It's on YouTube, Google it!.
// See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
package pageparser

import (
	"bytes"
	"io"
	"io/ioutil"

	"github.com/pkg/errors"
)

// Result holds the parse result.
// Within this package it is implemented by *pageLexer.
type Result interface {
	// Iterator returns a new Iterator positioned at the beginning of the parse tree.
	Iterator() *Iterator
	// Input returns the input to Parse.
	Input() []byte
}

// Compile-time assertion that *pageLexer satisfies the Result interface.
var _ Result = (*pageLexer)(nil)

// Parse reads the entire content of r and parses it as a page according to
// cfg. If reading from r fails, the read error is returned wrapped with
// additional context and the Result is nil.
func Parse(r io.Reader, cfg Config) (Result, error) {
	content, readErr := ioutil.ReadAll(r)
	if readErr == nil {
		return parseBytes(content, cfg)
	}
	return nil, errors.Wrap(readErr, "failed to read page content")
}

// parseBytes lexes the page content in b, starting in the intro section
// state, and returns the lexer that ran as the parse Result. The returned
// error is always nil.
func parseBytes(b []byte, cfg Config) (Result, error) {
	pl := newPageLexer(b, lexIntroSection, cfg)
	pl.run()

	return pl, nil
}

// An Iterator has methods to iterate a parsed page with support going back
// if needed.
type Iterator struct {
	// l is the lexer whose items this iterator walks.
	l       *pageLexer
	lastPos int // index into l.items of the current item; incremented by Next, decremented by Backup
}

// Next advances the iterator by one position and returns the item found
// there. Once the position has moved past the last item, the "no more
// tokens" error item is returned instead.
func (t *Iterator) Next() Item {
	t.lastPos += 1
	item := t.current()
	return item
}

// Input returns the raw input bytes of the lexer this iterator walks.
func (t *Iterator) Input() []byte {
	lexer := t.l
	return lexer.Input()
}

// errIndexOutOfBounds is the error item returned by current when the
// iterator position is at or past the end of the lexer's items.
var errIndexOutOfBounds = Item{tError, 0, []byte("no more tokens")}

// current returns the item at the iterator's present position, or
// errIndexOutOfBounds when that position lies at or beyond the end of the
// lexer's items.
func (t *Iterator) current() Item {
	if idx := t.lastPos; idx < len(t.l.items) {
		return t.l.items[idx]
	}
	return errIndexOutOfBounds
}

// Backup moves the iterator back by one token.
// It panics if the position is already negative, i.e. there is nothing
// to step back from.
func (t *Iterator) Backup() {
	if t.lastPos >= 0 {
		t.lastPos--
		return
	}
	panic("need to go forward before going back")
}

// IsValueNext reports whether the upcoming item, as seen by Peek, is a
// regular value, i.e. neither an error item nor EOF.
func (t *Iterator) IsValueNext() bool {
	switch t.Peek().Type {
	case tError, tEOF:
		return false
	default:
		return true
	}
}

// Peek returns the item following the current position without consuming
// it. Repeated, sequential calls return the same item.
//
// When there is no following item (the iterator is at or past the last
// item), Peek returns the same "no more tokens" error item that Next
// returns in that situation, rather than panicking on an out-of-range index.
func (t *Iterator) Peek() Item {
	// Look through a throwaway copy advanced by one position so the bounds
	// handling in current() is reused and t itself is left untouched.
	next := Iterator{l: t.l, lastPos: t.lastPos + 1}
	return next.current()
}

// PeekWalk passes each item after the current position to walkFn, in
// order, stopping as soon as walkFn returns false or the items run out.
// The iterator's own position is not changed.
func (t *Iterator) PeekWalk(walkFn func(item Item) bool) {
	for next := t.lastPos + 1; next < len(t.l.items); next++ {
		if keepGoing := walkFn(t.l.items[next]); !keepGoing {
			return
		}
	}
}

// Consume is a convenience method that consumes up to cnt tokens.
// If an error or EOF token is encountered, that token is un-consumed
// via Backup and consumption stops early.
func (t *Iterator) Consume(cnt int) {
	for remaining := cnt; remaining > 0; remaining-- {
		switch t.Next().Type {
		case tError, tEOF:
			t.Backup()
			return
		}
	}
}

// LineNumber returns the 1-based line number of the current item, computed
// by counting the line feeds in the input that precede the item's position.
// Used for logging.
func (t *Iterator) LineNumber() int {
	pos := t.current().Pos
	preceding := t.l.input[:pos]
	return 1 + bytes.Count(preceding, lf)
}
parser/pageparser: File renames and splitting See #5324 2018-10-17 07:16:45 -04:00			`// Copyright 2018 The Hugo Authors. All rights reserved.`
			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`

			`// Package pageparser provides a parser for Hugo content files (Markdown, HTML etc.) in Hugo.`
			`// This implementation is highly inspired by the great talk given by Rob Pike called "Lexical Scanning in Go"`
			`// It's on YouTube, Google it!.`
			`// See slides here: http://cuddle.googlecode.com/hg/talk/lex.html`
			`package pageparser`

hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`import (`
			`"bytes"`
			`"io"`
			`"io/ioutil"`

			`"github.com/pkg/errors"`
			`)`

			`// Result holds the parse result.`
			`type Result interface {`
Move the emoji parsing to pageparser This avoids double parsing the page content when `enableEmoji=true`. This commit also adds some general improvements to the parser, making it in general much faster: ```bash benchmark old ns/op new ns/op delta BenchmarkShortcodeLexer-4 90258 101730 +12.71% BenchmarkParse-4 148940 15037 -89.90% benchmark old allocs new allocs delta BenchmarkShortcodeLexer-4 456 700 +53.51% BenchmarkParse-4 28 33 +17.86% benchmark old bytes new bytes delta BenchmarkShortcodeLexer-4 69875 81014 +15.94% BenchmarkParse-4 8128 8304 +2.17% ``` Running some site benchmarks with Emoji support turned on: ```bash benchmark old ns/op new ns/op delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 924556797 818115620 -11.51% benchmark old allocs new allocs delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 4112613 4133787 +0.51% benchmark old bytes new bytes delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 426982864 424363832 -0.61% ``` Fixes #5534 2018-12-17 15:03:23 -05:00			`// Iterator returns a new Iterator positioned at the beginning of the parse tree.`
hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`Iterator() *Iterator`
			`// Input returns the input to Parse.`
			`Input() []byte`
hugolib: Use []byte in shortcode parsing See #5324 2018-10-18 03:47:39 -04:00			`}`

hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`var _ Result = (*pageLexer)(nil)`

Move the emoji parsing to pageparser This avoids double parsing the page content when `enableEmoji=true`. This commit also adds some general improvements to the parser, making it in general much faster: ```bash benchmark old ns/op new ns/op delta BenchmarkShortcodeLexer-4 90258 101730 +12.71% BenchmarkParse-4 148940 15037 -89.90% benchmark old allocs new allocs delta BenchmarkShortcodeLexer-4 456 700 +53.51% BenchmarkParse-4 28 33 +17.86% benchmark old bytes new bytes delta BenchmarkShortcodeLexer-4 69875 81014 +15.94% BenchmarkParse-4 8128 8304 +2.17% ``` Running some site benchmarks with Emoji support turned on: ```bash benchmark old ns/op new ns/op delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 924556797 818115620 -11.51% benchmark old allocs new allocs delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 4112613 4133787 +0.51% benchmark old bytes new bytes delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 426982864 424363832 -0.61% ``` Fixes #5534 2018-12-17 15:03:23 -05:00			`// Parse parses the page in the given reader according to the given Config.`
			`func Parse(r io.Reader, cfg Config) (Result, error) {`
hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`b, err := ioutil.ReadAll(r)`
			`if err != nil {`
			`return nil, errors.Wrap(err, "failed to read page content")`
			`}`
Move the emoji parsing to pageparser This avoids double parsing the page content when `enableEmoji=true`. This commit also adds some general improvements to the parser, making it in general much faster: ```bash benchmark old ns/op new ns/op delta BenchmarkShortcodeLexer-4 90258 101730 +12.71% BenchmarkParse-4 148940 15037 -89.90% benchmark old allocs new allocs delta BenchmarkShortcodeLexer-4 456 700 +53.51% BenchmarkParse-4 28 33 +17.86% benchmark old bytes new bytes delta BenchmarkShortcodeLexer-4 69875 81014 +15.94% BenchmarkParse-4 8128 8304 +2.17% ``` Running some site benchmarks with Emoji support turned on: ```bash benchmark old ns/op new ns/op delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 924556797 818115620 -11.51% benchmark old allocs new allocs delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 4112613 4133787 +0.51% benchmark old bytes new bytes delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 426982864 424363832 -0.61% ``` Fixes #5534 2018-12-17 15:03:23 -05:00			`return parseBytes(b, cfg)`
parser/pageparser: Add a benchmark 2018-12-19 14:07:49 -05:00			`}`

Move the emoji parsing to pageparser This avoids double parsing the page content when `enableEmoji=true`. This commit also adds some general improvements to the parser, making it in general much faster: ```bash benchmark old ns/op new ns/op delta BenchmarkShortcodeLexer-4 90258 101730 +12.71% BenchmarkParse-4 148940 15037 -89.90% benchmark old allocs new allocs delta BenchmarkShortcodeLexer-4 456 700 +53.51% BenchmarkParse-4 28 33 +17.86% benchmark old bytes new bytes delta BenchmarkShortcodeLexer-4 69875 81014 +15.94% BenchmarkParse-4 8128 8304 +2.17% ``` Running some site benchmarks with Emoji support turned on: ```bash benchmark old ns/op new ns/op delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 924556797 818115620 -11.51% benchmark old allocs new allocs delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 4112613 4133787 +0.51% benchmark old bytes new bytes delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 426982864 424363832 -0.61% ``` Fixes #5534 2018-12-17 15:03:23 -05:00			`func parseBytes(b []byte, cfg Config) (Result, error) {`
			`lexer := newPageLexer(b, lexIntroSection, cfg)`
hugolib: Use []byte in shortcode parsing See #5324 2018-10-18 03:47:39 -04:00			`lexer.run()`
hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`return lexer, nil`
hugolib: Use []byte in shortcode parsing See #5324 2018-10-18 03:47:39 -04:00			`}`
parser/pageparser: File renames and splitting See #5324 2018-10-17 07:16:45 -04:00
hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`// An Iterator has methods to iterate a parsed page with support going back`
			`// if needed.`
			`type Iterator struct {`
			`l *pageLexer`
Convert the rest to new page parser code paths And remove some now unused code. See #5324 2018-10-20 11:38:49 -04:00			`lastPos int // position of the last item returned by nextItem`
parser/pageparser: File renames and splitting See #5324 2018-10-17 07:16:45 -04:00			`}`

hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`// consumes and returns the next item`
			`func (t *Iterator) Next() Item {`
			`t.lastPos++`
			`return t.current()`
parser/pageparser: File renames and splitting See #5324 2018-10-17 07:16:45 -04:00			`}`

hugolib: Continue the file context/line number errors work See #5324 2018-10-21 06:20:21 -04:00			`// Input returns the input source.`
			`func (t *Iterator) Input() []byte {`
			`return t.l.Input()`
			`}`

hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`var errIndexOutOfBounds = Item{tError, 0, []byte("no more tokens")}`

			`func (t *Iterator) current() Item {`
Convert the rest to new page parser code paths And remove some now unused code. See #5324 2018-10-20 11:38:49 -04:00			`if t.lastPos >= len(t.l.items) {`
hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`return errIndexOutOfBounds`
			`}`
			`return t.l.items[t.lastPos]`
parser/pageparser: File renames and splitting See #5324 2018-10-17 07:16:45 -04:00			`}`

hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`// backs up one token.`
			`func (t *Iterator) Backup() {`
			`if t.lastPos < 0 {`
			`panic("need to go forward before going back")`
			`}`
			`t.lastPos--`
parser/pageparser: File renames and splitting See #5324 2018-10-17 07:16:45 -04:00			`}`

			`// check for non-error and non-EOF types coming next`
hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`func (t *Iterator) IsValueNext() bool {`
parser/pageparser: File renames and splitting See #5324 2018-10-17 07:16:45 -04:00			`i := t.Peek()`
hugolib: Redo the summary delimiter logic Now that we have a proper page parse tree, this can be greatly simplified. See #5324 2018-10-19 05:30:57 -04:00			`return i.Type != tError && i.Type != tEOF`
parser/pageparser: File renames and splitting See #5324 2018-10-17 07:16:45 -04:00			`}`

			`// look at, but do not consume, the next item`
			`// repeated, sequential calls will return the same item`
hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`func (t *Iterator) Peek() Item {`
			`return t.l.items[t.lastPos+1]`
parser/pageparser: File renames and splitting See #5324 2018-10-17 07:16:45 -04:00			`}`

hugolib: Redo the summary delimiter logic Now that we have a proper page parse tree, this can be greatly simplified. See #5324 2018-10-19 05:30:57 -04:00			`// PeekWalk will feed the next items in the iterator to walkFn`
			`// until it returns false.`
			`func (t *Iterator) PeekWalk(walkFn func(item Item) bool) {`
Convert the rest to new page parser code paths And remove some now unused code. See #5324 2018-10-20 11:38:49 -04:00			`for i := t.lastPos + 1; i < len(t.l.items); i++ {`
hugolib: Redo the summary delimiter logic Now that we have a proper page parse tree, this can be greatly simplified. See #5324 2018-10-19 05:30:57 -04:00			`item := t.l.items[i]`
			`if !walkFn(item) {`
			`break`
			`}`
			`}`
			`}`

parser/pageparser: File renames and splitting See #5324 2018-10-17 07:16:45 -04:00			`// Consume is a convencience method to consume the next n tokens,`
			`// but back off Errors and EOF.`
hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`func (t *Iterator) Consume(cnt int) {`
parser/pageparser: File renames and splitting See #5324 2018-10-17 07:16:45 -04:00			`for i := 0; i < cnt; i++ {`
			`token := t.Next()`
hugolib: Redo the summary delimiter logic Now that we have a proper page parse tree, this can be greatly simplified. See #5324 2018-10-19 05:30:57 -04:00			`if token.Type == tError \|\| token.Type == tEOF {`
parser/pageparser: File renames and splitting See #5324 2018-10-17 07:16:45 -04:00			`t.Backup()`
			`break`
			`}`
			`}`
			`}`

			`// LineNumber returns the current line number. Used for logging.`
hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`func (t *Iterator) LineNumber() int {`
parser/metadecoders: Consolidate the metadata decoders See #5324 2018-10-20 05:16:18 -04:00			`return bytes.Count(t.l.input[:t.current().Pos], lf) + 1`
parser/pageparser: File renames and splitting See #5324 2018-10-17 07:16:45 -04:00			`}`