2019-01-02 11:33:26 +00:00
|
|
|
// Copyright 2019 The Hugo Authors. All rights reserved.
|
2018-10-17 11:16:45 +00:00
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
package pageparser
|
|
|
|
|
2018-10-18 08:21:23 +00:00
|
|
|
import (
|
|
|
|
"bytes"
|
2022-07-07 14:11:47 +00:00
|
|
|
"errors"
|
2022-05-02 14:07:52 +00:00
|
|
|
"fmt"
|
2018-10-18 08:21:23 +00:00
|
|
|
"io"
|
2023-01-26 09:30:25 +00:00
|
|
|
"regexp"
|
|
|
|
"strings"
|
2018-10-18 08:21:23 +00:00
|
|
|
|
2019-09-10 09:26:34 +00:00
|
|
|
"github.com/gohugoio/hugo/parser/metadecoders"
|
2018-10-18 08:21:23 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// Result holds the parse result.
|
|
|
|
type Result interface {
|
Move the emoji parsing to pageparser
This avoids double parsing the page content when `enableEmoji=true`.
This commit also adds some general improvements to the parser, making it in general much faster:
```bash
benchmark old ns/op new ns/op delta
BenchmarkShortcodeLexer-4 90258 101730 +12.71%
BenchmarkParse-4 148940 15037 -89.90%
benchmark old allocs new allocs delta
BenchmarkShortcodeLexer-4 456 700 +53.51%
BenchmarkParse-4 28 33 +17.86%
benchmark old bytes new bytes delta
BenchmarkShortcodeLexer-4 69875 81014 +15.94%
BenchmarkParse-4 8128 8304 +2.17%
```
Running some site benchmarks with Emoji support turned on:
```bash
benchmark old ns/op new ns/op delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 924556797 818115620 -11.51%
benchmark old allocs new allocs delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 4112613 4133787 +0.51%
benchmark old bytes new bytes delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 426982864 424363832 -0.61%
```
Fixes #5534
2018-12-17 20:03:23 +00:00
|
|
|
// Iterator returns a new Iterator positioned at the beginning of the parse tree.
|
2018-10-18 08:21:23 +00:00
|
|
|
Iterator() *Iterator
|
|
|
|
// Input returns the input to Parse.
|
|
|
|
Input() []byte
|
2018-10-18 07:47:39 +00:00
|
|
|
}
|
|
|
|
|
2018-10-18 08:21:23 +00:00
|
|
|
// Compile-time assertion that *pageLexer satisfies the Result interface.
var _ Result = (*pageLexer)(nil)
|
|
|
|
|
Move the emoji parsing to pageparser
This avoids double parsing the page content when `enableEmoji=true`.
This commit also adds some general improvements to the parser, making it in general much faster:
```bash
benchmark old ns/op new ns/op delta
BenchmarkShortcodeLexer-4 90258 101730 +12.71%
BenchmarkParse-4 148940 15037 -89.90%
benchmark old allocs new allocs delta
BenchmarkShortcodeLexer-4 456 700 +53.51%
BenchmarkParse-4 28 33 +17.86%
benchmark old bytes new bytes delta
BenchmarkShortcodeLexer-4 69875 81014 +15.94%
BenchmarkParse-4 8128 8304 +2.17%
```
Running some site benchmarks with Emoji support turned on:
```bash
benchmark old ns/op new ns/op delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 924556797 818115620 -11.51%
benchmark old allocs new allocs delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 4112613 4133787 +0.51%
benchmark old bytes new bytes delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 426982864 424363832 -0.61%
```
Fixes #5534
2018-12-17 20:03:23 +00:00
|
|
|
// Parse parses the page in the given reader according to the given Config.
|
|
|
|
func Parse(r io.Reader, cfg Config) (Result, error) {
|
2019-01-02 11:33:26 +00:00
|
|
|
return parseSection(r, cfg, lexIntroSection)
|
|
|
|
}
|
|
|
|
|
2019-09-10 09:26:34 +00:00
|
|
|
type ContentFrontMatter struct {
|
|
|
|
Content []byte
|
2022-03-17 21:03:27 +00:00
|
|
|
FrontMatter map[string]any
|
2019-09-10 09:26:34 +00:00
|
|
|
FrontMatterFormat metadecoders.Format
|
|
|
|
}
|
|
|
|
|
|
|
|
// ParseFrontMatterAndContent is a convenience method to extract front matter
|
|
|
|
// and content from a content page.
|
|
|
|
func ParseFrontMatterAndContent(r io.Reader) (ContentFrontMatter, error) {
|
|
|
|
var cf ContentFrontMatter
|
|
|
|
|
|
|
|
psr, err := Parse(r, Config{})
|
|
|
|
if err != nil {
|
|
|
|
return cf, err
|
|
|
|
}
|
|
|
|
|
|
|
|
var frontMatterSource []byte
|
|
|
|
|
|
|
|
iter := psr.Iterator()
|
|
|
|
|
|
|
|
walkFn := func(item Item) bool {
|
|
|
|
if frontMatterSource != nil {
|
|
|
|
// The rest is content.
|
2022-07-07 14:11:47 +00:00
|
|
|
cf.Content = psr.Input()[item.low:]
|
2019-09-10 09:26:34 +00:00
|
|
|
// Done
|
|
|
|
return false
|
|
|
|
} else if item.IsFrontMatter() {
|
|
|
|
cf.FrontMatterFormat = FormatFromFrontMatterType(item.Type)
|
2022-07-07 14:11:47 +00:00
|
|
|
frontMatterSource = item.Val(psr.Input())
|
2019-09-10 09:26:34 +00:00
|
|
|
}
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
iter.PeekWalk(walkFn)
|
|
|
|
|
|
|
|
cf.FrontMatter, err = metadecoders.Default.UnmarshalToMap(frontMatterSource, cf.FrontMatterFormat)
|
|
|
|
return cf, err
|
|
|
|
}
|
|
|
|
|
|
|
|
func FormatFromFrontMatterType(typ ItemType) metadecoders.Format {
|
|
|
|
switch typ {
|
|
|
|
case TypeFrontMatterJSON:
|
|
|
|
return metadecoders.JSON
|
|
|
|
case TypeFrontMatterORG:
|
|
|
|
return metadecoders.ORG
|
|
|
|
case TypeFrontMatterTOML:
|
|
|
|
return metadecoders.TOML
|
|
|
|
case TypeFrontMatterYAML:
|
|
|
|
return metadecoders.YAML
|
|
|
|
default:
|
|
|
|
return ""
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-01-02 11:33:26 +00:00
|
|
|
// ParseMain parses starting with the main section. Used in tests.
|
|
|
|
func ParseMain(r io.Reader, cfg Config) (Result, error) {
|
|
|
|
return parseSection(r, cfg, lexMainSection)
|
|
|
|
}
|
|
|
|
|
|
|
|
func parseSection(r io.Reader, cfg Config, start stateFunc) (Result, error) {
|
2023-02-18 22:43:26 +00:00
|
|
|
b, err := io.ReadAll(r)
|
2018-10-18 08:21:23 +00:00
|
|
|
if err != nil {
|
2022-05-02 14:07:52 +00:00
|
|
|
return nil, fmt.Errorf("failed to read page content: %w", err)
|
2018-10-18 08:21:23 +00:00
|
|
|
}
|
2019-01-02 11:33:26 +00:00
|
|
|
return parseBytes(b, cfg, start)
|
2018-12-19 19:07:49 +00:00
|
|
|
}
|
|
|
|
|
2019-01-02 11:33:26 +00:00
|
|
|
func parseBytes(b []byte, cfg Config, start stateFunc) (Result, error) {
|
|
|
|
lexer := newPageLexer(b, start, cfg)
|
2018-10-18 07:47:39 +00:00
|
|
|
lexer.run()
|
2018-10-18 08:21:23 +00:00
|
|
|
return lexer, nil
|
2018-10-18 07:47:39 +00:00
|
|
|
}
|
2018-10-17 11:16:45 +00:00
|
|
|
|
2022-07-07 14:11:47 +00:00
|
|
|
// NewIterator creates a new Iterator.
|
|
|
|
func NewIterator(items Items) *Iterator {
|
|
|
|
return &Iterator{items: items, lastPos: -1}
|
|
|
|
}
|
|
|
|
|
2018-10-18 08:21:23 +00:00
|
|
|
// An Iterator has methods to iterate a parsed page with support going back
|
|
|
|
// if needed.
|
|
|
|
type Iterator struct {
|
2022-07-07 14:11:47 +00:00
|
|
|
items Items
|
2018-10-20 15:38:49 +00:00
|
|
|
lastPos int // position of the last item returned by nextItem
|
2018-10-17 11:16:45 +00:00
|
|
|
}
|
|
|
|
|
2018-10-18 08:21:23 +00:00
|
|
|
// consumes and returns the next item
|
|
|
|
func (t *Iterator) Next() Item {
|
|
|
|
t.lastPos++
|
2019-01-02 11:33:26 +00:00
|
|
|
return t.Current()
|
2018-10-17 11:16:45 +00:00
|
|
|
}
|
|
|
|
|
2022-07-07 14:11:47 +00:00
|
|
|
// errIndexOutOfBounds is the error Item handed out by the Iterator when it is
// asked for an item outside the range of available items.
var errIndexOutOfBounds = Item{Type: tError, Err: errors.New("no more tokens")}
|
2018-10-18 08:21:23 +00:00
|
|
|
|
2019-01-02 11:33:26 +00:00
|
|
|
// Current will repeatably return the current item.
|
|
|
|
func (t *Iterator) Current() Item {
|
2022-07-07 14:11:47 +00:00
|
|
|
if t.lastPos >= len(t.items) {
|
2018-10-18 08:21:23 +00:00
|
|
|
return errIndexOutOfBounds
|
|
|
|
}
|
2022-07-07 14:11:47 +00:00
|
|
|
return t.items[t.lastPos]
|
2018-10-17 11:16:45 +00:00
|
|
|
}
|
|
|
|
|
2018-10-18 08:21:23 +00:00
|
|
|
// backs up one token.
|
|
|
|
func (t *Iterator) Backup() {
|
|
|
|
if t.lastPos < 0 {
|
|
|
|
panic("need to go forward before going back")
|
|
|
|
}
|
|
|
|
t.lastPos--
|
2018-10-17 11:16:45 +00:00
|
|
|
}
|
|
|
|
|
2022-05-28 11:18:50 +00:00
|
|
|
// Pos returns the current position in the input.
|
|
|
|
func (t *Iterator) Pos() int {
|
|
|
|
return t.lastPos
|
|
|
|
}
|
|
|
|
|
2018-10-17 11:16:45 +00:00
|
|
|
// check for non-error and non-EOF types coming next
|
2018-10-18 08:21:23 +00:00
|
|
|
func (t *Iterator) IsValueNext() bool {
|
2018-10-17 11:16:45 +00:00
|
|
|
i := t.Peek()
|
2018-10-19 09:30:57 +00:00
|
|
|
return i.Type != tError && i.Type != tEOF
|
2018-10-17 11:16:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// look at, but do not consume, the next item
|
|
|
|
// repeated, sequential calls will return the same item
|
2018-10-18 08:21:23 +00:00
|
|
|
func (t *Iterator) Peek() Item {
|
2022-07-07 14:11:47 +00:00
|
|
|
return t.items[t.lastPos+1]
|
2018-10-17 11:16:45 +00:00
|
|
|
}
|
|
|
|
|
2018-10-19 09:30:57 +00:00
|
|
|
// PeekWalk will feed the next items in the iterator to walkFn
|
|
|
|
// until it returns false.
|
|
|
|
func (t *Iterator) PeekWalk(walkFn func(item Item) bool) {
|
2022-07-07 14:11:47 +00:00
|
|
|
for i := t.lastPos + 1; i < len(t.items); i++ {
|
|
|
|
item := t.items[i]
|
2018-10-19 09:30:57 +00:00
|
|
|
if !walkFn(item) {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-12-16 11:11:32 +00:00
|
|
|
// Consume is a convenience method to consume the next n tokens,
|
2018-10-17 11:16:45 +00:00
|
|
|
// but back off Errors and EOF.
|
2018-10-18 08:21:23 +00:00
|
|
|
func (t *Iterator) Consume(cnt int) {
|
2018-10-17 11:16:45 +00:00
|
|
|
for i := 0; i < cnt; i++ {
|
|
|
|
token := t.Next()
|
2018-10-19 09:30:57 +00:00
|
|
|
if token.Type == tError || token.Type == tEOF {
|
2018-10-17 11:16:45 +00:00
|
|
|
t.Backup()
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// LineNumber returns the current line number. Used for logging.
|
2022-07-07 14:11:47 +00:00
|
|
|
func (t *Iterator) LineNumber(source []byte) int {
|
|
|
|
return bytes.Count(source[:t.Current().low], lf) + 1
|
|
|
|
}
|
|
|
|
|
|
|
|
// IsProbablySourceOfItems returns true if the given source looks like original
|
|
|
|
// source of the items.
|
|
|
|
// There may be some false positives, but that is highly unlikely and good enough
|
|
|
|
// for the planned purpose.
|
|
|
|
// It will also return false if the last item is not EOF (error situations) and
|
|
|
|
// true if both source and items are empty.
|
|
|
|
func IsProbablySourceOfItems(source []byte, items Items) bool {
|
|
|
|
if len(source) == 0 && len(items) == 0 {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
if len(items) == 0 {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
last := items[len(items)-1]
|
|
|
|
if last.Type != tEOF {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
if last.Pos() != len(source) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, item := range items {
|
|
|
|
if item.Type == tError {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
if item.Type == tEOF {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
if item.Pos() >= len(source) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
if item.firstByte != source[item.Pos()] {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return true
|
2018-10-17 11:16:45 +00:00
|
|
|
}
|
2023-01-26 09:30:25 +00:00
|
|
|
|
|
|
|
// hasShortcodeRe matches an opening shortcode delimiter, "{{<" or "{{%",
// followed by any character other than "/". The "/" exclusion is presumably
// there so that the commented-out forms "{{</*" and "{{%/*" are not reported.
//
// The character class deliberately lists only "%" and "<"; a comma inside a
// class is a literal comma, not a separator.
var hasShortcodeRe = regexp.MustCompile(`\{\{[%<][^/]`)

// HasShortcode reports whether s contains something that looks like a
// shortcode, i.e. "{{<" or "{{%" followed by a character other than "/".
func HasShortcode(s string) bool {
	// Fast path for the common case: no "{{" means no shortcode, and a
	// substring search is cheaper than running the regular expression.
	if !strings.Contains(s, "{{") {
		return false
	}

	return hasShortcodeRe.MatchString(s)
}
|