hugo/parser/pageparser/pageparser.go
Bjørn Erik Pedersen eada236f87
Introduce a tree map for all content
This commit introduces a new data structure to store pages and their resources.

This data structure is backed by radix trees.
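
To illustrate the idea (a sketch only, not the actual implementation — the paths, values and the use of github.com/armon/go-radix below are assumptions made for this example), keying pages and their resources by path means a single prefix walk can collect a bundle together with everything below it:

```go
package main

import (
	"fmt"

	radix "github.com/armon/go-radix"
)

func main() {
	// Hypothetical keys/values; a real content tree would store richer node types.
	tree := radix.New()
	tree.Insert("/blog/my-bundle", "page: my-bundle")
	tree.Insert("/blog/my-bundle/sunset.jpg", "resource: sunset.jpg")
	tree.Insert("/blog/other-post", "page: other-post")

	// One prefix walk finds the bundle page and all of its resources.
	tree.WalkPrefix("/blog/my-bundle", func(key string, v interface{}) bool {
		fmt.Println(key, "=>", v)
		return false // returning false continues the walk
	})
}
```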

This simplifies tree operations, makes all pages a bundle, and paves the way for #6310.

It also solves a set of annoying issues (see list below).

While not a motivation behind this change, this commit also makes Hugo in general a little faster and more memory efficient (see benchmarks below), especially for partial rebuilds on content edits, but also when taxonomies are in use.

```
name                                   old time/op    new time/op    delta
SiteNew/Bundle_with_image/Edit-16        1.32ms ± 8%    1.00ms ± 9%  -24.42%  (p=0.029 n=4+4)
SiteNew/Bundle_with_JSON_file/Edit-16    1.28ms ± 0%    0.94ms ± 0%  -26.26%  (p=0.029 n=4+4)
SiteNew/Tags_and_categories/Edit-16      33.9ms ± 2%    21.8ms ± 1%  -35.67%  (p=0.029 n=4+4)
SiteNew/Canonify_URLs/Edit-16            40.6ms ± 1%    37.7ms ± 3%   -7.20%  (p=0.029 n=4+4)
SiteNew/Deep_content_tree/Edit-16        56.7ms ± 0%    51.7ms ± 1%   -8.82%  (p=0.029 n=4+4)
SiteNew/Many_HTML_templates/Edit-16      19.9ms ± 2%    18.3ms ± 3%   -7.64%  (p=0.029 n=4+4)
SiteNew/Page_collections/Edit-16         37.9ms ± 4%    34.0ms ± 2%  -10.28%  (p=0.029 n=4+4)
SiteNew/Bundle_with_image-16             10.7ms ± 0%    10.6ms ± 0%   -1.15%  (p=0.029 n=4+4)
SiteNew/Bundle_with_JSON_file-16         10.8ms ± 0%    10.7ms ± 0%   -1.05%  (p=0.029 n=4+4)
SiteNew/Tags_and_categories-16           43.2ms ± 1%    39.6ms ± 1%   -8.35%  (p=0.029 n=4+4)
SiteNew/Canonify_URLs-16                 47.6ms ± 1%    47.3ms ± 0%     ~     (p=0.057 n=4+4)
SiteNew/Deep_content_tree-16             73.0ms ± 1%    74.2ms ± 1%     ~     (p=0.114 n=4+4)
SiteNew/Many_HTML_templates-16           37.9ms ± 0%    38.1ms ± 1%     ~     (p=0.114 n=4+4)
SiteNew/Page_collections-16              53.6ms ± 1%    54.7ms ± 1%   +2.09%  (p=0.029 n=4+4)

name                                   old alloc/op   new alloc/op   delta
SiteNew/Bundle_with_image/Edit-16         486kB ± 0%     430kB ± 0%  -11.47%  (p=0.029 n=4+4)
SiteNew/Bundle_with_JSON_file/Edit-16     265kB ± 0%     209kB ± 0%  -21.06%  (p=0.029 n=4+4)
SiteNew/Tags_and_categories/Edit-16      13.6MB ± 0%     8.8MB ± 0%  -34.93%  (p=0.029 n=4+4)
SiteNew/Canonify_URLs/Edit-16            66.5MB ± 0%    63.9MB ± 0%   -3.95%  (p=0.029 n=4+4)
SiteNew/Deep_content_tree/Edit-16        28.8MB ± 0%    25.8MB ± 0%  -10.55%  (p=0.029 n=4+4)
SiteNew/Many_HTML_templates/Edit-16      6.16MB ± 0%    5.56MB ± 0%   -9.86%  (p=0.029 n=4+4)
SiteNew/Page_collections/Edit-16         16.9MB ± 0%    16.0MB ± 0%   -5.19%  (p=0.029 n=4+4)
SiteNew/Bundle_with_image-16             2.28MB ± 0%    2.29MB ± 0%   +0.35%  (p=0.029 n=4+4)
SiteNew/Bundle_with_JSON_file-16         2.07MB ± 0%    2.07MB ± 0%     ~     (p=0.114 n=4+4)
SiteNew/Tags_and_categories-16           14.3MB ± 0%    13.2MB ± 0%   -7.30%  (p=0.029 n=4+4)
SiteNew/Canonify_URLs-16                 69.1MB ± 0%    69.0MB ± 0%     ~     (p=0.343 n=4+4)
SiteNew/Deep_content_tree-16             31.3MB ± 0%    31.8MB ± 0%   +1.49%  (p=0.029 n=4+4)
SiteNew/Many_HTML_templates-16           10.8MB ± 0%    10.9MB ± 0%   +1.11%  (p=0.029 n=4+4)
SiteNew/Page_collections-16              21.4MB ± 0%    21.6MB ± 0%   +1.15%  (p=0.029 n=4+4)

name                                   old allocs/op  new allocs/op  delta
SiteNew/Bundle_with_image/Edit-16         4.74k ± 0%     3.86k ± 0%  -18.57%  (p=0.029 n=4+4)
SiteNew/Bundle_with_JSON_file/Edit-16     4.73k ± 0%     3.85k ± 0%  -18.58%  (p=0.029 n=4+4)
SiteNew/Tags_and_categories/Edit-16        301k ± 0%      198k ± 0%  -34.14%  (p=0.029 n=4+4)
SiteNew/Canonify_URLs/Edit-16              389k ± 0%      373k ± 0%   -4.07%  (p=0.029 n=4+4)
SiteNew/Deep_content_tree/Edit-16          338k ± 0%      262k ± 0%  -22.63%  (p=0.029 n=4+4)
SiteNew/Many_HTML_templates/Edit-16        102k ± 0%       88k ± 0%  -13.81%  (p=0.029 n=4+4)
SiteNew/Page_collections/Edit-16           176k ± 0%      152k ± 0%  -13.32%  (p=0.029 n=4+4)
SiteNew/Bundle_with_image-16              26.8k ± 0%     26.8k ± 0%   +0.05%  (p=0.029 n=4+4)
SiteNew/Bundle_with_JSON_file-16          26.8k ± 0%     26.8k ± 0%   +0.05%  (p=0.029 n=4+4)
SiteNew/Tags_and_categories-16             273k ± 0%      245k ± 0%  -10.36%  (p=0.029 n=4+4)
SiteNew/Canonify_URLs-16                   396k ± 0%      398k ± 0%   +0.39%  (p=0.029 n=4+4)
SiteNew/Deep_content_tree-16               317k ± 0%      325k ± 0%   +2.53%  (p=0.029 n=4+4)
SiteNew/Many_HTML_templates-16             146k ± 0%      147k ± 0%   +0.98%  (p=0.029 n=4+4)
SiteNew/Page_collections-16                210k ± 0%      215k ± 0%   +2.44%  (p=0.029 n=4+4)
```

Fixes #6312
Fixes #6087
Fixes #6738
Fixes #6412
Fixes #6743
Fixes #6875
Fixes #6034
Fixes #6902
Fixes #6173
Fixes #6590
2020-02-18 09:49:42 +01:00


// Copyright 2019 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package pageparser provides a parser for Hugo content files (Markdown, HTML etc.).
// This implementation is highly inspired by the great talk given by Rob Pike called "Lexical Scanning in Go".
// It's on YouTube, Google it!
// See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
package pageparser

import (
	"bytes"
	"io"
	"io/ioutil"

	"github.com/gohugoio/hugo/parser/metadecoders"
	"github.com/pkg/errors"
)

// Result holds the parse result.
type Result interface {
	// Iterator returns a new Iterator positioned at the beginning of the parse tree.
	Iterator() *Iterator

	// Input returns the input to Parse.
	Input() []byte
}

var _ Result = (*pageLexer)(nil)

// Parse parses the page in the given reader according to the given Config.
//
// TODO(bep) now that we have improved the "lazy order" init, there *may* be
// some potential saving in doing a buffered approach where the first pass does
// the frontmatter only.
func Parse(r io.Reader, cfg Config) (Result, error) {
	return parseSection(r, cfg, lexIntroSection)
}

// ContentFrontMatter holds the content and the parsed front matter of a page.
type ContentFrontMatter struct {
	Content           []byte
	FrontMatter       map[string]interface{}
	FrontMatterFormat metadecoders.Format
}

// ParseFrontMatterAndContent is a convenience method to extract front matter
// and content from a content page.
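//
// A rough usage sketch (illustrative only; the TOML front matter below is made up):
//
//	page := strings.NewReader("+++\ntitle = \"Hello\"\n+++\nSome **Markdown** content.")
//	cf, err := ParseFrontMatterAndContent(page)
//	if err != nil {
//		// handle the error
//	}
//	fmt.Println(cf.FrontMatterFormat)    // toml
//	fmt.Println(cf.FrontMatter["title"]) // Hello
//	fmt.Println(string(cf.Content))      // Some **Markdown** content.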
func ParseFrontMatterAndContent(r io.Reader) (ContentFrontMatter, error) {
	var cf ContentFrontMatter

	psr, err := Parse(r, Config{})
	if err != nil {
		return cf, err
	}

	var frontMatterSource []byte

	iter := psr.Iterator()

	walkFn := func(item Item) bool {
		if frontMatterSource != nil {
			// The rest is content.
			cf.Content = psr.Input()[item.Pos:]
			// Done
			return false
		} else if item.IsFrontMatter() {
			cf.FrontMatterFormat = FormatFromFrontMatterType(item.Type)
			frontMatterSource = item.Val
		}
		return true
	}

	iter.PeekWalk(walkFn)

	cf.FrontMatter, err = metadecoders.Default.UnmarshalToMap(frontMatterSource, cf.FrontMatterFormat)

	return cf, err
}

// FormatFromFrontMatterType returns the metadecoders.Format that corresponds to
// the given front matter item type, or the empty Format if there is no match.
func FormatFromFrontMatterType(typ ItemType) metadecoders.Format {
	switch typ {
	case TypeFrontMatterJSON:
		return metadecoders.JSON
	case TypeFrontMatterORG:
		return metadecoders.ORG
	case TypeFrontMatterTOML:
		return metadecoders.TOML
	case TypeFrontMatterYAML:
		return metadecoders.YAML
	default:
		return ""
	}
}

// ParseMain parses starting with the main section. Used in tests.
func ParseMain(r io.Reader, cfg Config) (Result, error) {
	return parseSection(r, cfg, lexMainSection)
}

func parseSection(r io.Reader, cfg Config, start stateFunc) (Result, error) {
	b, err := ioutil.ReadAll(r)
	if err != nil {
		return nil, errors.Wrap(err, "failed to read page content")
	}
	return parseBytes(b, cfg, start)
}

func parseBytes(b []byte, cfg Config, start stateFunc) (Result, error) {
	lexer := newPageLexer(b, start, cfg)
	lexer.run()
	return lexer, nil
}

// An Iterator has methods to iterate a parsed page with support for going back
// if needed.
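//
// A rough usage sketch (illustrative only; src stands in for some page source):
//
//	res, err := Parse(strings.NewReader(src), Config{})
//	if err != nil {
//		// handle the error
//	}
//	it := res.Iterator()
//	for it.IsValueNext() {
//		item := it.Next()
//		// inspect item, e.g. item.IsFrontMatter()
//	}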
type Iterator struct {
	l       *pageLexer
	lastPos int // position of the last item returned by nextItem
}

// Next consumes and returns the next item.
func (t *Iterator) Next() Item {
	t.lastPos++
	return t.Current()
}

// Input returns the input source.
func (t *Iterator) Input() []byte {
	return t.l.Input()
}

var errIndexOutOfBounds = Item{tError, 0, []byte("no more tokens"), true}

// Current will repeatedly return the current item.
func (t *Iterator) Current() Item {
	if t.lastPos >= len(t.l.items) {
		return errIndexOutOfBounds
	}
	return t.l.items[t.lastPos]
}

// Backup backs up one token.
func (t *Iterator) Backup() {
	if t.lastPos < 0 {
		panic("need to go forward before going back")
	}
	t.lastPos--
}

// IsValueNext reports whether the next item is a value (non-error, non-EOF) type.
func (t *Iterator) IsValueNext() bool {
	i := t.Peek()
	return i.Type != tError && i.Type != tEOF
}

// Peek looks at, but does not consume, the next item.
// Repeated, sequential calls will return the same item.
func (t *Iterator) Peek() Item {
	return t.l.items[t.lastPos+1]
}

// PeekWalk will feed the next items in the iterator to walkFn
// until it returns false.
func (t *Iterator) PeekWalk(walkFn func(item Item) bool) {
	for i := t.lastPos + 1; i < len(t.l.items); i++ {
		item := t.l.items[i]
		if !walkFn(item) {
			break
		}
	}
}

// Consume is a convenience method to consume the next n tokens,
// but it backs off on errors and EOF.
func (t *Iterator) Consume(cnt int) {
	for i := 0; i < cnt; i++ {
		token := t.Next()
		if token.Type == tError || token.Type == tEOF {
			t.Backup()
			break
		}
	}
}

// LineNumber returns the current line number. Used for logging.
func (t *Iterator) LineNumber() int {
	return bytes.Count(t.l.input[:t.Current().Pos], lf) + 1
}