hugo/parser/pageparser/pageparser_intro_test.go

// Copyright 2018 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pageparser

import (
	"fmt"
	"reflect"
	"strings"
	"testing"
)

// lexerTest is one table-driven lexer case: a label, the raw text fed to the
// lexer, and the items the lexer is expected to produce for it.
// The field order matters: the test tables use positional literals.
type lexerTest struct {
	name, input string // label shown in failure output; raw document to lex
	items       []Item // expected items, in order
}

// nti builds an Item for use in expected-result tables: its type is tp and its
// value is the bytes of val. The second and fourth positional fields of the
// Item are always set to 0 and false.
func nti(tp ItemType, val string) Item {
	raw := []byte(val)
	return Item{tp, 0, raw, false}
}

// Shared fixtures for the front matter lexer tests.
//
// tstJSON is a JSON document whose nested string value contains escaped quotes
// and a closing brace (presumably to check that a brace inside a string is not
// taken as the end of the object -- confirm against the lexer).
//
// The tstFrontMatter* values are the expected front matter items for the
// YAML, TOML and JSON inputs used in frontMatterTests; note that the CRLF and
// JSON variants keep the "\r\n" bytes in the item value. tstSomeText,
// tstSummaryDivider and tstNewline are the expected items for the body text,
// the "<!--more-->" divider line and a lone newline.
var (
	tstJSON                = `{ "a": { "b": "\"Hugo\"}" } }`
	tstFrontMatterTOML     = nti(TypeFrontMatterTOML, "foo = \"bar\"\n")
	tstFrontMatterYAML     = nti(TypeFrontMatterYAML, "foo: \"bar\"\n")
	tstFrontMatterYAMLCRLF = nti(TypeFrontMatterYAML, "foo: \"bar\"\r\n")
	tstFrontMatterJSON     = nti(TypeFrontMatterJSON, tstJSON+"\r\n")
	tstSomeText            = nti(tText, "\nSome text.\n")
	tstSummaryDivider      = nti(TypeLeadSummaryDivider, "<!--more-->\n")
	tstNewline             = nti(tText, "\n")

	// tstORG is front matter written as "#+KEY: value" lines. The raw string
	// starts with a newline, which is part of the fixture.
	tstORG = `
#+TITLE: T1
#+AUTHOR: A1
#+DESCRIPTION: D1
`
	tstFrontMatterORG = nti(TypeFrontMatterORG, tstORG)
)

// crLfReplacer makes line endings visible in test failure output by rewriting
// every carriage return as "#" and every line feed as "$".
var crLfReplacer = strings.NewReplacer(
	"\r", "#",
	"\n", "$",
)

// frontMatterTests is the table consumed by TestFrontMatter. Each entry pairs
// an input document with the exact sequence of items expected when it is lexed
// starting from lexIntroSection. Successful cases end with tstEOF; error cases
// end with the tError item, because collection stops at the first error.
//
// TODO(bep) a way to toggle ORG mode vs the rest.
var frontMatterTests = []lexerTest{
	{"empty", "", []Item{tstEOF}},
	{"Byte order mark", "\ufeff\nSome text.\n", []Item{nti(TypeIgnore, "\ufeff"), tstSomeText, tstEOF}},
	{"HTML Document", `  <html>  `, []Item{nti(tError, "plain HTML documents not supported")}},
	{"HTML Document with shortcode", `<html>{{< sc1 >}}</html>`, []Item{nti(tError, "plain HTML documents not supported")}},
	{"No front matter", "\nSome text.\n", []Item{tstSomeText, tstEOF}},
	{"YAML front matter", "---\nfoo: \"bar\"\n---\n\nSome text.\n", []Item{tstFrontMatterYAML, tstSomeText, tstEOF}},
	{"YAML empty front matter", "---\n---\n\nSome text.\n", []Item{nti(TypeFrontMatterYAML, ""), tstSomeText, tstEOF}},
	{"YAML commented out front matter", "<!--\n---\nfoo: \"bar\"\n---\n-->\nSome text.\n", []Item{nti(TypeIgnore, "<!--\n"), tstFrontMatterYAML, nti(TypeIgnore, "-->"), tstSomeText, tstEOF}},
	{"YAML commented out front matter, no end", "<!--\n---\nfoo: \"bar\"\n---\nSome text.\n", []Item{nti(TypeIgnore, "<!--\n"), tstFrontMatterYAML, nti(tError, "starting HTML comment with no end")}},
	// Note that we keep all bytes as they are, but we need to handle CRLF
	{"YAML front matter CRLF", "---\r\nfoo: \"bar\"\r\n---\n\nSome text.\n", []Item{tstFrontMatterYAMLCRLF, tstSomeText, tstEOF}},
	{"TOML front matter", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n", []Item{tstFrontMatterTOML, tstSomeText, tstEOF}},
	{"JSON front matter", tstJSON + "\r\n\nSome text.\n", []Item{tstFrontMatterJSON, tstSomeText, tstEOF}},
	{"ORG front matter", tstORG + "\nSome text.\n", []Item{tstFrontMatterORG, tstSomeText, tstEOF}},
	{"Summary divider ORG", tstORG + "\nSome text.\n# more\nSome text.\n", []Item{tstFrontMatterORG, tstSomeText, nti(TypeLeadSummaryDivider, "# more\n"), nti(tText, "Some text.\n"), tstEOF}},
	{"Summary divider", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n<!--more-->\nSome text.\n", []Item{tstFrontMatterTOML, tstSomeText, tstSummaryDivider, nti(tText, "Some text.\n"), tstEOF}},
	{"Summary divider same line", "+++\nfoo = \"bar\"\n+++\n\nSome text.<!--more-->Some text.\n", []Item{tstFrontMatterTOML, nti(tText, "\nSome text."), nti(TypeLeadSummaryDivider, "<!--more-->"), nti(tText, "Some text.\n"), tstEOF}},
	// https://github.com/gohugoio/hugo/issues/5402
	{"Summary and shortcode, no space", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n<!--more-->{{< sc1 >}}\nSome text.\n", []Item{tstFrontMatterTOML, tstSomeText, nti(TypeLeadSummaryDivider, "<!--more-->"), tstLeftNoMD, tstSC1, tstRightNoMD, tstSomeText, tstEOF}},
	// https://github.com/gohugoio/hugo/issues/5464
	{"Summary and shortcode only", "+++\nfoo = \"bar\"\n+++\n{{< sc1 >}}\n<!--more-->\n{{< sc2 >}}", []Item{tstFrontMatterTOML, tstLeftNoMD, tstSC1, tstRightNoMD, tstNewline, tstSummaryDivider, tstLeftNoMD, tstSC2, tstRightNoMD, tstEOF}},
}

// TestFrontMatter lexes every input in frontMatterTests, starting in the
// lexIntroSection state, and reports each case whose collected items differ
// from the expected ones.
func TestFrontMatter(t *testing.T) {
	t.Parallel()
	for idx := range frontMatterTests {
		tc := frontMatterTests[idx]
		actual := collect([]byte(tc.input), false, lexIntroSection)
		if equal(actual, tc.items) {
			continue
		}
		// Render CR/LF as visible markers so line-ending differences show up
		// in the failure message.
		gotStr := crLfReplacer.Replace(fmt.Sprint(actual))
		wantStr := crLfReplacer.Replace(fmt.Sprint(tc.items))
		t.Errorf("[%d] %s: got\n\t%v\nexpected\n\t%v", idx, tc.name, gotStr, wantStr)
	}
}

// collectWithConfig creates a page lexer for input with the given start state
// and cfg, runs it, and returns the items read from the lexer's iterator up to
// and including the first tEOF or tError item.
//
// skipFrontMatter is accepted but not used by this function.
func collectWithConfig(input []byte, skipFrontMatter bool, stateStart stateFunc, cfg Config) (items []Item) {
	lexer := newPageLexer(input, stateStart, cfg)
	lexer.run()

	iter := lexer.newIterator()
	for {
		next := iter.Next()
		items = append(items, next)
		// The terminating item is kept in the result before stopping.
		switch next.Type {
		case tEOF, tError:
			return items
		}
	}
}

// collect is collectWithConfig called with a zero-value Config; all other
// arguments are passed through unchanged.
func collect(input []byte, skipFrontMatter bool, stateStart stateFunc) (items []Item) {
	var defaults Config // zero value
	items = collectWithConfig(input, skipFrontMatter, stateStart, defaults)
	return items
}

// equal reports whether i1 and i2 have the same length and agree, element by
// element, on Type and Val. No other fields are compared (no positional
// checking, for now ...).
func equal(i1, i2 []Item) bool {
	if len(i1) != len(i2) {
		return false
	}
	for idx, left := range i1 {
		right := i2[idx]
		if left.Type != right.Type || !reflect.DeepEqual(left.Val, right.Val) {
			return false
		}
	}
	return true
}
parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00			`// Copyright 2018 The Hugo Authors. All rights reserved.`
			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`

			`package pageparser`

			`import (`
			`"fmt"`
parser/pageparser: Use []byte in page lexer See #5324 2018-10-18 03:04:48 -04:00			`"reflect"`
parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00			`"strings"`
			`"testing"`
			`)`

			`type lexerTest struct {`
			`name string`
			`input string`
			`items []Item`
			`}`

hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`func nti(tp ItemType, val string) Item {`
Support typed bool, int and float in shortcode params This means that you now can do: {{< vidur 9KvBeKu false true 32 3.14 >}} And the boolean and numeric values will be converted to `bool`, `int` and `float64`. If you want these to be strings, they must be quoted: {{< vidur 9KvBeKu "false" "true" "32" "3.14" >}} Fixes #6371 2019-09-29 08:51:51 -04:00			`return Item{tp, 0, []byte(val), false}`
parser/pageparser: Use []byte in page lexer See #5324 2018-10-18 03:04:48 -04:00			`}`

parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00			`var (`
			tstJSON = `{ "a": { "b": "\"Hugo\"}" } }`
hugolib: Continue the file context/line number errors work See #5324 2018-10-21 06:20:21 -04:00			`tstFrontMatterTOML = nti(TypeFrontMatterTOML, "foo = \"bar\"\n")`
			`tstFrontMatterYAML = nti(TypeFrontMatterYAML, "foo: \"bar\"\n")`
			`tstFrontMatterYAMLCRLF = nti(TypeFrontMatterYAML, "foo: \"bar\"\r\n")`
hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`tstFrontMatterJSON = nti(TypeFrontMatterJSON, tstJSON+"\r\n")`
parser/pageparser: Use []byte in page lexer See #5324 2018-10-18 03:04:48 -04:00			`tstSomeText = nti(tText, "\nSome text.\n")`
hugolib: Fix broken manual summary handling Fixes #5381 2018-10-30 15:24:34 -04:00			`tstSummaryDivider = nti(TypeLeadSummaryDivider, "<!--more-->\n")`
parser/pageparser: Fix when only shortcode and then summary Fixes #5464 2018-11-24 11:06:26 -05:00			`tstNewline = nti(tText, "\n")`
parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00
			tstORG = `
			`#+TITLE: T1`
			`#+AUTHOR: A1`
			`#+DESCRIPTION: D1`
			`
hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`tstFrontMatterORG = nti(TypeFrontMatterORG, tstORG)`
parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00			`)`

			`var crLfReplacer = strings.NewReplacer("\r", "#", "\n", "$")`

			`// TODO(bep) a way to toggle ORG mode vs the rest.`
			`var frontMatterTests = []lexerTest{`
			`{"empty", "", []Item{tstEOF}},`
hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`{"Byte order mark", "\ufeff\nSome text.\n", []Item{nti(TypeIgnore, "\ufeff"), tstSomeText, tstEOF}},`
Fix handling of HTML files without front matter This means that any HTML file inside /content will be treated as a regular file. If you want it processes with shortcodes and a layout, add front matter. The defintion of an HTML file here is: * File with extension .htm or .html * With first non-whitespace character "<" that isn't a HTML comment. This is in line with the documentation. Fixes #7030 Fixes #7028 See #6789 2020-03-09 07:04:33 -04:00			{"HTML Document", ` <html> `, []Item{nti(tError, "plain HTML documents not supported")}},
			{"HTML Document with shortcode", `<html>{{< sc1 >}}</html>`, []Item{nti(tError, "plain HTML documents not supported")}},
hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`{"No front matter", "\nSome text.\n", []Item{tstSomeText, tstEOF}},`
parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00			`{"YAML front matter", "---\nfoo: \"bar\"\n---\n\nSome text.\n", []Item{tstFrontMatterYAML, tstSomeText, tstEOF}},`
hugolib: Continue the file context/line number errors work See #5324 2018-10-21 06:20:21 -04:00			`{"YAML empty front matter", "---\n---\n\nSome text.\n", []Item{nti(TypeFrontMatterYAML, ""), tstSomeText, tstEOF}},`
parser/pageparser: Fix handling of commented out front matter When the page parser was rewritten in 0.51, this was interpreted literally, but commented out front matter is used in the wild to "hide it from GitHub", e.g: ``` <!-- +++ title = "hello" +++ --> ``` Fixes #5478 2018-11-28 04:21:54 -05:00			`{"YAML commented out front matter", "<!--\n---\nfoo: \"bar\"\n---\n-->\nSome text.\n", []Item{nti(TypeIgnore, "<!--\n"), tstFrontMatterYAML, nti(TypeIgnore, "-->"), tstSomeText, tstEOF}},`
			`{"YAML commented out front matter, no end", "<!--\n---\nfoo: \"bar\"\n---\nSome text.\n", []Item{nti(TypeIgnore, "<!--\n"), tstFrontMatterYAML, nti(tError, "starting HTML comment with no end")}},`
parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00			`// Note that we keep all bytes as they are, but we need to handle CRLF`
			`{"YAML front matter CRLF", "---\r\nfoo: \"bar\"\r\n---\n\nSome text.\n", []Item{tstFrontMatterYAMLCRLF, tstSomeText, tstEOF}},`
			`{"TOML front matter", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n", []Item{tstFrontMatterTOML, tstSomeText, tstEOF}},`
			`{"JSON front matter", tstJSON + "\r\n\nSome text.\n", []Item{tstFrontMatterJSON, tstSomeText, tstEOF}},`
			`{"ORG front matter", tstORG + "\nSome text.\n", []Item{tstFrontMatterORG, tstSomeText, tstEOF}},`
hugolib: Fix broken manual summary handling Fixes #5381 2018-10-30 15:24:34 -04:00			`{"Summary divider ORG", tstORG + "\nSome text.\n# more\nSome text.\n", []Item{tstFrontMatterORG, tstSomeText, nti(TypeLeadSummaryDivider, "# more\n"), nti(tText, "Some text.\n"), tstEOF}},`
			`{"Summary divider", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n<!--more-->\nSome text.\n", []Item{tstFrontMatterTOML, tstSomeText, tstSummaryDivider, nti(tText, "Some text.\n"), tstEOF}},`
			`{"Summary divider same line", "+++\nfoo = \"bar\"\n+++\n\nSome text.<!--more-->Some text.\n", []Item{tstFrontMatterTOML, nti(tText, "\nSome text."), nti(TypeLeadSummaryDivider, "<!--more-->"), nti(tText, "Some text.\n"), tstEOF}},`
Fix shortcode directly following a shortcode delimiter Fixes #5402 2018-11-05 07:30:16 -05:00			`// https://github.com/gohugoio/hugo/issues/5402`
			`{"Summary and shortcode, no space", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n<!--more-->{{< sc1 >}}\nSome text.\n", []Item{tstFrontMatterTOML, tstSomeText, nti(TypeLeadSummaryDivider, "<!--more-->"), tstLeftNoMD, tstSC1, tstRightNoMD, tstSomeText, tstEOF}},`
parser/pageparser: Fix when only shortcode and then summary Fixes #5464 2018-11-24 11:06:26 -05:00			`// https://github.com/gohugoio/hugo/issues/5464`
			`{"Summary and shortcode only", "+++\nfoo = \"bar\"\n+++\n{{< sc1 >}}\n<!--more-->\n{{< sc2 >}}", []Item{tstFrontMatterTOML, tstLeftNoMD, tstSC1, tstRightNoMD, tstNewline, tstSummaryDivider, tstLeftNoMD, tstSC2, tstRightNoMD, tstEOF}},`
parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00			`}`

			`func TestFrontMatter(t *testing.T) {`
			`t.Parallel()`
			`for i, test := range frontMatterTests {`
parser/pageparser: Use []byte in page lexer See #5324 2018-10-18 03:04:48 -04:00			`items := collect([]byte(test.input), false, lexIntroSection)`
parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00			`if !equal(items, test.items) {`
			`got := crLfReplacer.Replace(fmt.Sprint(items))`
			`expected := crLfReplacer.Replace(fmt.Sprint(test.items))`
			`t.Errorf("[%d] %s: got\n\t%v\nexpected\n\t%v", i, test.name, got, expected)`
			`}`
			`}`
			`}`

Move the emoji parsing to pageparser This avoids double parsing the page content when `enableEmoji=true`. This commit also adds some general improvements to the parser, making it in general much faster: ```bash benchmark old ns/op new ns/op delta BenchmarkShortcodeLexer-4 90258 101730 +12.71% BenchmarkParse-4 148940 15037 -89.90% benchmark old allocs new allocs delta BenchmarkShortcodeLexer-4 456 700 +53.51% BenchmarkParse-4 28 33 +17.86% benchmark old bytes new bytes delta BenchmarkShortcodeLexer-4 69875 81014 +15.94% BenchmarkParse-4 8128 8304 +2.17% ``` Running some site benchmarks with Emoji support turned on: ```bash benchmark old ns/op new ns/op delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 924556797 818115620 -11.51% benchmark old allocs new allocs delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 4112613 4133787 +0.51% benchmark old bytes new bytes delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 426982864 424363832 -0.61% ``` Fixes #5534 2018-12-17 15:03:23 -05:00			`func collectWithConfig(input []byte, skipFrontMatter bool, stateStart stateFunc, cfg Config) (items []Item) {`
			`l := newPageLexer(input, stateStart, cfg)`
parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00			`l.run()`
hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`t := l.newIterator()`
parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00
			`for {`
hugolib: Integrate new page parser See #5324 2018-10-18 04:21:23 -04:00			`item := t.Next()`
parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00			`items = append(items, item)`
hugolib: Redo the summary delimiter logic Now that we have a proper page parse tree, this can be greatly simplified. See #5324 2018-10-19 05:30:57 -04:00			`if item.Type == tEOF \|\| item.Type == tError {`
parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00			`break`
			`}`
			`}`
			`return`
			`}`

Move the emoji parsing to pageparser This avoids double parsing the page content when `enableEmoji=true`. This commit also adds some general improvements to the parser, making it in general much faster: ```bash benchmark old ns/op new ns/op delta BenchmarkShortcodeLexer-4 90258 101730 +12.71% BenchmarkParse-4 148940 15037 -89.90% benchmark old allocs new allocs delta BenchmarkShortcodeLexer-4 456 700 +53.51% BenchmarkParse-4 28 33 +17.86% benchmark old bytes new bytes delta BenchmarkShortcodeLexer-4 69875 81014 +15.94% BenchmarkParse-4 8128 8304 +2.17% ``` Running some site benchmarks with Emoji support turned on: ```bash benchmark old ns/op new ns/op delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 924556797 818115620 -11.51% benchmark old allocs new allocs delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 4112613 4133787 +0.51% benchmark old bytes new bytes delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 426982864 424363832 -0.61% ``` Fixes #5534 2018-12-17 15:03:23 -05:00			`func collect(input []byte, skipFrontMatter bool, stateStart stateFunc) (items []Item) {`
			`var cfg Config`

			`return collectWithConfig(input, skipFrontMatter, stateStart, cfg)`
			`}`

parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00			`// no positional checking, for now ...`
			`func equal(i1, i2 []Item) bool {`
			`if len(i1) != len(i2) {`
			`return false`
			`}`
			`for k := range i1 {`
hugolib: Redo the summary delimiter logic Now that we have a proper page parse tree, this can be greatly simplified. See #5324 2018-10-19 05:30:57 -04:00			`if i1[k].Type != i2[k].Type {`
parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00			`return false`
			`}`
Support typed bool, int and float in shortcode params This means that you now can do: {{< vidur 9KvBeKu false true 32 3.14 >}} And the boolean and numeric values will be converted to `bool`, `int` and `float64`. If you want these to be strings, they must be quoted: {{< vidur 9KvBeKu "false" "true" "32" "3.14" >}} Fixes #6371 2019-09-29 08:51:51 -04:00
parser/pageparser: Use []byte in page lexer See #5324 2018-10-18 03:04:48 -04:00			`if !reflect.DeepEqual(i1[k].Val, i2[k].Val) {`
parser/pageparser: Add front matter etc. support See #5324 2018-10-17 07:48:55 -04:00			`return false`
			`}`
			`}`
			`return true`
			`}`