2018-10-17 07:48:55 -04:00
// Copyright 2018 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package pageparser
import (
"fmt"
"strings"
"testing"
2022-07-07 10:11:47 -04:00
qt "github.com/frankban/quicktest"
2018-10-17 07:48:55 -04:00
)
type lexerTest struct {
name string
input string
2022-07-07 10:11:47 -04:00
items [ ] typeText
}
type typeText struct {
typ ItemType
text string
2018-10-17 07:48:55 -04:00
}
2022-07-07 10:11:47 -04:00
func nti ( tp ItemType , val string ) typeText {
return typeText { typ : tp , text : val }
2018-10-18 03:04:48 -04:00
}
2018-10-17 07:48:55 -04:00
var (
tstJSON = ` { "a": { "b": "\"Hugo\"}" } } `
2018-10-21 06:20:21 -04:00
tstFrontMatterTOML = nti ( TypeFrontMatterTOML , "foo = \"bar\"\n" )
tstFrontMatterYAML = nti ( TypeFrontMatterYAML , "foo: \"bar\"\n" )
tstFrontMatterYAMLCRLF = nti ( TypeFrontMatterYAML , "foo: \"bar\"\r\n" )
2018-10-18 04:21:23 -04:00
tstFrontMatterJSON = nti ( TypeFrontMatterJSON , tstJSON + "\r\n" )
2018-10-18 03:04:48 -04:00
tstSomeText = nti ( tText , "\nSome text.\n" )
2018-10-30 15:24:34 -04:00
tstSummaryDivider = nti ( TypeLeadSummaryDivider , "<!--more-->\n" )
2018-11-24 11:06:26 -05:00
tstNewline = nti ( tText , "\n" )
2018-10-17 07:48:55 -04:00
tstORG = `
# + TITLE : T1
# + AUTHOR : A1
# + DESCRIPTION : D1
`
2018-10-18 04:21:23 -04:00
tstFrontMatterORG = nti ( TypeFrontMatterORG , tstORG )
2018-10-17 07:48:55 -04:00
)
// crLfReplacer makes line endings visible in rendered test output by
// rewriting "\r" as "#" and "\n" as "$".
var crLfReplacer = strings.NewReplacer(
	"\r", "#",
	"\n", "$",
)
// TODO(bep) a way to toggle ORG mode vs the rest.
var frontMatterTests = [ ] lexerTest {
2022-07-07 10:11:47 -04:00
{ "empty" , "" , [ ] typeText { tstEOF } } ,
{ "Byte order mark" , "\ufeff\nSome text.\n" , [ ] typeText { nti ( TypeIgnore , "\ufeff" ) , tstSomeText , tstEOF } } ,
{ "HTML Document" , ` <html> ` , [ ] typeText { nti ( tError , "plain HTML documents not supported" ) } } ,
{ "HTML Document with shortcode" , ` <html> {{ < sc1 > }} </html> ` , [ ] typeText { nti ( tError , "plain HTML documents not supported" ) } } ,
{ "No front matter" , "\nSome text.\n" , [ ] typeText { tstSomeText , tstEOF } } ,
{ "YAML front matter" , "---\nfoo: \"bar\"\n---\n\nSome text.\n" , [ ] typeText { tstFrontMatterYAML , tstSomeText , tstEOF } } ,
{ "YAML empty front matter" , "---\n---\n\nSome text.\n" , [ ] typeText { nti ( TypeFrontMatterYAML , "" ) , tstSomeText , tstEOF } } ,
{ "YAML commented out front matter" , "<!--\n---\nfoo: \"bar\"\n---\n-->\nSome text.\n" , [ ] typeText { nti ( TypeIgnore , "<!--\n" ) , tstFrontMatterYAML , nti ( TypeIgnore , "-->" ) , tstSomeText , tstEOF } } ,
{ "YAML commented out front matter, no end" , "<!--\n---\nfoo: \"bar\"\n---\nSome text.\n" , [ ] typeText { nti ( TypeIgnore , "<!--\n" ) , tstFrontMatterYAML , nti ( tError , "starting HTML comment with no end" ) } } ,
2018-10-17 07:48:55 -04:00
// Note that we keep all bytes as they are, but we need to handle CRLF
2022-07-07 10:11:47 -04:00
{ "YAML front matter CRLF" , "---\r\nfoo: \"bar\"\r\n---\n\nSome text.\n" , [ ] typeText { tstFrontMatterYAMLCRLF , tstSomeText , tstEOF } } ,
{ "TOML front matter" , "+++\nfoo = \"bar\"\n+++\n\nSome text.\n" , [ ] typeText { tstFrontMatterTOML , tstSomeText , tstEOF } } ,
{ "JSON front matter" , tstJSON + "\r\n\nSome text.\n" , [ ] typeText { tstFrontMatterJSON , tstSomeText , tstEOF } } ,
{ "ORG front matter" , tstORG + "\nSome text.\n" , [ ] typeText { tstFrontMatterORG , tstSomeText , tstEOF } } ,
{ "Summary divider ORG" , tstORG + "\nSome text.\n# more\nSome text.\n" , [ ] typeText { tstFrontMatterORG , tstSomeText , nti ( TypeLeadSummaryDivider , "# more\n" ) , nti ( tText , "Some text.\n" ) , tstEOF } } ,
{ "Summary divider" , "+++\nfoo = \"bar\"\n+++\n\nSome text.\n<!--more-->\nSome text.\n" , [ ] typeText { tstFrontMatterTOML , tstSomeText , tstSummaryDivider , nti ( tText , "Some text.\n" ) , tstEOF } } ,
{ "Summary divider same line" , "+++\nfoo = \"bar\"\n+++\n\nSome text.<!--more-->Some text.\n" , [ ] typeText { tstFrontMatterTOML , nti ( tText , "\nSome text." ) , nti ( TypeLeadSummaryDivider , "<!--more-->" ) , nti ( tText , "Some text.\n" ) , tstEOF } } ,
2018-11-05 07:30:16 -05:00
// https://github.com/gohugoio/hugo/issues/5402
2022-07-07 10:11:47 -04:00
{ "Summary and shortcode, no space" , "+++\nfoo = \"bar\"\n+++\n\nSome text.\n<!--more-->{{< sc1 >}}\nSome text.\n" , [ ] typeText { tstFrontMatterTOML , tstSomeText , nti ( TypeLeadSummaryDivider , "<!--more-->" ) , tstLeftNoMD , tstSC1 , tstRightNoMD , tstSomeText , tstEOF } } ,
2018-11-24 11:06:26 -05:00
// https://github.com/gohugoio/hugo/issues/5464
2022-07-07 10:11:47 -04:00
{ "Summary and shortcode only" , "+++\nfoo = \"bar\"\n+++\n{{< sc1 >}}\n<!--more-->\n{{< sc2 >}}" , [ ] typeText { tstFrontMatterTOML , tstLeftNoMD , tstSC1 , tstRightNoMD , tstNewline , tstSummaryDivider , tstLeftNoMD , tstSC2 , tstRightNoMD , tstEOF } } ,
2018-10-17 07:48:55 -04:00
}
func TestFrontMatter ( t * testing . T ) {
t . Parallel ( )
2022-07-07 10:11:47 -04:00
c := qt . New ( t )
2018-10-17 07:48:55 -04:00
for i , test := range frontMatterTests {
2018-10-18 03:04:48 -04:00
items := collect ( [ ] byte ( test . input ) , false , lexIntroSection )
2022-07-07 10:11:47 -04:00
if ! equal ( test . input , items , test . items ) {
got := itemsToString ( items , [ ] byte ( test . input ) )
expected := testItemsToString ( test . items )
c . Assert ( got , qt . Equals , expected , qt . Commentf ( "Test %d: %s" , i , test . name ) )
}
}
}
func itemsToString ( items [ ] Item , source [ ] byte ) string {
var sb strings . Builder
for i , item := range items {
var s string
if item . Err != nil {
s = item . Err . Error ( )
} else {
s = string ( item . Val ( source ) )
}
sb . WriteString ( fmt . Sprintf ( "%s: %s\n" , item . Type , s ) )
if i < len ( items ) - 1 {
sb . WriteString ( "\n" )
2018-10-17 07:48:55 -04:00
}
}
2022-07-07 10:11:47 -04:00
return crLfReplacer . Replace ( sb . String ( ) )
}
func testItemsToString ( items [ ] typeText ) string {
var sb strings . Builder
for i , item := range items {
sb . WriteString ( fmt . Sprintf ( "%s: %s\n" , item . typ , item . text ) )
if i < len ( items ) - 1 {
sb . WriteString ( "\n" )
}
}
return crLfReplacer . Replace ( sb . String ( ) )
2018-10-17 07:48:55 -04:00
}
Move the emoji parsing to pageparser
This avoids double parsing the page content when `enableEmoji=true`.
This commit also adds some general improvements to the parser, making it much faster overall:
```bash
benchmark old ns/op new ns/op delta
BenchmarkShortcodeLexer-4 90258 101730 +12.71%
BenchmarkParse-4 148940 15037 -89.90%
benchmark old allocs new allocs delta
BenchmarkShortcodeLexer-4 456 700 +53.51%
BenchmarkParse-4 28 33 +17.86%
benchmark old bytes new bytes delta
BenchmarkShortcodeLexer-4 69875 81014 +15.94%
BenchmarkParse-4 8128 8304 +2.17%
```
Running some site benchmarks with Emoji support turned on:
```bash
benchmark old ns/op new ns/op delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 924556797 818115620 -11.51%
benchmark old allocs new allocs delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 4112613 4133787 +0.51%
benchmark old bytes new bytes delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 426982864 424363832 -0.61%
```
Fixes #5534
2018-12-17 15:03:23 -05:00
func collectWithConfig ( input [ ] byte , skipFrontMatter bool , stateStart stateFunc , cfg Config ) ( items [ ] Item ) {
l := newPageLexer ( input , stateStart , cfg )
2018-10-17 07:48:55 -04:00
l . run ( )
2022-07-07 10:11:47 -04:00
iter := NewIterator ( l . items )
2018-10-17 07:48:55 -04:00
for {
2022-07-07 10:11:47 -04:00
item := iter . Next ( )
2018-10-17 07:48:55 -04:00
items = append ( items , item )
2018-10-19 05:30:57 -04:00
if item . Type == tEOF || item . Type == tError {
2018-10-17 07:48:55 -04:00
break
}
}
return
}
Move the emoji parsing to pageparser
This avoids double parsing the page content when `enableEmoji=true`.
This commit also adds some general improvements to the parser, making it much faster overall:
```bash
benchmark old ns/op new ns/op delta
BenchmarkShortcodeLexer-4 90258 101730 +12.71%
BenchmarkParse-4 148940 15037 -89.90%
benchmark old allocs new allocs delta
BenchmarkShortcodeLexer-4 456 700 +53.51%
BenchmarkParse-4 28 33 +17.86%
benchmark old bytes new bytes delta
BenchmarkShortcodeLexer-4 69875 81014 +15.94%
BenchmarkParse-4 8128 8304 +2.17%
```
Running some site benchmarks with Emoji support turned on:
```bash
benchmark old ns/op new ns/op delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 924556797 818115620 -11.51%
benchmark old allocs new allocs delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 4112613 4133787 +0.51%
benchmark old bytes new bytes delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 426982864 424363832 -0.61%
```
Fixes #5534
2018-12-17 15:03:23 -05:00
// collect calls collectWithConfig with a zero-value Config and returns its
// items.
func collect(input []byte, skipFrontMatter bool, stateStart stateFunc) (items []Item) {
	var zero Config
	items = collectWithConfig(input, skipFrontMatter, stateStart, zero)
	return items
}
2022-07-07 10:11:47 -04:00
// collectStringMain collects the items for input, starting the lexer in
// lexMainSection.
func collectStringMain(input string) []Item {
	raw := []byte(input)
	return collect(raw, true, lexMainSection)
}
2018-10-17 07:48:55 -04:00
// no positional checking, for now ...
2022-07-07 10:11:47 -04:00
func equal ( source string , got [ ] Item , expect [ ] typeText ) bool {
if len ( got ) != len ( expect ) {
2018-10-17 07:48:55 -04:00
return false
}
2022-07-07 10:11:47 -04:00
sourceb := [ ] byte ( source )
for k := range got {
g := got [ k ]
e := expect [ k ]
if g . Type != e . typ {
2018-10-17 07:48:55 -04:00
return false
}
2019-09-29 08:51:51 -04:00
2022-07-07 10:11:47 -04:00
var s string
if g . Err != nil {
s = g . Err . Error ( )
} else {
s = string ( g . Val ( sourceb ) )
}
if s != e . text {
2018-10-17 07:48:55 -04:00
return false
}
2022-07-07 10:11:47 -04:00
2018-10-17 07:48:55 -04:00
}
return true
}