2018-10-17 07:16:45 -04:00
|
|
|
// Copyright 2018 The Hugo Authors. All rights reserved.
|
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
package pageparser
|
|
|
|
|
2018-10-19 05:30:57 -04:00
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"fmt"
|
2019-09-29 08:51:51 -04:00
|
|
|
"regexp"
|
|
|
|
"strconv"
|
2018-10-19 05:30:57 -04:00
|
|
|
)
|
2018-10-17 07:16:45 -04:00
|
|
|
|
|
|
|
type Item struct {
|
2019-09-29 08:51:51 -04:00
|
|
|
Type ItemType
|
|
|
|
Pos int
|
|
|
|
Val []byte
|
|
|
|
isString bool
|
2018-10-17 07:16:45 -04:00
|
|
|
}
|
|
|
|
|
2018-10-18 04:21:23 -04:00
|
|
|
// Items is a slice of Item values.
type Items []Item
|
|
|
|
|
2018-10-18 03:47:39 -04:00
|
|
|
func (i Item) ValStr() string {
|
|
|
|
return string(i.Val)
|
|
|
|
}
|
|
|
|
|
2019-09-29 08:51:51 -04:00
|
|
|
func (i Item) ValTyped() interface{} {
|
|
|
|
str := i.ValStr()
|
|
|
|
if i.isString {
|
|
|
|
// A quoted value that is a string even if it looks like a number etc.
|
|
|
|
return str
|
|
|
|
}
|
|
|
|
|
|
|
|
if boolRe.MatchString(str) {
|
|
|
|
return str == "true"
|
|
|
|
}
|
|
|
|
|
|
|
|
if intRe.MatchString(str) {
|
|
|
|
num, err := strconv.Atoi(str)
|
|
|
|
if err != nil {
|
|
|
|
return str
|
|
|
|
}
|
|
|
|
return num
|
|
|
|
}
|
|
|
|
|
|
|
|
if floatRe.MatchString(str) {
|
|
|
|
num, err := strconv.ParseFloat(str, 64)
|
|
|
|
if err != nil {
|
|
|
|
return str
|
|
|
|
}
|
|
|
|
return num
|
|
|
|
}
|
|
|
|
|
|
|
|
return str
|
|
|
|
}
|
|
|
|
|
2018-10-17 07:16:45 -04:00
|
|
|
func (i Item) IsText() bool {
|
2018-10-19 05:30:57 -04:00
|
|
|
return i.Type == tText
|
|
|
|
}
|
|
|
|
|
|
|
|
func (i Item) IsNonWhitespace() bool {
|
|
|
|
return len(bytes.TrimSpace(i.Val)) > 0
|
2018-10-17 07:16:45 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
func (i Item) IsShortcodeName() bool {
|
2018-10-19 05:30:57 -04:00
|
|
|
return i.Type == tScName
|
2018-10-17 07:16:45 -04:00
|
|
|
}
|
|
|
|
|
2018-11-26 05:01:27 -05:00
|
|
|
func (i Item) IsInlineShortcodeName() bool {
|
|
|
|
return i.Type == tScNameInline
|
|
|
|
}
|
|
|
|
|
2018-10-17 07:16:45 -04:00
|
|
|
func (i Item) IsLeftShortcodeDelim() bool {
|
2018-10-19 05:30:57 -04:00
|
|
|
return i.Type == tLeftDelimScWithMarkup || i.Type == tLeftDelimScNoMarkup
|
2018-10-17 07:16:45 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
func (i Item) IsRightShortcodeDelim() bool {
|
2018-10-19 05:30:57 -04:00
|
|
|
return i.Type == tRightDelimScWithMarkup || i.Type == tRightDelimScNoMarkup
|
2018-10-17 07:16:45 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
func (i Item) IsShortcodeClose() bool {
|
2018-10-19 05:30:57 -04:00
|
|
|
return i.Type == tScClose
|
2018-10-17 07:16:45 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
func (i Item) IsShortcodeParam() bool {
|
2018-10-19 05:30:57 -04:00
|
|
|
return i.Type == tScParam
|
2018-10-17 07:16:45 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
func (i Item) IsShortcodeParamVal() bool {
|
2018-10-19 05:30:57 -04:00
|
|
|
return i.Type == tScParamVal
|
2018-10-17 07:16:45 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
func (i Item) IsShortcodeMarkupDelimiter() bool {
|
2018-10-19 05:30:57 -04:00
|
|
|
return i.Type == tLeftDelimScWithMarkup || i.Type == tRightDelimScWithMarkup
|
2018-10-18 04:21:23 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
func (i Item) IsFrontMatter() bool {
|
2018-10-19 05:30:57 -04:00
|
|
|
return i.Type >= TypeFrontMatterYAML && i.Type <= TypeFrontMatterORG
|
2018-10-17 07:16:45 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
func (i Item) IsDone() bool {
|
2018-10-19 05:30:57 -04:00
|
|
|
return i.Type == tError || i.Type == tEOF
|
2018-10-17 07:16:45 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
func (i Item) IsEOF() bool {
|
2018-10-19 05:30:57 -04:00
|
|
|
return i.Type == tEOF
|
2018-10-17 07:16:45 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
func (i Item) IsError() bool {
|
2018-10-19 05:30:57 -04:00
|
|
|
return i.Type == tError
|
2018-10-17 07:16:45 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
func (i Item) String() string {
|
|
|
|
switch {
|
2018-10-19 05:30:57 -04:00
|
|
|
case i.Type == tEOF:
|
2018-10-17 07:16:45 -04:00
|
|
|
return "EOF"
|
2018-10-19 05:30:57 -04:00
|
|
|
case i.Type == tError:
|
2018-10-18 03:04:48 -04:00
|
|
|
return string(i.Val)
|
2018-10-19 05:30:57 -04:00
|
|
|
case i.Type > tKeywordMarker:
|
2018-10-17 07:16:45 -04:00
|
|
|
return fmt.Sprintf("<%s>", i.Val)
|
2018-10-17 07:48:55 -04:00
|
|
|
case len(i.Val) > 50:
|
2018-10-19 05:30:57 -04:00
|
|
|
return fmt.Sprintf("%v:%.20q...", i.Type, i.Val)
|
2018-10-17 07:16:45 -04:00
|
|
|
}
|
2018-10-19 05:30:57 -04:00
|
|
|
return fmt.Sprintf("%v:[%s]", i.Type, i.Val)
|
2018-10-17 07:16:45 -04:00
|
|
|
}
|
|
|
|
|
2018-10-18 04:21:23 -04:00
|
|
|
// ItemType identifies the kind of an Item.
type ItemType int
|
2018-10-17 07:16:45 -04:00
|
|
|
|
|
|
|
const (
|
2018-10-18 04:21:23 -04:00
|
|
|
tError ItemType = iota
|
2018-10-17 07:16:45 -04:00
|
|
|
tEOF
|
|
|
|
|
2018-10-17 07:48:55 -04:00
|
|
|
// page items
|
2018-10-23 08:37:09 -04:00
|
|
|
TypeLeadSummaryDivider // <!--more-->, # more
|
2018-10-18 04:21:23 -04:00
|
|
|
TypeFrontMatterYAML
|
|
|
|
TypeFrontMatterTOML
|
|
|
|
TypeFrontMatterJSON
|
|
|
|
TypeFrontMatterORG
|
Move the emoji parsing to pageparser
This avoids double parsing the page content when `enableEmoji=true`.
This commit also adds some general improvements to the parser, making it in general much faster:
```bash
benchmark old ns/op new ns/op delta
BenchmarkShortcodeLexer-4 90258 101730 +12.71%
BenchmarkParse-4 148940 15037 -89.90%
benchmark old allocs new allocs delta
BenchmarkShortcodeLexer-4 456 700 +53.51%
BenchmarkParse-4 28 33 +17.86%
benchmark old bytes new bytes delta
BenchmarkShortcodeLexer-4 69875 81014 +15.94%
BenchmarkParse-4 8128 8304 +2.17%
```
Running some site benchmarks with Emoji support turned on:
```bash
benchmark old ns/op new ns/op delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 924556797 818115620 -11.51%
benchmark old allocs new allocs delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 4112613 4133787 +0.51%
benchmark old bytes new bytes delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 426982864 424363832 -0.61%
```
Fixes #5534
2018-12-17 15:03:23 -05:00
|
|
|
TypeEmoji
|
2018-10-18 04:21:23 -04:00
|
|
|
TypeIgnore // // The BOM Unicode byte order marker and possibly others
|
2018-10-17 07:48:55 -04:00
|
|
|
|
2018-10-17 07:16:45 -04:00
|
|
|
// shortcode items
|
|
|
|
tLeftDelimScNoMarkup
|
|
|
|
tRightDelimScNoMarkup
|
|
|
|
tLeftDelimScWithMarkup
|
|
|
|
tRightDelimScWithMarkup
|
|
|
|
tScClose
|
|
|
|
tScName
|
2018-11-26 05:01:27 -05:00
|
|
|
tScNameInline
|
2018-10-17 07:16:45 -04:00
|
|
|
tScParam
|
|
|
|
tScParamVal
|
|
|
|
|
2018-10-17 07:48:55 -04:00
|
|
|
tText // plain text
|
2018-10-17 07:16:45 -04:00
|
|
|
|
|
|
|
// preserved for later - keywords come after this
|
|
|
|
tKeywordMarker
|
|
|
|
)
|
2019-09-29 08:51:51 -04:00
|
|
|
|
|
|
|
// Patterns used to detect the type of an unquoted value. Every pattern is
// anchored at both ends, so only a match of the complete value counts.
var (
	// boolRe matches exactly "true" or "false". The alternation is grouped
	// so that both anchors apply to both words; ungrouped, the "false$"
	// branch would match any value that merely ends in "false".
	boolRe = regexp.MustCompile(`^(?:true|false)$`)

	// intRe matches an optionally signed run of decimal digits.
	intRe = regexp.MustCompile(`^[-+]?\d+$`)

	// floatRe matches an optionally signed decimal number with a fractional
	// part. The integer part may be omitted (".5"); the fractional digits
	// may not ("1." does not match).
	floatRe = regexp.MustCompile(`^[-+]?\d*\.\d+$`)
)
|