2016-03-23 09:51:16 -04:00
|
|
|
// Copyright 2016 The Hugo Authors. All rights reserved.
|
2015-12-10 17:19:38 -05:00
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
2013-09-17 18:52:40 -04:00
|
|
|
package parser
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"bytes"
|
2013-09-18 12:15:46 -04:00
|
|
|
"fmt"
|
2013-09-17 18:52:40 -04:00
|
|
|
"io"
|
2015-08-02 01:24:22 -04:00
|
|
|
"regexp"
|
2015-09-23 00:43:17 -04:00
|
|
|
"strings"
|
2013-09-17 18:52:40 -04:00
|
|
|
"unicode"
|
2017-02-21 02:46:03 -05:00
|
|
|
|
|
|
|
"github.com/chaseadamsio/goorgeous"
|
2013-09-17 18:52:40 -04:00
|
|
|
)
|
|
|
|
|
|
|
|
const (
	// TODO(bep) Do we really have to export these?

	// HTMLLead is the first character of an HTML document. Content that
	// starts with it is not rendered (see shouldRender).
	HTMLLead = "<"
	// YAMLLead is the first character of YAML front matter.
	YAMLLead = "-"
	// YAMLDelimUnix is the YAML front matter delimiter line with a Unix
	// (LF) line ending.
	YAMLDelimUnix = "---\n"
	// YAMLDelimDOS is the YAML front matter delimiter line with a Windows
	// (CR+LF) line ending.
	YAMLDelimDOS = "---\r\n"
	// YAMLDelim is the YAML front matter delimiter without a line ending.
	YAMLDelim = "---"
	// TOMLLead is the first character of TOML front matter.
	TOMLLead = "+"
	// TOMLDelimUnix is the TOML front matter delimiter line with a Unix
	// (LF) line ending.
	TOMLDelimUnix = "+++\n"
	// TOMLDelimDOS is the TOML front matter delimiter line with a Windows
	// (CR+LF) line ending.
	TOMLDelimDOS = "+++\r\n"
	// TOMLDelim is the TOML front matter delimiter without a line ending.
	TOMLDelim = "+++"
	// JSONLead is the first character of JSON front matter.
	JSONLead = "{"
	// HTMLCommentStart is the opening marker of an HTML comment.
	HTMLCommentStart = "<!--"
	// HTMLCommentEnd is the closing marker of an HTML comment.
	HTMLCommentEnd = "-->"
	// BOM is the Unicode byte order mark (U+FEFF).
	BOM = '\ufeff'
)
|
|
|
|
|
|
|
|
var (
	// delims matches an opening front matter marker at the very start of
	// the input: the YAML or TOML delimiter followed by optional whitespace
	// and a newline, or the JSON opening brace.
	delims = regexp.MustCompile(
		"^(" + regexp.QuoteMeta(YAMLDelim) + `\s*\n|` + regexp.QuoteMeta(TOMLDelim) + `\s*\n|` + regexp.QuoteMeta(JSONLead) + ")",
	)
)
|
|
|
|
|
2016-03-23 09:51:16 -04:00
|
|
|
// Page represents a parsed content page.
type Page interface {
	// FrontMatter returns the raw front matter, including its delimiters.
	FrontMatter() []byte

	// Content returns the raw page content.
	Content() []byte

	// IsRenderable reports whether the page should be rendered.
	IsRenderable() bool

	// Metadata returns the unmarshalled front matter data.
	Metadata() (interface{}, error)
}
|
|
|
|
|
2016-12-26 16:23:20 -05:00
|
|
|
// page implements the Page interface.
type page struct {
	// render is the value reported by IsRenderable.
	render bool
	// frontmatter holds the raw front matter bytes returned by FrontMatter.
	frontmatter []byte
	// content holds the raw page body returned by Content.
	content []byte
}
|
|
|
|
|
2016-12-26 16:23:20 -05:00
|
|
|
// Content returns the raw page content, i.e. the bytes stored in the
// page's content field.
func (p *page) Content() []byte {
	return p.content
}
|
|
|
|
|
2016-12-26 16:23:20 -05:00
|
|
|
// FrontMatter returns the raw front matter stored on the page. It is nil
// when no front matter was stored.
func (p *page) FrontMatter() []byte {
	return p.frontmatter
}
|
|
|
|
|
2016-12-26 16:23:20 -05:00
|
|
|
// IsRenderable reports whether the page should be rendered. It returns the
// page's render flag.
func (p *page) IsRenderable() bool {
	return p.render
}
|
|
|
|
|
2016-12-26 16:23:20 -05:00
|
|
|
// Metadata returns the unmarshalled frontmatter data.
|
2014-05-01 13:19:51 -04:00
|
|
|
func (p *page) Metadata() (meta interface{}, err error) {
|
|
|
|
frontmatter := p.FrontMatter()
|
|
|
|
|
|
|
|
if len(frontmatter) != 0 {
|
2016-04-26 16:21:15 -04:00
|
|
|
fm := DetectFrontMatter(rune(frontmatter[0]))
|
2017-02-21 02:46:03 -05:00
|
|
|
if fm != nil {
|
|
|
|
meta, err = fm.Parse(frontmatter)
|
|
|
|
if err != nil {
|
|
|
|
return
|
|
|
|
}
|
2014-05-01 13:19:51 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2013-09-17 18:52:40 -04:00
|
|
|
// ReadFrom reads the content from an io.Reader and constructs a page.
|
|
|
|
func ReadFrom(r io.Reader) (p Page, err error) {
|
|
|
|
reader := bufio.NewReader(r)
|
|
|
|
|
2016-04-12 18:14:00 -04:00
|
|
|
// chomp BOM and assume UTF-8
|
|
|
|
if err = chompBOM(reader); err != nil && err != io.EOF {
|
|
|
|
return
|
|
|
|
}
|
2013-12-16 03:34:26 -05:00
|
|
|
if err = chompWhitespace(reader); err != nil && err != io.EOF {
|
2013-09-17 18:52:40 -04:00
|
|
|
return
|
|
|
|
}
|
2015-09-23 00:43:17 -04:00
|
|
|
if err = chompFrontmatterStartComment(reader); err != nil && err != io.EOF {
|
|
|
|
return
|
|
|
|
}
|
2013-09-17 18:52:40 -04:00
|
|
|
|
|
|
|
firstLine, err := peekLine(reader)
|
2013-12-16 03:34:26 -05:00
|
|
|
if err != nil && err != io.EOF {
|
2013-09-17 18:52:40 -04:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
newp := new(page)
|
|
|
|
newp.render = shouldRender(firstLine)
|
|
|
|
|
|
|
|
if newp.render && isFrontMatterDelim(firstLine) {
|
|
|
|
left, right := determineDelims(firstLine)
|
|
|
|
fm, err := extractFrontMatterDelims(reader, left, right)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
newp.frontmatter = fm
|
2017-02-21 02:46:03 -05:00
|
|
|
} else if newp.render && goorgeous.IsKeyword(firstLine) {
|
|
|
|
fm, err := goorgeous.ExtractOrgHeaders(reader)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
newp.frontmatter = fm
|
2013-09-17 18:52:40 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
content, err := extractContent(reader)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
newp.content = content
|
|
|
|
|
|
|
|
return newp, nil
|
|
|
|
}
|
|
|
|
|
2016-12-26 16:23:20 -05:00
|
|
|
// chompBOM scans any leading Unicode Byte Order Markers from r.
|
2016-04-12 18:14:00 -04:00
|
|
|
func chompBOM(r io.RuneScanner) (err error) {
|
|
|
|
for {
|
|
|
|
c, _, err := r.ReadRune()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if c != BOM {
|
|
|
|
r.UnreadRune()
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-12-26 16:23:20 -05:00
|
|
|
// chompWhitespace consumes any leading Unicode whitespace from r.
//
// It stops at the first rune for which unicode.IsSpace is false, pushes
// that rune back and returns nil. An error from ReadRune (io.EOF included,
// e.g. when r holds nothing but whitespace) is returned as is.
func chompWhitespace(r io.RuneScanner) (err error) {
	for {
		next, _, readErr := r.ReadRune()
		if readErr != nil {
			return readErr
		}
		if unicode.IsSpace(next) {
			continue
		}
		// First non-space rune: hand it back to the reader. The result of
		// UnreadRune is deliberately not inspected.
		_ = r.UnreadRune()
		return nil
	}
}
|
|
|
|
|
2016-12-26 16:23:20 -05:00
|
|
|
// chompFrontmatterStartComment checks r for a leading HTML comment. If a
|
|
|
|
// comment is found, it is read from r and then whitespace is trimmed from the
|
|
|
|
// beginning of r.
|
2015-09-23 00:43:17 -04:00
|
|
|
func chompFrontmatterStartComment(r *bufio.Reader) (err error) {
|
|
|
|
candidate, err := r.Peek(32)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
str := string(candidate)
|
2016-03-23 09:51:16 -04:00
|
|
|
if strings.HasPrefix(str, HTMLCommentStart) {
|
2015-09-23 00:43:17 -04:00
|
|
|
lineEnd := strings.IndexAny(str, "\n")
|
|
|
|
if lineEnd == -1 {
|
|
|
|
//TODO: if we can't find it, Peek more?
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
testStr := strings.TrimSuffix(str[0:lineEnd], "\r")
|
2016-08-30 13:30:33 -04:00
|
|
|
if strings.Contains(testStr, HTMLCommentEnd) {
|
2015-09-23 00:43:17 -04:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
buf := make([]byte, lineEnd)
|
|
|
|
if _, err = r.Read(buf); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if err = chompWhitespace(r); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2016-12-26 16:23:20 -05:00
|
|
|
// chompFrontmatterEndComment checks r for a trailing HTML comment.
|
2015-09-23 00:43:17 -04:00
|
|
|
func chompFrontmatterEndComment(r *bufio.Reader) (err error) {
|
|
|
|
candidate, err := r.Peek(32)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
str := string(candidate)
|
|
|
|
lineEnd := strings.IndexAny(str, "\n")
|
|
|
|
if lineEnd == -1 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
testStr := strings.TrimSuffix(str[0:lineEnd], "\r")
|
2016-08-30 13:30:33 -04:00
|
|
|
if strings.Contains(testStr, HTMLCommentStart) {
|
2015-09-23 00:43:17 -04:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
//TODO: if we can't find it, Peek more?
|
2016-03-23 09:51:16 -04:00
|
|
|
if strings.HasSuffix(testStr, HTMLCommentEnd) {
|
2015-09-23 00:43:17 -04:00
|
|
|
buf := make([]byte, lineEnd)
|
|
|
|
if _, err = r.Read(buf); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if err = chompWhitespace(r); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2013-09-17 18:52:40 -04:00
|
|
|
// peekLine returns the beginning of the next line in r without consuming
// it. At most five bytes are examined: if they contain a newline, the
// result ends with (and includes) that newline, otherwise all peeked bytes
// are returned.
//
// When fewer than five bytes remain, Peek returns them together with
// io.EOF. Those bytes are returned with a nil error so that very short
// documents are still classified by their first bytes. The error is only
// reported when nothing at all could be peeked, or when it is not io.EOF.
func peekLine(r *bufio.Reader) (line []byte, err error) {
	firstFive, err := r.Peek(5)
	if err != nil && (err != io.EOF || len(firstFive) == 0) {
		return nil, err
	}

	if idx := bytes.IndexByte(firstFive, '\n'); idx != -1 {
		return firstFive[:idx+1], nil // include newline.
	}
	return firstFive, nil
}
|
|
|
|
|
|
|
|
func shouldRender(lead []byte) (frontmatter bool) {
|
|
|
|
if len(lead) <= 0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2016-03-23 09:51:16 -04:00
|
|
|
if bytes.Equal(lead[:1], []byte(HTMLLead)) {
|
2013-09-17 18:52:40 -04:00
|
|
|
return
|
|
|
|
}
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
// isFrontMatterDelim reports whether data matches the delims pattern, i.e.
// whether it begins with an opening front matter marker.
func isFrontMatterDelim(data []byte) bool {
	return delims.Match(data)
}
|
|
|
|
|
|
|
|
func determineDelims(firstLine []byte) (left, right []byte) {
|
2016-12-20 08:54:52 -05:00
|
|
|
switch firstLine[0] {
|
|
|
|
case YAMLLead[0]:
|
|
|
|
return []byte(YAMLDelim), []byte(YAMLDelim)
|
|
|
|
case TOMLLead[0]:
|
2016-03-23 09:51:16 -04:00
|
|
|
return []byte(TOMLDelim), []byte(TOMLDelim)
|
2016-12-20 08:54:52 -05:00
|
|
|
case JSONLead[0]:
|
2016-03-23 09:51:16 -04:00
|
|
|
return []byte(JSONLead), []byte("}")
|
2013-09-17 18:52:40 -04:00
|
|
|
default:
|
|
|
|
panic(fmt.Sprintf("Unable to determine delims from %q", firstLine))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-10 02:15:51 -05:00
|
|
|
// extractFrontMatterDelims takes a frontmatter from the content bufio.Reader.
|
2016-02-13 04:24:25 -05:00
|
|
|
// Beginning white spaces of the bufio.Reader must be trimmed before call this
|
2015-01-10 02:15:51 -05:00
|
|
|
// function.
|
2016-03-23 09:51:16 -04:00
|
|
|
func extractFrontMatterDelims(r *bufio.Reader, left, right []byte) (fm []byte, err error) {
|
2013-09-18 12:15:46 -04:00
|
|
|
var (
|
|
|
|
c byte
|
2015-01-10 02:15:51 -05:00
|
|
|
buf bytes.Buffer
|
2015-03-07 06:59:04 -05:00
|
|
|
level int
|
|
|
|
sameDelim = bytes.Equal(left, right)
|
2017-06-19 10:45:52 -04:00
|
|
|
inQuote bool
|
2017-07-03 03:00:17 -04:00
|
|
|
escaped bool
|
2013-09-18 12:15:46 -04:00
|
|
|
)
|
2015-01-10 02:15:51 -05:00
|
|
|
// Frontmatter must start with a delimiter. To check it first,
|
|
|
|
// pre-reads beginning delimiter length - 1 bytes from Reader
|
|
|
|
for i := 0; i < len(left)-1; i++ {
|
|
|
|
if c, err = r.ReadByte(); err != nil {
|
|
|
|
return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s", buf.Len(), err)
|
|
|
|
}
|
|
|
|
if err = buf.WriteByte(c); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reads a character from Reader one by one and checks it matches the
|
2015-08-18 21:59:34 -04:00
|
|
|
// last character of one of delimiters to find the last character of
|
2015-01-10 02:15:51 -05:00
|
|
|
// frontmatter. If it matches, makes sure it contains the delimiter
|
|
|
|
// and if so, also checks it is followed by CR+LF or LF when YAML,
|
|
|
|
// TOML case. In JSON case, nested delimiters must be parsed and it
|
|
|
|
// is expected that the delimiter only contains one character.
|
2013-09-17 18:52:40 -04:00
|
|
|
for {
|
2013-09-18 12:15:46 -04:00
|
|
|
if c, err = r.ReadByte(); err != nil {
|
2015-01-10 02:15:51 -05:00
|
|
|
return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s", buf.Len(), err)
|
|
|
|
}
|
|
|
|
if err = buf.WriteByte(c); err != nil {
|
|
|
|
return nil, err
|
2013-09-17 18:52:40 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
switch c {
|
2017-06-19 10:45:52 -04:00
|
|
|
case '"':
|
2017-07-03 03:00:17 -04:00
|
|
|
if !escaped {
|
|
|
|
inQuote = !inQuote
|
|
|
|
}
|
|
|
|
escaped = false
|
|
|
|
case '\\':
|
|
|
|
escaped = true
|
2015-01-10 02:15:51 -05:00
|
|
|
case left[len(left)-1]:
|
|
|
|
if sameDelim { // YAML, TOML case
|
2015-08-03 10:32:51 -04:00
|
|
|
if bytes.HasSuffix(buf.Bytes(), left) && (buf.Len() == len(left) || buf.Bytes()[buf.Len()-len(left)-1] == '\n') {
|
2015-08-02 01:24:22 -04:00
|
|
|
nextByte:
|
2015-01-10 02:15:51 -05:00
|
|
|
c, err = r.ReadByte()
|
|
|
|
if err != nil {
|
|
|
|
// It is ok that the end delimiter ends with EOF
|
|
|
|
if err != io.EOF || level != 1 {
|
|
|
|
return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s", buf.Len(), err)
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
switch c {
|
|
|
|
case '\n':
|
|
|
|
// ok
|
2015-08-02 01:24:22 -04:00
|
|
|
case ' ':
|
|
|
|
// Consume this byte and try to match again
|
|
|
|
goto nextByte
|
2015-01-10 02:15:51 -05:00
|
|
|
case '\r':
|
|
|
|
if err = buf.WriteByte(c); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if c, err = r.ReadByte(); err != nil {
|
|
|
|
return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s", buf.Len(), err)
|
|
|
|
}
|
|
|
|
if c != '\n' {
|
|
|
|
return nil, fmt.Errorf("frontmatter delimiter must be followed by CR+LF or LF but those can't be found at filepos %d", buf.Len())
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
return nil, fmt.Errorf("frontmatter delimiter must be followed by CR+LF or LF but those can't be found at filepos %d", buf.Len())
|
|
|
|
}
|
|
|
|
if err = buf.WriteByte(c); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
2013-09-17 18:52:40 -04:00
|
|
|
if level == 0 {
|
|
|
|
level = 1
|
|
|
|
} else {
|
|
|
|
level = 0
|
|
|
|
}
|
2013-09-18 12:15:46 -04:00
|
|
|
}
|
2015-01-10 02:15:51 -05:00
|
|
|
} else { // JSON case
|
2017-06-19 10:45:52 -04:00
|
|
|
if !inQuote {
|
|
|
|
level++
|
|
|
|
}
|
2013-09-17 18:52:40 -04:00
|
|
|
}
|
2015-01-10 02:15:51 -05:00
|
|
|
case right[len(right)-1]: // JSON case only reaches here
|
2017-06-19 10:45:52 -04:00
|
|
|
if !inQuote {
|
|
|
|
level--
|
|
|
|
}
|
2013-09-17 18:52:40 -04:00
|
|
|
}
|
|
|
|
|
2015-01-10 02:15:51 -05:00
|
|
|
if level == 0 {
|
|
|
|
// Consumes white spaces immediately behind frontmatter
|
2015-09-23 00:43:17 -04:00
|
|
|
if err = chompWhitespace(r); err != nil && err != io.EOF {
|
|
|
|
return nil, err
|
2013-09-17 18:52:40 -04:00
|
|
|
}
|
2015-09-23 00:43:17 -04:00
|
|
|
if err = chompFrontmatterEndComment(r); err != nil && err != io.EOF {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2015-01-10 02:15:51 -05:00
|
|
|
return buf.Bytes(), nil
|
2013-09-17 18:52:40 -04:00
|
|
|
}
|
2017-07-03 03:00:17 -04:00
|
|
|
|
2013-09-17 18:52:40 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-03-23 09:51:16 -04:00
|
|
|
// extractContent reads everything that is left in r and returns it as the
// page content. If reading fails, the error is returned together with a nil
// slice.
func extractContent(r io.Reader) (content []byte, err error) {
	var body bytes.Buffer
	if _, readErr := body.ReadFrom(r); readErr != nil {
		return nil, readErr
	}
	return body.Bytes(), nil
}
|