// Copyright 2019 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package helpers implements general utility functions that work with
// and on content. The helper functions defined here lay down the
// foundation of how Hugo works with files and filepaths, and perform
// string operations on content.
package helpers
import (
"bytes"
"fmt"
"html/template"
"os/exec"
"runtime"
"unicode"
"unicode/utf8"
"github.com/gohugoio/hugo/common/maps"
"github.com/niklasfasching/go-org/org"
bp "github.com/gohugoio/hugo/bufferpool"
"github.com/gohugoio/hugo/config"
"github.com/miekg/mmark"
"github.com/mitchellh/mapstructure"
"github.com/russross/blackfriday"
jww "github.com/spf13/jwalterweatherman"
"strings"
)
// SummaryDivider denotes where content summarization should end. The default is "".
//
// NOTE(review): with an empty divider, the bytes.Replace call in
// externallyRenderContent (which removes the first occurrence of this value
// before piping content to an external helper) leaves the content unchanged.
// Hugo documents "<!--more-->" as its summary divider, so the literal below
// may have lost its markup — TODO confirm against upstream.
var SummaryDivider = []byte("")
var (
openingPTag = []byte("
")
closingPTag = []byte("
")
paragraphIndicator = []byte("", "\n", " ", "\n", " ", "\n")
// mmarkExtensionMap maps extension names, as they appear in a
// RenderingContext's Config.Extensions list, to the matching mmark
// extension flag. getMmarkExtensions ORs the flag of every listed name
// into its defaults; names that are not keys of this map are ignored.
var mmarkExtensionMap = map[string]int{
	// Block-level syntax.
	"tables":                 mmark.EXTENSION_TABLES,
	"fencedCode":             mmark.EXTENSION_FENCED_CODE,
	"laxHtmlBlocks":          mmark.EXTENSION_LAX_HTML_BLOCKS,
	"noEmptyLineBeforeBlock": mmark.EXTENSION_NO_EMPTY_LINE_BEFORE_BLOCK,
	"footnotes":              mmark.EXTENSION_FOOTNOTES,

	// Headers.
	"spaceHeaders":  mmark.EXTENSION_SPACE_HEADERS,
	"headerIds":     mmark.EXTENSION_HEADER_IDS,
	"autoHeaderIds": mmark.EXTENSION_AUTO_HEADER_IDS,

	// Inline handling.
	"autolink":      mmark.EXTENSION_AUTOLINK,
	"hardLineBreak": mmark.EXTENSION_HARD_LINE_BREAK,
}
// StripHTML returns s with everything between '<' and '>' (inclusive)
// removed.
//
// Strings containing neither '<' nor '>' are returned untouched. Otherwise
// the input is first passed through stripHTMLReplacer and then walked rune
// by rune: runes inside a tag are dropped, and a whitespace rune is dropped
// when the previous rune was also recorded as whitespace.
func StripHTML(s string) string {
	// Fast path: no angle brackets means there is nothing to strip.
	if !strings.ContainsAny(s, "<>") {
		return s
	}

	cleaned := stripHTMLReplacer.Replace(s)

	buf := bp.GetBuffer()
	defer bp.PutBuffer(buf)

	var (
		insideTag   bool
		spaceNow    bool
		spaceBefore bool
	)
	for _, r := range cleaned {
		// The whitespace marker is only reset outside tags, so whitespace
		// seen inside a tag stays recorded until the tag has been closed.
		if !insideTag {
			spaceNow = false
		}

		switch r {
		case '<':
			insideTag = true
		case '>':
			insideTag = false
		default:
			if unicode.IsSpace(r) {
				spaceNow = true
			}
			if !insideTag && !(spaceNow && spaceBefore) {
				buf.WriteRune(r)
			}
		}

		spaceBefore = spaceNow
	}

	return buf.String()
}
// stripEmptyNav returns a copy of in with every occurrence of the byte
// sequence "\n \n\n" removed.
//
// NOTE(review): the function name suggests the pattern was meant to describe
// an empty nav element; confirm the literal below is the intended one.
func stripEmptyNav(in []byte) []byte {
	emptyNav := []byte("\n \n\n")
	return bytes.ReplaceAll(in, emptyNav, []byte(""))
}
// BytesToHTML returns the content of b as a template.HTML value. The bytes
// are copied into the new value unchanged; nothing is escaped here.
func BytesToHTML(b []byte) template.HTML {
	return template.HTML(b)
}
// getHTMLRenderer builds the Blackfriday HTML renderer used for ctx.
//
// defaultFlags is the initial set of Blackfriday HTML flags; XHTML output and
// footnote return links are always added, and further flags are switched on
// by the boolean options in ctx.Config. When ctx has a DocumentID and
// PlainIDAnchors is off, the footnote anchor prefix and the header ID suffix
// are qualified with that ID.
//
// It panics if ctx.Config is nil.
func (c *ContentSpec) getHTMLRenderer(defaultFlags int, ctx *RenderingContext) blackfriday.Renderer {
	params := blackfriday.HtmlRendererParameters{
		FootnoteAnchorPrefix:       c.footnoteAnchorPrefix,
		FootnoteReturnLinkContents: c.footnoteReturnLinkContents,
	}

	hasDocumentID := len(ctx.DocumentID) != 0

	if ctx.Config == nil {
		panic(fmt.Sprintf("RenderingContext of %q doesn't have a config", ctx.DocumentID))
	}
	cfg := ctx.Config

	if hasDocumentID && !cfg.PlainIDAnchors {
		params.FootnoteAnchorPrefix = ctx.DocumentID + ":" + params.FootnoteAnchorPrefix
		params.HeaderIDSuffix = ":" + ctx.DocumentID
	}

	// Flags that are always on, followed by the ones toggled by the config.
	flags := defaultFlags | blackfriday.HTML_USE_XHTML | blackfriday.HTML_FOOTNOTE_RETURN_LINKS

	toggles := []struct {
		enabled bool
		flag    int
	}{
		{cfg.Smartypants, blackfriday.HTML_USE_SMARTYPANTS},
		{cfg.SmartypantsQuotesNBSP, blackfriday.HTML_SMARTYPANTS_QUOTES_NBSP},
		{cfg.AngledQuotes, blackfriday.HTML_SMARTYPANTS_ANGLED_QUOTES},
		{cfg.Fractions, blackfriday.HTML_SMARTYPANTS_FRACTIONS},
		{cfg.HrefTargetBlank, blackfriday.HTML_HREF_TARGET_BLANK},
		{cfg.NofollowLinks, blackfriday.HTML_NOFOLLOW_LINKS},
		{cfg.NoreferrerLinks, blackfriday.HTML_NOREFERRER_LINKS},
		{cfg.SmartDashes, blackfriday.HTML_SMARTYPANTS_DASHES},
		{cfg.LatexDashes, blackfriday.HTML_SMARTYPANTS_LATEX_DASHES},
		{cfg.SkipHTML, blackfriday.HTML_SKIP_HTML},
	}
	for _, toggle := range toggles {
		if toggle.enabled {
			flags |= toggle.flag
		}
	}

	return &HugoHTMLRenderer{
		cs:               c,
		RenderingContext: ctx,
		Renderer:         blackfriday.HtmlRendererWithParameters(flags, "", "", params),
	}
}
// getMarkdownExtensions returns the Blackfriday extension flags for ctx.
//
// It starts from a fixed default set, ORs in the flag of every name in
// ctx.Config.Extensions and then clears the flag of every name in
// ctx.Config.ExtensionsMask. Names missing from blackfridayExtensionMap are
// ignored. It panics if ctx.Config is nil.
func getMarkdownExtensions(ctx *RenderingContext) int {
	if ctx.Config == nil {
		panic(fmt.Sprintf("RenderingContext of %q doesn't have a config", ctx.DocumentID))
	}

	// Blackfriday's common extensions plus the extras enabled by default
	// (automatic header IDs and footnotes).
	flags := blackfriday.EXTENSION_NO_INTRA_EMPHASIS |
		blackfriday.EXTENSION_TABLES |
		blackfriday.EXTENSION_FENCED_CODE |
		blackfriday.EXTENSION_AUTOLINK |
		blackfriday.EXTENSION_STRIKETHROUGH |
		blackfriday.EXTENSION_SPACE_HEADERS |
		blackfriday.EXTENSION_HEADER_IDS |
		blackfriday.EXTENSION_BACKSLASH_LINE_BREAK |
		blackfriday.EXTENSION_DEFINITION_LISTS |
		blackfriday.EXTENSION_AUTO_HEADER_IDS |
		blackfriday.EXTENSION_FOOTNOTES

	for _, name := range ctx.Config.Extensions {
		if enable, known := blackfridayExtensionMap[name]; known {
			flags |= enable
		}
	}

	// The mask is applied last so it wins over both defaults and additions.
	for _, name := range ctx.Config.ExtensionsMask {
		if disable, known := blackfridayExtensionMap[name]; known {
			flags &^= disable
		}
	}

	return flags
}
// markdownRender renders ctx.Content with Blackfriday and returns the result.
// The renderer is created with the HTML_TOC flag when ctx.RenderTOC is set
// and with no extra flags otherwise.
func (c ContentSpec) markdownRender(ctx *RenderingContext) []byte {
	rendererFlags := 0
	if ctx.RenderTOC {
		rendererFlags = blackfriday.HTML_TOC
	}

	renderer := c.getHTMLRenderer(rendererFlags, ctx)
	extensions := getMarkdownExtensions(ctx)

	return blackfriday.Markdown(ctx.Content, renderer, extensions)
}
// getMmarkHTMLRenderer builds the mmark HTML renderer used for ctx.
//
// defaultFlags is the initial set of mmark HTML flags; footnote return links
// are always added. When ctx has a DocumentID and PlainIDAnchors is off, the
// footnote anchor prefix is qualified with that ID.
//
// It panics if ctx.Config is nil.
func (c *ContentSpec) getMmarkHTMLRenderer(defaultFlags int, ctx *RenderingContext) mmark.Renderer {
	params := mmark.HtmlRendererParameters{
		FootnoteAnchorPrefix:       c.footnoteAnchorPrefix,
		FootnoteReturnLinkContents: c.footnoteReturnLinkContents,
	}

	hasDocumentID := len(ctx.DocumentID) != 0

	if ctx.Config == nil {
		panic(fmt.Sprintf("RenderingContext of %q doesn't have a config", ctx.DocumentID))
	}

	if hasDocumentID && !ctx.Config.PlainIDAnchors {
		params.FootnoteAnchorPrefix = ctx.DocumentID + ":" + params.FootnoteAnchorPrefix
		// Unlike the Blackfriday renderer, no header ID suffix is set here.
	}

	flags := defaultFlags | mmark.HTML_FOOTNOTE_RETURN_LINKS

	return &HugoMmarkHTMLRenderer{
		cs:       c,
		Renderer: mmark.HtmlRendererWithParameters(flags, "", "", params),
		Cfg:      c.Cfg,
	}
}
// getMmarkExtensions returns the mmark extension flags for ctx: a fixed
// default set plus the flag of every name in ctx.Config.Extensions that is a
// key of mmarkExtensionMap. It panics if ctx.Config is nil.
func getMmarkExtensions(ctx *RenderingContext) int {
	if ctx.Config == nil {
		panic(fmt.Sprintf("RenderingContext of %q doesn't have a config", ctx.DocumentID))
	}

	// Extensions that are always enabled.
	flags := mmark.EXTENSION_TABLES |
		mmark.EXTENSION_FENCED_CODE |
		mmark.EXTENSION_AUTOLINK |
		mmark.EXTENSION_SPACE_HEADERS |
		mmark.EXTENSION_CITATION |
		mmark.EXTENSION_TITLEBLOCK_TOML |
		mmark.EXTENSION_HEADER_IDS |
		mmark.EXTENSION_AUTO_HEADER_IDS |
		mmark.EXTENSION_UNIQUE_HEADER_IDS |
		mmark.EXTENSION_FOOTNOTES |
		mmark.EXTENSION_SHORT_REF |
		mmark.EXTENSION_NO_EMPTY_LINE_BEFORE_BLOCK |
		mmark.EXTENSION_INCLUDE

	for _, name := range ctx.Config.Extensions {
		if extra, known := mmarkExtensionMap[name]; known {
			flags |= extra
		}
	}

	return flags
}
// mmarkRender parses ctx.Content with mmark, using the renderer from
// getMmarkHTMLRenderer and the flags from getMmarkExtensions, and returns
// the rendered bytes.
func (c ContentSpec) mmarkRender(ctx *RenderingContext) []byte {
	renderer := c.getMmarkHTMLRenderer(0, ctx)
	extensions := getMmarkExtensions(ctx)

	return mmark.Parse(ctx.Content, renderer, extensions).Bytes()
}
// ExtractTOC extracts Table of Contents from content.
func ExtractTOC(content []byte) (newcontent []byte, toc []byte) {
if !bytes.Contains(content, []byte("")) {
return content, nil
}
origContent := make([]byte, len(content))
copy(origContent, content)
first := []byte(`
`)
replacement := []byte(`
`)
startOfTOC := bytes.Index(content, first)
peekEnd := len(content)
if peekEnd > 70+startOfTOC {
peekEnd = 70 + startOfTOC
}
if startOfTOC < 0 {
return stripEmptyNav(content), toc
}
// Need to peek ahead to see if this nav element is actually the right one.
correctNav := bytes.Index(content[startOfTOC:peekEnd], []byte(`= c.summaryLength {
return strings.Join(words[:index], " "), true
}
runeCount := utf8.RuneCountInString(word)
if len(word) == runeCount {
count++
} else if count+runeCount < c.summaryLength {
count += runeCount
} else {
for ri := range word {
if count >= c.summaryLength {
truncatedWords := append(words[:index], word[:ri])
return strings.Join(truncatedWords, " "), true
}
count++
}
}
}
return strings.Join(words, " "), false
}
// TruncateWordsToWholeSentence cuts s at the first sentence end found at or
// after the c.summaryLength-th whitespace rune (or after the last whitespace
// rune when s has fewer). It returns the whitespace-trimmed result and
// whether anything was cut off.
//
// s is returned unchanged, with false, when it contains no whitespace or
// when no sentence end follows the chosen whitespace position.
func (c *ContentSpec) TruncateWordsToWholeSentence(s string) (string, bool) {
	// Every whitespace rune counts as one word boundary.
	boundaries := 0
	lastBoundary := -1
	for pos, r := range s {
		if !unicode.IsSpace(r) {
			continue
		}
		boundaries++
		lastBoundary = pos
		if boundaries >= c.summaryLength {
			break
		}
	}

	if lastBoundary < 0 {
		return s, false
	}

	// Extend the cut to the end of the sentence that contains the boundary.
	for offset, r := range s[lastBoundary:] {
		if isEndOfSentence(r) {
			end := lastBoundary + offset + utf8.RuneLen(r)
			return strings.TrimSpace(s[:end]), end < len(s)
		}
	}

	return s, false
}
// TrimShortHTML removes the /
tags from HTML input in the situation
// where said tags are the only tags in the input and enclose the content
// of the input (whitespace excluded).
func (c *ContentSpec) TrimShortHTML(input []byte) []byte {
first := bytes.Index(input, paragraphIndicator)
last := bytes.LastIndex(input, paragraphIndicator)
if first == last {
input = bytes.TrimSpace(input)
input = bytes.TrimPrefix(input, openingPTag)
input = bytes.TrimSuffix(input, closingPTag)
input = bytes.TrimSpace(input)
}
return input
}
// isEndOfSentence reports whether r is one of the runes treated as ending a
// sentence: '.', '?', '!', '"' or a newline.
func isEndOfSentence(r rune) bool {
	switch r {
	case '.', '?', '!', '"', '\n':
		return true
	}
	return false
}
// truncateWordsToWholeSentenceOld is the strings.Fields based predecessor of
// TruncateWordsToWholeSentence. Kept only for benchmark.
//
// It joins the whitespace-separated words of content with single spaces and
// cuts after the first word, at or beyond position c.summaryLength, that
// ends with ".", "?", `."` or "!". Without such a word, exactly
// c.summaryLength words are kept. The boolean reports whether words were
// dropped.
func (c *ContentSpec) truncateWordsToWholeSentenceOld(content string) (string, bool) {
	words := strings.Fields(content)
	total := len(words)

	if total <= c.summaryLength {
		return strings.Join(words, " "), false
	}

	tail := words[c.summaryLength:]
	for offset := range tail {
		candidate := tail[offset]
		switch {
		case strings.HasSuffix(candidate, "."),
			strings.HasSuffix(candidate, "?"),
			strings.HasSuffix(candidate, ".\""),
			strings.HasSuffix(candidate, "!"):
			cut := c.summaryLength + offset + 1
			return strings.Join(words[:cut], " "), cut < total
		}
	}

	return strings.Join(words[:c.summaryLength], " "), true
}
// getAsciidocExecPath returns the path exec.LookPath resolves for the
// "asciidoc" executable, or "" when the lookup fails.
func getAsciidocExecPath() string {
	if resolved, err := exec.LookPath("asciidoc"); err == nil {
		return resolved
	}
	return ""
}
// getAsciidoctorExecPath returns the path exec.LookPath resolves for the
// "asciidoctor" executable, or "" when the lookup fails.
func getAsciidoctorExecPath() string {
	if resolved, err := exec.LookPath("asciidoctor"); err == nil {
		return resolved
	}
	return ""
}
// HasAsciidoc reports whether an asciidoctor or asciidoc executable can be
// located. asciidoctor is checked first.
func HasAsciidoc() bool {
	if getAsciidoctorExecPath() != "" {
		return true
	}
	return getAsciidocExecPath() != ""
}
// getAsciidocContent converts the AsciiDoc in ctx.Content to HTML by running
// an external helper, preferring asciidoctor and falling back to asciidoc.
// The content is passed to the helper on stdin.
//
// When neither executable can be located an error is logged and ctx.Content
// is returned unrendered.
func getAsciidocContent(ctx *RenderingContext) []byte {
	path := getAsciidoctorExecPath()
	isAsciidoctor := path != ""

	if !isAsciidoctor {
		path = getAsciidocExecPath()
		if path == "" {
			jww.ERROR.Println("asciidoctor / asciidoc not found in $PATH: Please install.\n",
				" Leaving AsciiDoc content unrendered.")
			return ctx.Content
		}
	}

	jww.INFO.Println("Rendering", ctx.DocumentName, "with", path, "...")

	args := []string{"--no-header-footer", "--safe"}
	if isAsciidoctor {
		// asciidoctor-specific arg to show stack traces on errors
		args = append(args, "--trace")
	}
	// The trailing "-" is the final argument; content is supplied on stdin.
	args = append(args, "-")

	return externallyRenderContent(ctx, path, args)
}
// HasRst reports whether getRstExecPath can locate an rst2html executable.
func HasRst() bool {
	rstPath := getRstExecPath()
	return rstPath != ""
}
// getRstExecPath returns the path exec.LookPath resolves for "rst2html",
// falling back to "rst2html.py". It returns "" when neither lookup succeeds.
func getRstExecPath() string {
	for _, candidate := range []string{"rst2html", "rst2html.py"} {
		if resolved, err := exec.LookPath(candidate); err == nil {
			return resolved
		}
	}
	return ""
}
// getPythonExecPath returns the path exec.LookPath resolves for "python",
// falling back to "python.exe". It returns "" when neither lookup succeeds.
func getPythonExecPath() string {
	for _, candidate := range []string{"python", "python.exe"} {
		if resolved, err := exec.LookPath(candidate); err == nil {
			return resolved
		}
	}
	return ""
}
// getRstContent calls the Python script rst2html as an external helper
// to convert reStructuredText content to HTML.
func getRstContent(ctx *RenderingContext) []byte {
path := getRstExecPath()
if path == "" {
jww.ERROR.Println("rst2html / rst2html.py not found in $PATH: Please install.\n",
" Leaving reStructuredText content unrendered.")
return ctx.Content
}
jww.INFO.Println("Rendering", ctx.DocumentName, "with", path, "...")
var result []byte
// certain *nix based OSs wrap executables in scripted launchers
// invoking binaries on these OSs via python interpreter causes SyntaxError
// invoke directly so that shebangs work as expected
// handle Windows manually because it doesn't do shebangs
if runtime.GOOS == "windows" {
python := getPythonExecPath()
args := []string{path, "--leave-comments", "--initial-header-level=2"}
result = externallyRenderContent(ctx, python, args)
} else {
args := []string{"--leave-comments", "--initial-header-level=2"}
result = externallyRenderContent(ctx, path, args)
}
// TODO(bep) check if rst2html has a body only option.
bodyStart := bytes.Index(result, []byte("
\n"))
if bodyStart < 0 {
bodyStart = -7 //compensate for length
}
bodyEnd := bytes.Index(result, []byte("\n"))
if bodyEnd < 0 || bodyEnd >= len(result) {
bodyEnd = len(result) - 1
if bodyEnd < 0 {
bodyEnd = 0
}
}
return result[bodyStart+7 : bodyEnd]
}
// getPandocContent converts the pandoc markdown in ctx.Content to HTML by
// running the external pandoc executable with the --mathjax option.
//
// When pandoc cannot be located an error is logged and ctx.Content is
// returned unrendered.
func getPandocContent(ctx *RenderingContext) []byte {
	pandoc, lookupErr := exec.LookPath("pandoc")
	if lookupErr != nil {
		jww.ERROR.Println("pandoc not found in $PATH: Please install.\n",
			" Leaving pandoc content unrendered.")
		return ctx.Content
	}

	return externallyRenderContent(ctx, pandoc, []string{"--mathjax"})
}
// orgRender renders the Org content in ctx.Content to HTML with go-org.
//
// Code blocks are highlighted through c.Highlight; if highlighting fails the
// error is logged and the raw source of the block is used. If rendering the
// document fails, the error is logged and ctx.Content is returned as is.
func orgRender(ctx *RenderingContext, c ContentSpec) []byte {
	conf := org.New()
	conf.Log = jww.WARN

	htmlWriter := org.NewHTMLWriter()
	htmlWriter.HighlightCodeBlock = func(source, lang string) string {
		highlighted, err := c.Highlight(source, lang, "")
		if err == nil {
			return highlighted
		}
		jww.ERROR.Printf("Could not highlight source as lang %s. Using raw source.", lang)
		return source
	}

	document := conf.Parse(bytes.NewReader(ctx.Content), ctx.DocumentName)
	rendered, err := document.Write(htmlWriter)
	if err != nil {
		jww.ERROR.Printf("Could not render org: %s. Using unrendered content.", err)
		return ctx.Content
	}

	return []byte(rendered)
}
// externallyRenderContent runs the executable at path with args, feeds it
// ctx.Content on stdin (with the first occurrence of SummaryDivider removed)
// and returns its stdout after normalizeExternalHelperLineFeeds.
//
// Every non-blank line the command writes to stderr is logged as an error,
// as is a failure of the command itself; the captured stdout is returned in
// either case.
func externallyRenderContent(ctx *RenderingContext, path string, args []string) []byte {
	input := bytes.Replace(ctx.Content, SummaryDivider, []byte(""), 1)

	var stdout, stderr bytes.Buffer
	cmd := exec.Command(path, args...)
	cmd.Stdin = bytes.NewReader(input)
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	runErr := cmd.Run()

	// Most external helpers exit w/ non-zero exit code only if severe, i.e.
	// halting errors occurred. -> log stderr output regardless of state of err
	for _, line := range strings.Split(stderr.String(), "\n") {
		if msg := strings.TrimSpace(line); msg != "" {
			jww.ERROR.Printf("%s: %s", ctx.DocumentName, msg)
		}
	}

	if runErr != nil {
		jww.ERROR.Printf("%s rendering %s: %v", path, ctx.DocumentName, runErr)
	}

	return normalizeExternalHelperLineFeeds(stdout.Bytes())
}