markup/goldmark: Make auto IDs GitHub compatible

You can turn off this behaviour and generate ASCII-only IDs instead:

```toml
[markup]
  [markup.goldmark]
    [markup.goldmark.parser]
      autoHeadingIDAsciiOnly = true
```
Note that the `anchorize` template function now adapts its behaviour depending on the default Markdown handler.

Fixes #6616
This commit is contained in:
Bjørn Erik Pedersen 2020-01-04 11:28:19 +01:00
parent ae816452b1
commit a82d2700fc
No known key found for this signature in database
GPG key ID: 330E6E2BD4859D8F
12 changed files with 421 additions and 35 deletions

47
common/text/transform.go Normal file
View file

@ -0,0 +1,47 @@
// Copyright 2019 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package text
import (
"sync"
"unicode"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
)
var accentTransformerPool = &sync.Pool{
New: func() interface{} {
return transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
},
}
// RemoveAccents returns a copy of b with all accents removed.
// Any error reported by the transformer is ignored.
func RemoveAccents(b []byte) []byte {
	tr := accentTransformerPool.Get().(transform.Transformer)
	// Reset the transformer and hand it back to the pool once we are done.
	defer func() {
		tr.Reset()
		accentTransformerPool.Put(tr)
	}()

	stripped, _, _ := transform.Bytes(tr, b)
	return stripped
}
// RemoveAccentsString returns s with all accents removed.
// Any error reported by the transformer is ignored.
func RemoveAccentsString(s string) string {
	tr := accentTransformerPool.Get().(transform.Transformer)
	// Reset the transformer and hand it back to the pool once we are done.
	defer func() {
		tr.Reset()
		accentTransformerPool.Put(tr)
	}()

	stripped, _, _ := transform.String(tr, s)
	return stripped
}

View file

@ -0,0 +1,29 @@
// Copyright 2019 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package text
import (
"testing"
qt "github.com/frankban/quicktest"
)
// TestRemoveAccents checks both the byte and the string variant.
func TestRemoveAccents(t *testing.T) {
	c := qt.New(t)

	for _, tc := range []struct{ in, want string }{
		{"Resumé", "Resume"},
		{"Hugo Rocks!", "Hugo Rocks!"},
	} {
		c.Assert(string(RemoveAccents([]byte(tc.in))), qt.Equals, tc.want)
	}

	c.Assert(RemoveAccentsString("Resumé"), qt.Equals, "Resume")
}

View file

@ -50,6 +50,7 @@ var (
type ContentSpec struct {
Converters markup.ConverterProvider
MardownConverter converter.Converter // Markdown converter with no document context
anchorNameSanitizer converter.AnchorNameSanitizer
// SummaryLength is the length of the summary that Hugo extracts from a content.
summaryLength int
@ -91,6 +92,17 @@ func NewContentSpec(cfg config.Provider, logger *loggers.Logger, contentFs afero
return nil, err
}
spec.MardownConverter = conv
if as, ok := conv.(converter.AnchorNameSanitizer); ok {
spec.anchorNameSanitizer = as
} else {
// Use Goldmark's sanitizer
p := converterProvider.Get("goldmark")
conv, err := p.New(converter.DocumentContext{})
if err != nil {
return nil, err
}
spec.anchorNameSanitizer = conv.(converter.AnchorNameSanitizer)
}
return spec, nil
}
@ -192,6 +204,10 @@ func (c *ContentSpec) RenderMarkdown(src []byte) ([]byte, error) {
return b.Bytes(), nil
}
// SanitizeAnchorName sanitizes s by delegating to the anchor name
// sanitizer stored on c.
func (c *ContentSpec) SanitizeAnchorName(s string) string {
return c.anchorNameSanitizer.SanitizeAnchorName(s)
}
func (c *ContentSpec) ResolveMarkup(in string) string {
in = strings.ToLower(in)
switch in {

View file

@ -24,6 +24,8 @@ import (
"strings"
"unicode"
"github.com/gohugoio/hugo/common/text"
"github.com/gohugoio/hugo/config"
"github.com/gohugoio/hugo/hugofs"
@ -31,9 +33,6 @@ import (
"github.com/gohugoio/hugo/common/hugio"
_errors "github.com/pkg/errors"
"github.com/spf13/afero"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
)
var (
@ -134,6 +133,10 @@ func ishex(c rune) bool {
// are also removed.
// Spaces will be replaced with a single hyphen, and sequential hyphens will be reduced to one.
func (p *PathSpec) UnicodeSanitize(s string) string {
if p.RemovePathAccents {
s = text.RemoveAccentsString(s)
}
source := []rune(s)
target := make([]rune, 0, len(source))
var prependHyphen bool
@ -154,17 +157,7 @@ func (p *PathSpec) UnicodeSanitize(s string) string {
}
}
var result string
if p.RemovePathAccents {
// remove accents - see https://blog.golang.org/normalization
t := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
result, _, _ = transform.String(t, string(target))
} else {
result = string(target)
}
return result
return string(target)
}
// ReplaceExtension takes a path and an extension, strips the old extension

View file

@ -60,6 +60,10 @@ type blackfridayConverter struct {
cfg converter.ProviderConfig
}
// SanitizeAnchorName returns s as sanitized by blackfriday.SanitizedAnchorName.
func (c *blackfridayConverter) SanitizeAnchorName(s string) string {
return blackfriday.SanitizedAnchorName(s)
}
func (c *blackfridayConverter) AnchorSuffix() string {
if c.bf.PlainIDAnchors {
return ""
@ -205,4 +209,5 @@ var blackfridayExtensionMap = map[string]int{
// Compile-time checks that *blackfridayConverter implements these interfaces.
var (
_ converter.DocumentInfo = (*blackfridayConverter)(nil)
_ converter.AnchorNameSanitizer = (*blackfridayConverter)(nil)
)

View file

@ -87,6 +87,11 @@ type TableOfContentsProvider interface {
TableOfContents() tableofcontents.Root
}
// AnchorNameSanitizer tells how a converter sanitizes anchor names.
type AnchorNameSanitizer interface {
// SanitizeAnchorName returns the sanitized form of s.
SanitizeAnchorName(s string) string
}
// Bytes holds a byte slice and implements the Result interface.
type Bytes []byte

125
markup/goldmark/autoid.go Normal file
View file

@ -0,0 +1,125 @@
// Copyright 2019 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package goldmark
import (
"bytes"
"strconv"
"unicode"
"unicode/utf8"
"github.com/gohugoio/hugo/common/text"
"github.com/yuin/goldmark/ast"
"github.com/yuin/goldmark/parser"
"github.com/yuin/goldmark/util"
bp "github.com/gohugoio/hugo/bufferpool"
)
// sanitizeAnchorNameString is the string variant of sanitizeAnchorName:
// it converts s to bytes, sanitizes it and converts the result back.
func sanitizeAnchorNameString(s string, asciiOnly bool) string {
return string(sanitizeAnchorName([]byte(s), asciiOnly))
}
// sanitizeAnchorName sanitizes b via sanitizeAnchorNameWithHook without
// installing a hook.
func sanitizeAnchorName(b []byte, asciiOnly bool) []byte {
return sanitizeAnchorNameWithHook(b, asciiOnly, nil)
}
// sanitizeAnchorNameWithHook builds an anchor name from b and returns it
// as a freshly allocated slice.
//
// Spaces and tabs become hyphens, hyphens are kept, letters, digits and
// underscores are kept in lower case, and every other rune is dropped.
// With asciiOnly set, accents are removed first and any rune that is not
// encoded as a single byte is dropped as well.
//
// If hook is non-nil it is called with the buffer holding the sanitized
// name before the result is copied out, so it may amend the name.
func sanitizeAnchorNameWithHook(b []byte, asciiOnly bool, hook func(buf *bytes.Buffer)) []byte {
	buf := bp.GetBuffer()
	defer bp.PutBuffer(buf)

	if asciiOnly {
		// Strip the accents up front so that the base letters survive
		// the multi-byte filter below.
		b = text.RemoveAccents(b)
	}

	for pos := 0; pos < len(b); {
		r, width := utf8.DecodeRune(b[pos:])
		pos += width

		if asciiOnly && width != 1 {
			continue
		}
		if isSpace(r) {
			buf.WriteString("-")
			continue
		}
		if r == '-' || isAlphaNumeric(r) {
			buf.WriteRune(unicode.ToLower(r))
		}
	}

	if hook != nil {
		hook(buf)
	}

	// The buffer goes back to the pool, so hand out a copy.
	out := make([]byte, buf.Len())
	copy(out, buf.Bytes())
	return out
}
// isAlphaNumeric reports whether r is an underscore, a Unicode letter or
// a Unicode decimal digit.
func isAlphaNumeric(r rune) bool {
	switch {
	case r == '_':
		return true
	case unicode.IsLetter(r):
		return true
	default:
		return unicode.IsDigit(r)
	}
}
// isSpace reports whether r is a space or a tab.
func isSpace(r rune) bool {
	switch r {
	case ' ', '\t':
		return true
	}
	return false
}
// Compile-time check that *idFactory implements parser.IDs.
var _ parser.IDs = (*idFactory)(nil)
// idFactory generates element IDs and keeps track of the ones in use.
type idFactory struct {
// asciiOnly is passed on to the anchor name sanitizer by Generate.
asciiOnly bool
// vals is the set of IDs recorded so far by Generate and Put.
vals map[string]struct{}
}
// newIDFactory returns an idFactory with an empty set of recorded IDs and
// the given asciiOnly setting.
func newIDFactory(asciiOnly bool) *idFactory {
	f := new(idFactory)
	f.asciiOnly = asciiOnly
	f.vals = make(map[string]struct{})
	return f
}
// Generate returns an ID for value that has not been recorded before and
// records it.
//
// The ID is the sanitized form of value. If that is empty, "heading" is
// used for heading nodes and "id" for every other kind. If the ID is
// already recorded, a hyphen and the lowest free number, starting at 1,
// is appended.
func (ids *idFactory) Generate(value []byte, kind ast.NodeKind) []byte {
	return sanitizeAnchorNameWithHook(value, ids.asciiOnly, func(buf *bytes.Buffer) {
		if buf.Len() == 0 {
			fallback := "id"
			if kind == ast.KindHeading {
				fallback = "heading"
			}
			buf.WriteString(fallback)
		}

		// taken reports whether the current buffer content is a recorded ID.
		// The no-copy string is only used for the lookup, never stored.
		taken := func() bool {
			_, ok := ids.vals[util.BytesToReadOnlyString(buf.Bytes())]
			return ok
		}

		if taken() {
			// Append a hyphen and a number, starting with 1.
			buf.WriteRune('-')
			base := buf.Len()
			for n := 1; ; n++ {
				// Drop the previously tried number, if any.
				buf.Truncate(base)
				buf.WriteString(strconv.Itoa(n))
				if !taken() {
					break
				}
			}
		}

		ids.vals[buf.String()] = struct{}{}
	})
}
// Put records value as an ID that is in use, so that Generate will not
// hand it out again.
func (ids *idFactory) Put(value []byte) {
	// Store a copy of the bytes. A no-copy string would keep aliasing the
	// caller's slice as a map key, which breaks the map if that memory is
	// ever modified or reused. Generate stores copied keys as well.
	ids.vals[string(value)] = struct{}{}
}

View file

@ -0,0 +1,121 @@
// Copyright 2019 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package goldmark
import (
"strings"
"testing"
qt "github.com/frankban/quicktest"
)
// TestSanitizeAnchorName checks the non-ASCII-only sanitizer against IDs
// that were produced by github.com for the same headings. Line i of tests
// is expected to sanitize to line i of expect.
func TestSanitizeAnchorName(t *testing.T) {
	c := qt.New(t)

	// Tests generated manually on github.com
	// Note that every single space maps to one hyphen, so the number of
	// spaces in "Many   spaces  here" is significant.
	tests := `
God is good: 神真美好
Number 32
Question?
1+2=3
Special !"#$%&(parens)=?´* chars
Resumé
One-Hyphen
Multiple--Hyphens
Trailing hyphen-
Many   spaces  here
Forward/slash
Backward\slash
Under_score
`

	expect := `
god-is-good-神真美好
number-32
question
123
special-parens-chars
resumé
one-hyphen
multiple--hyphens
trailing-hyphen-
many---spaces--here
forwardslash
backwardslash
under_score
`

	tests, expect = strings.TrimSpace(tests), strings.TrimSpace(expect)

	testlines, expectlines := strings.Split(tests, "\n"), strings.Split(expect, "\n")

	if len(testlines) != len(expectlines) {
		c.Fatalf("test setup failed: %d inputs, %d expectations", len(testlines), len(expectlines))
	}

	for i, input := range testlines {
		input := input
		expect := expectlines[i]
		c.Run(input, func(c *qt.C) {
			b := []byte(input)
			got := string(sanitizeAnchorName(b, false))
			c.Assert(got, qt.Equals, expect)
			c.Assert(sanitizeAnchorNameString(input, false), qt.Equals, expect)
			// The sanitizer must not modify its input.
			c.Assert(string(b), qt.Equals, input)
		})
	}
}
// TestSanitizeAnchorNameAsciiOnly checks the sanitizer in ASCII-only mode.
func TestSanitizeAnchorNameAsciiOnly(t *testing.T) {
	c := qt.New(t)

	for _, tc := range []struct{ in, want string }{
		{"god is神真美好 good", "god-is-good"},
		{"Resumé", "resume"},
	} {
		c.Assert(sanitizeAnchorNameString(tc.in, true), qt.Equals, tc.want)
	}
}
// BenchmarkSanitizeAnchorName measures the byte variant with Unicode kept.
func BenchmarkSanitizeAnchorName(b *testing.B) {
	const wantLen = 24
	src := []byte("God is good: 神真美好")

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		if got := len(sanitizeAnchorName(src, false)); got != wantLen {
			b.Fatalf("got %d", got)
		}
	}
}
// BenchmarkSanitizeAnchorNameAsciiOnly measures the byte variant in
// ASCII-only mode.
func BenchmarkSanitizeAnchorNameAsciiOnly(b *testing.B) {
	const wantLen = 12
	src := []byte("God is good: 神真美好")

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		if got := len(sanitizeAnchorName(src, true)); got != wantLen {
			b.Fatalf("got %d", got)
		}
	}
}
// BenchmarkSanitizeAnchorNameString measures the string variant with
// Unicode kept.
func BenchmarkSanitizeAnchorNameString(b *testing.B) {
	const wantLen = 24
	src := "God is good: 神真美好"

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		if got := len(sanitizeAnchorNameString(src, false)); got != wantLen {
			b.Fatalf("got %d", got)
		}
	}
}

View file

@ -50,19 +50,33 @@ type provide struct {
// New returns the "goldmark" converter provider for cfg. All converters
// it creates share one Markdown instance built from cfg.
func (p provide) New(cfg converter.ProviderConfig) (converter.Provider, error) {
	md := newMarkdown(cfg)

	// sanitize reads the ASCII-only setting from cfg on every call.
	sanitize := func(s string) string {
		return sanitizeAnchorNameString(s, cfg.MarkupConfig.Goldmark.Parser.AutoHeadingIDAsciiOnly)
	}

	newConverter := func(ctx converter.DocumentContext) (converter.Converter, error) {
		conv := &goldmarkConverter{
			ctx:                ctx,
			cfg:                cfg,
			md:                 md,
			sanitizeAnchorName: sanitize,
		}
		return conv, nil
	}

	return converter.NewProvider("goldmark", newConverter), nil
}
// Compile-time check that *goldmarkConverter implements
// converter.AnchorNameSanitizer.
var (
_ converter.AnchorNameSanitizer = (*goldmarkConverter)(nil)
)
// goldmarkConverter is the converter created by the "goldmark" provider.
type goldmarkConverter struct {
md goldmark.Markdown
ctx converter.DocumentContext
cfg converter.ProviderConfig
// sanitizeAnchorName is the function the SanitizeAnchorName method
// delegates to.
sanitizeAnchorName func(s string) string
}
// SanitizeAnchorName sanitizes s using the function stored in
// c.sanitizeAnchorName.
func (c *goldmarkConverter) SanitizeAnchorName(s string) string {
return c.sanitizeAnchorName(s)
}
func newMarkdown(pcfg converter.ProviderConfig) goldmark.Markdown {
@ -226,7 +240,7 @@ func (c *goldmarkConverter) Convert(ctx converter.RenderContext) (result convert
buf := &bufWriter{Buffer: &bytes.Buffer{}}
result = buf
pctx := newParserContext(ctx)
pctx := c.newParserContext(ctx)
reader := text.NewReader(ctx.Src)
doc := c.md.Parser().Parse(
@ -265,8 +279,8 @@ func (c *goldmarkConverter) Supports(feature identity.Identity) bool {
return featureSet[feature.GetIdentity()]
}
func newParserContext(rctx converter.RenderContext) *parserContext {
ctx := parser.NewContext()
func (c *goldmarkConverter) newParserContext(rctx converter.RenderContext) *parserContext {
ctx := parser.NewContext(parser.WithIDs(newIDFactory(c.cfg.MarkupConfig.Goldmark.Parser.AutoHeadingIDAsciiOnly)))
ctx.Set(tocEnableKey, rctx.RenderTOC)
return &parserContext{
Context: ctx,

View file

@ -28,6 +28,23 @@ import (
qt "github.com/frankban/quicktest"
)
// convert renders content with a converter built from mconf and returns
// the result. Any error on the way fails the test.
func convert(c *qt.C, mconf markup_config.Config, content string) converter.Result {
	pcfg := converter.ProviderConfig{
		MarkupConfig: mconf,
		Logger:       loggers.NewErrorLogger(),
	}

	provider, err := Provider.New(pcfg)
	c.Assert(err, qt.IsNil)

	conv, err := provider.New(converter.DocumentContext{DocumentID: "thedoc"})
	c.Assert(err, qt.IsNil)

	rctx := converter.RenderContext{RenderTOC: true, Src: []byte(content)}
	result, err := conv.Convert(rctx)
	c.Assert(err, qt.IsNil)

	return result
}
func TestConvert(t *testing.T) {
c := qt.New(t)
@ -92,29 +109,23 @@ description
: the description for the content.
## 神真美好
## 神真美好
## 神真美好
[^1]: And that's the footnote.
`
// Code fences
content = strings.Replace(content, "§§§", "```", -1)
mconf := markup_config.Default
mconf.Highlight.NoClasses = false
mconf.Goldmark.Renderer.Unsafe = true
p, err := Provider.New(
converter.ProviderConfig{
MarkupConfig: mconf,
Logger: loggers.NewErrorLogger(),
},
)
c.Assert(err, qt.IsNil)
conv, err := p.New(converter.DocumentContext{DocumentID: "thedoc"})
c.Assert(err, qt.IsNil)
b, err := conv.Convert(converter.RenderContext{RenderTOC: true, Src: []byte(content)})
c.Assert(err, qt.IsNil)
b := convert(c, mconf, content)
got := string(b.Bytes())
// Links
@ -123,6 +134,9 @@ description
// Header IDs
c.Assert(got, qt.Contains, `<h2 id="custom">Custom ID</h2>`, qt.Commentf(got))
c.Assert(got, qt.Contains, `<h2 id="auto-id">Auto ID</h2>`, qt.Commentf(got))
c.Assert(got, qt.Contains, `<h2 id="神真美好">神真美好</h2>`, qt.Commentf(got))
c.Assert(got, qt.Contains, `<h2 id="神真美好-1">神真美好</h2>`, qt.Commentf(got))
c.Assert(got, qt.Contains, `<h2 id="神真美好-2">神真美好</h2>`, qt.Commentf(got))
// Code fences
c.Assert(got, qt.Contains, "<div class=\"highlight\"><pre class=\"chroma\"><code class=\"language-bash\" data-lang=\"bash\">LINE1\n</code></pre></div>")
@ -148,6 +162,20 @@ description
}
// TestConvertAutoIDAsciiOnly checks that non-ASCII characters are left
// out of the generated heading ID when AutoHeadingIDAsciiOnly is set.
func TestConvertAutoIDAsciiOnly(t *testing.T) {
	c := qt.New(t)

	const content = `
## God is Good: 神真美好
`
	cfg := markup_config.Default
	cfg.Goldmark.Parser.AutoHeadingIDAsciiOnly = true

	rendered := convert(c, cfg, content)
	html := string(rendered.Bytes())

	c.Assert(html, qt.Contains, "<h2 id=\"god-is-good-\">")
}
func TestCodeFence(t *testing.T) {
c := qt.New(t)

View file

@ -69,6 +69,10 @@ type Parser struct {
// auto generated heading ids.
AutoHeadingID bool
// When AutoHeadingID is enabled this will generate IDs with Ascii
// characters only.
AutoHeadingIDAsciiOnly bool
// Enables custom attributes.
Attribute bool
}

View file

@ -25,7 +25,6 @@ import (
"github.com/gohugoio/hugo/common/urls"
"github.com/gohugoio/hugo/deps"
_errors "github.com/pkg/errors"
"github.com/russross/blackfriday"
"github.com/spf13/cast"
)
@ -90,7 +89,7 @@ func (ns *Namespace) Anchorize(a interface{}) (string, error) {
if err != nil {
return "", nil
}
return blackfriday.SanitizedAnchorName(s), nil
return ns.deps.ContentSpec.SanitizeAnchorName(s), nil
}
// Ref returns the absolute URL path to a given content item.