diff --git a/common/text/transform.go b/common/text/transform.go new file mode 100644 index 000000000..f59577803 --- /dev/null +++ b/common/text/transform.go @@ -0,0 +1,47 @@ +// Copyright 2019 The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package text + +import ( + "sync" + "unicode" + + "golang.org/x/text/runes" + "golang.org/x/text/transform" + "golang.org/x/text/unicode/norm" +) + +var accentTransformerPool = &sync.Pool{ + New: func() interface{} { + return transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC) + }, +} + +// RemoveAccents removes all accents from b. +func RemoveAccents(b []byte) []byte { + t := accentTransformerPool.Get().(transform.Transformer) + b, _, _ = transform.Bytes(t, b) + t.Reset() + accentTransformerPool.Put(t) + return b +} + +// RemoveAccentsString removes all accents from s. +func RemoveAccentsString(s string) string { + t := accentTransformerPool.Get().(transform.Transformer) + s, _, _ = transform.String(t, s) + t.Reset() + accentTransformerPool.Put(t) + return s +} diff --git a/common/text/transform_test.go b/common/text/transform_test.go new file mode 100644 index 000000000..70b10d149 --- /dev/null +++ b/common/text/transform_test.go @@ -0,0 +1,29 @@ +// Copyright 2019 The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package text + +import ( + "testing" + + qt "github.com/frankban/quicktest" +) + +func TestRemoveAccents(t *testing.T) { + c := qt.New(t) + + c.Assert(string(RemoveAccents([]byte("Resumé"))), qt.Equals, "Resume") + c.Assert(string(RemoveAccents([]byte("Hugo Rocks!"))), qt.Equals, "Hugo Rocks!") + c.Assert(string(RemoveAccentsString("Resumé")), qt.Equals, "Resume") + +} diff --git a/helpers/content.go b/helpers/content.go index 1c780fefe..e61888357 100644 --- a/helpers/content.go +++ b/helpers/content.go @@ -48,8 +48,9 @@ var ( // ContentSpec provides functionality to render markdown content. type ContentSpec struct { - Converters markup.ConverterProvider - MardownConverter converter.Converter // Markdown converter with no document context + Converters markup.ConverterProvider + MardownConverter converter.Converter // Markdown converter with no document context + anchorNameSanitizer converter.AnchorNameSanitizer // SummaryLength is the length of the summary that Hugo extracts from a content. 
summaryLength int @@ -91,6 +92,17 @@ func NewContentSpec(cfg config.Provider, logger *loggers.Logger, contentFs afero return nil, err } spec.MardownConverter = conv + if as, ok := conv.(converter.AnchorNameSanitizer); ok { + spec.anchorNameSanitizer = as + } else { + // Use Goldmark's sanitizer + p := converterProvider.Get("goldmark") + conv, err := p.New(converter.DocumentContext{}) + if err != nil { + return nil, err + } + spec.anchorNameSanitizer = conv.(converter.AnchorNameSanitizer) + } return spec, nil } @@ -192,6 +204,10 @@ func (c *ContentSpec) RenderMarkdown(src []byte) ([]byte, error) { return b.Bytes(), nil } +func (c *ContentSpec) SanitizeAnchorName(s string) string { + return c.anchorNameSanitizer.SanitizeAnchorName(s) +} + func (c *ContentSpec) ResolveMarkup(in string) string { in = strings.ToLower(in) switch in { diff --git a/helpers/path.go b/helpers/path.go index 12ddfeb56..d97789e15 100644 --- a/helpers/path.go +++ b/helpers/path.go @@ -24,6 +24,8 @@ import ( "strings" "unicode" + "github.com/gohugoio/hugo/common/text" + "github.com/gohugoio/hugo/config" "github.com/gohugoio/hugo/hugofs" @@ -31,9 +33,6 @@ import ( "github.com/gohugoio/hugo/common/hugio" _errors "github.com/pkg/errors" "github.com/spf13/afero" - "golang.org/x/text/runes" - "golang.org/x/text/transform" - "golang.org/x/text/unicode/norm" ) var ( @@ -134,6 +133,10 @@ func ishex(c rune) bool { // are also removed. // Spaces will be replaced with a single hyphen, and sequential hyphens will be reduced to one. 
func (p *PathSpec) UnicodeSanitize(s string) string { + if p.RemovePathAccents { + s = text.RemoveAccentsString(s) + } + source := []rune(s) target := make([]rune, 0, len(source)) var prependHyphen bool @@ -154,17 +157,7 @@ func (p *PathSpec) UnicodeSanitize(s string) string { } } - var result string - - if p.RemovePathAccents { - // remove accents - see https://blog.golang.org/normalization - t := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC) - result, _, _ = transform.String(t, string(target)) - } else { - result = string(target) - } - - return result + return string(target) } // ReplaceExtension takes a path and an extension, strips the old extension diff --git a/markup/blackfriday/convert.go b/markup/blackfriday/convert.go index 3df23c7ae..bbbc2b377 100644 --- a/markup/blackfriday/convert.go +++ b/markup/blackfriday/convert.go @@ -60,6 +60,10 @@ type blackfridayConverter struct { cfg converter.ProviderConfig } +func (c *blackfridayConverter) SanitizeAnchorName(s string) string { + return blackfriday.SanitizedAnchorName(s) +} + func (c *blackfridayConverter) AnchorSuffix() string { if c.bf.PlainIDAnchors { return "" @@ -204,5 +208,6 @@ var blackfridayExtensionMap = map[string]int{ } var ( - _ converter.DocumentInfo = (*blackfridayConverter)(nil) + _ converter.DocumentInfo = (*blackfridayConverter)(nil) + _ converter.AnchorNameSanitizer = (*blackfridayConverter)(nil) ) diff --git a/markup/converter/converter.go b/markup/converter/converter.go index a4585bd03..b8a5c92c1 100644 --- a/markup/converter/converter.go +++ b/markup/converter/converter.go @@ -87,6 +87,11 @@ type TableOfContentsProvider interface { TableOfContents() tableofcontents.Root } +// AnchorNameSanitizer tells how a converter sanitizes anchor names. +type AnchorNameSanitizer interface { + SanitizeAnchorName(s string) string +} + // Bytes holds a byte slice and implements the Result interface. 
type Bytes []byte diff --git a/markup/goldmark/autoid.go b/markup/goldmark/autoid.go new file mode 100644 index 000000000..6599f08d9 --- /dev/null +++ b/markup/goldmark/autoid.go @@ -0,0 +1,125 @@ +// Copyright 2019 The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package goldmark + +import ( + "bytes" + "strconv" + "unicode" + "unicode/utf8" + + "github.com/gohugoio/hugo/common/text" + + "github.com/yuin/goldmark/ast" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/util" + + bp "github.com/gohugoio/hugo/bufferpool" +) + +func sanitizeAnchorNameString(s string, asciiOnly bool) string { + return string(sanitizeAnchorName([]byte(s), asciiOnly)) +} + +func sanitizeAnchorName(b []byte, asciiOnly bool) []byte { + return sanitizeAnchorNameWithHook(b, asciiOnly, nil) +} + +func sanitizeAnchorNameWithHook(b []byte, asciiOnly bool, hook func(buf *bytes.Buffer)) []byte { + buf := bp.GetBuffer() + + if asciiOnly { + // Strip accents first so accented letters keep their ASCII base letter instead of being dropped. 
+ b = text.RemoveAccents(b) + } + + for len(b) > 0 { + r, size := utf8.DecodeRune(b) + switch { + case asciiOnly && size != 1: + case isSpace(r): + buf.WriteString("-") + case r == '-' || isAlphaNumeric(r): + buf.WriteRune(unicode.ToLower(r)) + default: + } + + b = b[size:] + } + + if hook != nil { + hook(buf) + } + + result := make([]byte, buf.Len()) + copy(result, buf.Bytes()) + + bp.PutBuffer(buf) + + return result +} + +func isAlphaNumeric(r rune) bool { + return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r) +} + +func isSpace(r rune) bool { + return r == ' ' || r == '\t' +} + +var _ parser.IDs = (*idFactory)(nil) + +type idFactory struct { + asciiOnly bool + vals map[string]struct{} +} + +func newIDFactory(asciiOnly bool) *idFactory { + return &idFactory{ + vals: make(map[string]struct{}), + asciiOnly: asciiOnly, + } +} + +func (ids *idFactory) Generate(value []byte, kind ast.NodeKind) []byte { + return sanitizeAnchorNameWithHook(value, ids.asciiOnly, func(buf *bytes.Buffer) { + if buf.Len() == 0 { + if kind == ast.KindHeading { + buf.WriteString("heading") + } else { + buf.WriteString("id") + } + } + + if _, found := ids.vals[util.BytesToReadOnlyString(buf.Bytes())]; found { + // Append a hyphen and a number, starting with 1. + buf.WriteRune('-') + pos := buf.Len() + for i := 1; ; i++ { + buf.WriteString(strconv.Itoa(i)) + if _, found := ids.vals[util.BytesToReadOnlyString(buf.Bytes())]; !found { + break + } + buf.Truncate(pos) + } + } + + ids.vals[buf.String()] = struct{}{} + + }) +} + +func (ids *idFactory) Put(value []byte) { + ids.vals[util.BytesToReadOnlyString(value)] = struct{}{} +} diff --git a/markup/goldmark/autoid_test.go b/markup/goldmark/autoid_test.go new file mode 100644 index 000000000..915c6a03c --- /dev/null +++ b/markup/goldmark/autoid_test.go @@ -0,0 +1,121 @@ +// Copyright 2019 The Hugo Authors. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package goldmark + +import ( + "strings" + "testing" + + qt "github.com/frankban/quicktest" +) + +func TestSanitizeAnchorName(t *testing.T) { + c := qt.New(t) + + // Tests generated manually on github.com + tests := ` +God is good: 神真美好 +Number 32 +Question? +1+2=3 +Special !"#$%&(parens)=?´* chars +Resumé +One-Hyphen +Multiple--Hyphens +Trailing hyphen- +Many spaces here +Forward/slash +Backward\slash +Under_score +` + + expect := ` +god-is-good-神真美好 +number-32 +question +123 +special-parens-chars +resumé +one-hyphen +multiple--hyphens +trailing-hyphen- +many---spaces--here +forwardslash +backwardslash +under_score +` + + tests, expect = strings.TrimSpace(tests), strings.TrimSpace(expect) + + testlines, expectlines := strings.Split(tests, "\n"), strings.Split(expect, "\n") + + if len(testlines) != len(expectlines) { + panic("test setup failed") + } + + for i, input := range testlines { + input := input + expect := expectlines[i] + c.Run(input, func(c *qt.C) { + b := []byte(input) + got := string(sanitizeAnchorName(b, false)) + c.Assert(got, qt.Equals, expect) + c.Assert(sanitizeAnchorNameString(input, false), qt.Equals, expect) + c.Assert(string(b), qt.Equals, input) + }) + } +} + +func TestSanitizeAnchorNameAsciiOnly(t *testing.T) { + c := qt.New(t) + + c.Assert(sanitizeAnchorNameString("god is神真美好 good", true), qt.Equals, "god-is-good") + c.Assert(sanitizeAnchorNameString("Resumé", true), qt.Equals, 
"resume") + +} + +func BenchmarkSanitizeAnchorName(b *testing.B) { + input := []byte("God is good: 神真美好") + b.ResetTimer() + for i := 0; i < b.N; i++ { + result := sanitizeAnchorName(input, false) + if len(result) != 24 { + b.Fatalf("got %d", len(result)) + + } + } +} + +func BenchmarkSanitizeAnchorNameAsciiOnly(b *testing.B) { + input := []byte("God is good: 神真美好") + b.ResetTimer() + for i := 0; i < b.N; i++ { + result := sanitizeAnchorName(input, true) + if len(result) != 12 { + b.Fatalf("got %d", len(result)) + + } + } +} + +func BenchmarkSanitizeAnchorNameString(b *testing.B) { + input := "God is good: 神真美好" + b.ResetTimer() + for i := 0; i < b.N; i++ { + result := sanitizeAnchorNameString(input, false) + if len(result) != 24 { + b.Fatalf("got %d", len(result)) + } + } +} diff --git a/markup/goldmark/convert.go b/markup/goldmark/convert.go index af204125f..7d50839e2 100644 --- a/markup/goldmark/convert.go +++ b/markup/goldmark/convert.go @@ -50,19 +50,33 @@ type provide struct { func (p provide) New(cfg converter.ProviderConfig) (converter.Provider, error) { md := newMarkdown(cfg) + return converter.NewProvider("goldmark", func(ctx converter.DocumentContext) (converter.Converter, error) { return &goldmarkConverter{ ctx: ctx, cfg: cfg, md: md, + sanitizeAnchorName: func(s string) string { + return sanitizeAnchorNameString(s, cfg.MarkupConfig.Goldmark.Parser.AutoHeadingIDAsciiOnly) + }, }, nil }), nil } +var ( + _ converter.AnchorNameSanitizer = (*goldmarkConverter)(nil) +) + type goldmarkConverter struct { md goldmark.Markdown ctx converter.DocumentContext cfg converter.ProviderConfig + + sanitizeAnchorName func(s string) string +} + +func (c *goldmarkConverter) SanitizeAnchorName(s string) string { + return c.sanitizeAnchorName(s) } func newMarkdown(pcfg converter.ProviderConfig) goldmark.Markdown { @@ -226,7 +240,7 @@ func (c *goldmarkConverter) Convert(ctx converter.RenderContext) (result convert buf := &bufWriter{Buffer: &bytes.Buffer{}} result = buf - pctx 
:= newParserContext(ctx) + pctx := c.newParserContext(ctx) reader := text.NewReader(ctx.Src) doc := c.md.Parser().Parse( @@ -265,8 +279,8 @@ func (c *goldmarkConverter) Supports(feature identity.Identity) bool { return featureSet[feature.GetIdentity()] } -func newParserContext(rctx converter.RenderContext) *parserContext { - ctx := parser.NewContext() +func (c *goldmarkConverter) newParserContext(rctx converter.RenderContext) *parserContext { + ctx := parser.NewContext(parser.WithIDs(newIDFactory(c.cfg.MarkupConfig.Goldmark.Parser.AutoHeadingIDAsciiOnly))) ctx.Set(tocEnableKey, rctx.RenderTOC) return &parserContext{ Context: ctx, diff --git a/markup/goldmark/convert_test.go b/markup/goldmark/convert_test.go index 2a9727606..b9bf01ef5 100644 --- a/markup/goldmark/convert_test.go +++ b/markup/goldmark/convert_test.go @@ -28,6 +28,23 @@ import ( qt "github.com/frankban/quicktest" ) +func convert(c *qt.C, mconf markup_config.Config, content string) converter.Result { + + p, err := Provider.New( + converter.ProviderConfig{ + MarkupConfig: mconf, + Logger: loggers.NewErrorLogger(), + }, + ) + c.Assert(err, qt.IsNil) + conv, err := p.New(converter.DocumentContext{DocumentID: "thedoc"}) + c.Assert(err, qt.IsNil) + b, err := conv.Convert(converter.RenderContext{RenderTOC: true, Src: []byte(content)}) + c.Assert(err, qt.IsNil) + + return b +} + func TestConvert(t *testing.T) { c := qt.New(t) @@ -92,29 +109,23 @@ description : the description for the content. +## 神真美好 + +## 神真美好 + +## 神真美好 + [^1]: And that's the footnote. 
` // Code fences content = strings.Replace(content, "§§§", "```", -1) - mconf := markup_config.Default mconf.Highlight.NoClasses = false mconf.Goldmark.Renderer.Unsafe = true - p, err := Provider.New( - converter.ProviderConfig{ - MarkupConfig: mconf, - Logger: loggers.NewErrorLogger(), - }, - ) - c.Assert(err, qt.IsNil) - conv, err := p.New(converter.DocumentContext{DocumentID: "thedoc"}) - c.Assert(err, qt.IsNil) - b, err := conv.Convert(converter.RenderContext{RenderTOC: true, Src: []byte(content)}) - c.Assert(err, qt.IsNil) - + b := convert(c, mconf, content) got := string(b.Bytes()) // Links @@ -123,6 +134,9 @@ description // Header IDs c.Assert(got, qt.Contains, `

Custom ID

`, qt.Commentf(got)) c.Assert(got, qt.Contains, `

Auto ID

`, qt.Commentf(got)) + c.Assert(got, qt.Contains, `

神真美好

`, qt.Commentf(got)) + c.Assert(got, qt.Contains, `

神真美好

`, qt.Commentf(got)) + c.Assert(got, qt.Contains, `

神真美好

`, qt.Commentf(got)) // Code fences c.Assert(got, qt.Contains, "
LINE1\n
") @@ -148,6 +162,20 @@ description } +func TestConvertAutoIDAsciiOnly(t *testing.T) { + c := qt.New(t) + + content := ` +## God is Good: 神真美好 +` + mconf := markup_config.Default + mconf.Goldmark.Parser.AutoHeadingIDAsciiOnly = true + b := convert(c, mconf, content) + got := string(b.Bytes()) + + c.Assert(got, qt.Contains, "

") +} + func TestCodeFence(t *testing.T) { c := qt.New(t) diff --git a/markup/goldmark/goldmark_config/config.go b/markup/goldmark/goldmark_config/config.go index bf18a384d..2454eb46f 100644 --- a/markup/goldmark/goldmark_config/config.go +++ b/markup/goldmark/goldmark_config/config.go @@ -69,6 +69,10 @@ type Parser struct { // auto generated heading ids. AutoHeadingID bool + // When AutoHeadingID is enabled this will generate IDs with Ascii + // characters only. + AutoHeadingIDAsciiOnly bool + // Enables custom attributes. Attribute bool } diff --git a/tpl/urls/urls.go b/tpl/urls/urls.go index 5bae411b3..ee0e55501 100644 --- a/tpl/urls/urls.go +++ b/tpl/urls/urls.go @@ -25,7 +25,6 @@ import ( "github.com/gohugoio/hugo/common/urls" "github.com/gohugoio/hugo/deps" _errors "github.com/pkg/errors" - "github.com/russross/blackfriday" "github.com/spf13/cast" ) @@ -90,7 +89,7 @@ func (ns *Namespace) Anchorize(a interface{}) (string, error) { if err != nil { return "", nil } - return blackfriday.SanitizedAnchorName(s), nil + return ns.deps.ContentSpec.SanitizeAnchorName(s), nil } // Ref returns the absolute URL path to a given content item.