transform/urlreplacers: Support unquoted URLs in canonifyURLs replacer

Fixes #5529
This commit is contained in:
Bjørn Erik Pedersen 2018-12-17 14:25:00 +01:00
parent d5a0b6bbbc
commit efe0b4e5c0
2 changed files with 130 additions and 104 deletions

View file

@ -16,6 +16,7 @@ package urlreplacers
import (
"bytes"
"io"
"unicode"
"unicode/utf8"
"github.com/gohugoio/hugo/transform"
@ -43,7 +44,7 @@ type absurllexer struct {
start int // item start position
width int // width of last element
matchers []absURLMatcher
quotes [][]byte
ms matchState
matches [3]bool // track matches of the 3 prefixes
@ -140,84 +141,115 @@ func (l *absurllexer) emit() {
l.start = l.pos
}
var (
relURLPrefix = []byte("/")
relURLPrefixLen = len(relURLPrefix)
)
func (l *absurllexer) consumeQuote() []byte {
for _, q := range l.quotes {
if bytes.HasPrefix(l.content[l.pos:], q) {
l.pos += len(q)
l.emit()
return q
}
}
return nil
}
// handle URLs in src and href.
func checkCandidateBase(l *absurllexer) {
for _, m := range l.matchers {
if !bytes.HasPrefix(l.content[l.pos:], m.match) {
continue
}
// check for schemaless URLs
posAfter := l.pos + len(m.match)
if posAfter >= len(l.content) {
return
}
r, _ := utf8.DecodeRune(l.content[posAfter:])
if r == '/' {
// schemaless: skip
return
}
if l.pos > l.start {
l.emit()
}
l.pos += len(m.match)
l.w.Write(m.quote)
l.w.Write(l.path)
l.start = l.pos
l.consumeQuote()
if !bytes.HasPrefix(l.content[l.pos:], relURLPrefix) {
return
}
// check for schemaless URLs
posAfter := l.pos + relURLPrefixLen
if posAfter >= len(l.content) {
return
}
r, _ := utf8.DecodeRune(l.content[posAfter:])
if r == '/' {
// schemaless: skip
return
}
if l.pos > l.start {
l.emit()
}
l.pos += relURLPrefixLen
l.w.Write(l.path)
l.start = l.pos
}
func (l *absurllexer) posAfterURL(q []byte) int {
if len(q) > 0 {
// look for end quote
return bytes.Index(l.content[l.pos:], q)
}
return bytes.IndexFunc(l.content[l.pos:], func(r rune) bool {
return r == '>' || unicode.IsSpace(r)
})
}
// handle URLs in srcset.
func checkCandidateSrcset(l *absurllexer) {
// special case, not frequent (me think)
for _, m := range l.matchers {
if !bytes.HasPrefix(l.content[l.pos:], m.match) {
continue
}
// check for schemaless URLs
posAfter := l.pos + len(m.match)
if posAfter >= len(l.content) {
return
}
r, _ := utf8.DecodeRune(l.content[posAfter:])
if r == '/' {
// schemaless: skip
continue
}
posLastQuote := bytes.Index(l.content[l.pos+1:], m.quote)
// safe guard
if posLastQuote < 0 || posLastQuote > 2000 {
return
}
if l.pos > l.start {
l.emit()
}
section := l.content[l.pos+len(m.quote) : l.pos+posLastQuote+1]
fields := bytes.Fields(section)
l.w.Write(m.quote)
for i, f := range fields {
if f[0] == '/' {
l.w.Write(l.path)
l.w.Write(f[1:])
} else {
l.w.Write(f)
}
if i < len(fields)-1 {
l.w.Write([]byte(" "))
}
}
l.w.Write(m.quote)
l.pos += len(section) + (len(m.quote) * 2)
l.start = l.pos
q := l.consumeQuote()
if q == nil {
// srcset needs to be quoted.
return
}
// special case, not frequent (me think)
if !bytes.HasPrefix(l.content[l.pos:], relURLPrefix) {
return
}
// check for schemaless URLs
posAfter := l.pos + relURLPrefixLen
if posAfter >= len(l.content) {
return
}
r, _ := utf8.DecodeRune(l.content[posAfter:])
if r == '/' {
// schemaless: skip
return
}
posEnd := l.posAfterURL(q)
// safe guard
if posEnd < 0 || posEnd > 2000 {
return
}
if l.pos > l.start {
l.emit()
}
section := l.content[l.pos : l.pos+posEnd+1]
fields := bytes.Fields(section)
for i, f := range fields {
if f[0] == '/' {
l.w.Write(l.path)
l.w.Write(f[1:])
} else {
l.w.Write(f)
}
if i < len(fields)-1 {
l.w.Write([]byte(" "))
}
}
l.pos += len(section)
l.start = l.pos
}
// main loop
@ -262,53 +294,32 @@ func (l *absurllexer) replace() {
}
}
func doReplace(path string, ct transform.FromTo, matchers []absURLMatcher) {
func doReplace(path string, ct transform.FromTo, quotes [][]byte) {
lexer := &absurllexer{
content: ct.From().Bytes(),
w: ct.To(),
path: []byte(path),
matchers: matchers}
content: ct.From().Bytes(),
w: ct.To(),
path: []byte(path),
quotes: quotes}
lexer.replace()
}
type absURLReplacer struct {
htmlMatchers []absURLMatcher
xmlMatchers []absURLMatcher
htmlQuotes [][]byte
xmlQuotes [][]byte
}
func newAbsURLReplacer() *absURLReplacer {
// HTML
dqHTMLMatch := []byte("\"/")
sqHTMLMatch := []byte("'/")
// XML
dqXMLMatch := []byte("&#34;/")
sqXMLMatch := []byte("&#39;/")
dqHTML := []byte("\"")
sqHTML := []byte("'")
dqXML := []byte("&#34;")
sqXML := []byte("&#39;")
return &absURLReplacer{
htmlMatchers: []absURLMatcher{
{dqHTMLMatch, dqHTML},
{sqHTMLMatch, sqHTML},
},
xmlMatchers: []absURLMatcher{
{dqXMLMatch, dqXML},
{sqXMLMatch, sqXML},
}}
htmlQuotes: [][]byte{[]byte("\""), []byte("'")},
xmlQuotes: [][]byte{[]byte("&#34;"), []byte("&#39;")}}
}
func (au *absURLReplacer) replaceInHTML(path string, ct transform.FromTo) {
doReplace(path, ct, au.htmlMatchers)
doReplace(path, ct, au.htmlQuotes)
}
func (au *absURLReplacer) replaceInXML(path string, ct transform.FromTo) {
doReplace(path, ct, au.xmlMatchers)
doReplace(path, ct, au.xmlQuotes)
}

View file

@ -156,6 +156,21 @@ func TestAbsURL(t *testing.T) {
}
func TestAbsURLUnqoted(t *testing.T) {
tr := transform.New(NewAbsURLTransformer(testBaseURL))
apply(t.Errorf, tr, []test{
test{
content: `Link: <a href=/asdf>ASDF</a>`,
expected: `Link: <a href=http://base/asdf>ASDF</a>`,
},
test{
content: `Link: <a href=/asdf >ASDF</a>`,
expected: `Link: <a href=http://base/asdf >ASDF</a>`,
},
})
}
func TestRelativeURL(t *testing.T) {
tr := transform.New(NewAbsURLTransformer(helpers.GetDottedRelativePath(filepath.FromSlash("/post/sub/"))))
@ -176,7 +191,7 @@ func TestAbsXMLURLSrcSet(t *testing.T) {
}
func BenchmarkXMLAbsURL(b *testing.B) {
tr := transform.New(NewAbsURLInXMLTransformer(""))
tr := transform.New(NewAbsURLInXMLTransformer(testBaseURL))
b.ResetTimer()
for i := 0; i < b.N; i++ {