canonifyurls in srcset

Speed is about the same as before, and it uses slightly less memory:

```
benchmark              old ns/op     new ns/op     delta
BenchmarkAbsURL        17302         17713         +2.38%
BenchmarkXMLAbsURL     9463          9470          +0.07%

benchmark              old allocs     new allocs     delta
BenchmarkAbsURL        28             24             -14.29%
BenchmarkXMLAbsURL     14             12             -14.29%

benchmark              old bytes     new bytes     delta
BenchmarkAbsURL        3422          3144          -8.12%
BenchmarkXMLAbsURL     1985          1864          -6.10%
```

Fixes #1059
This commit is contained in:
bep 2015-05-03 19:54:17 +02:00
parent be6482603e
commit bee6a69846
2 changed files with 254 additions and 104 deletions

View file

@ -29,69 +29,98 @@ type contentlexer struct {
start int // item start position
width int // width of last element
matchers []absURLMatcher
state stateFunc
prefixLookup *prefixes
matchers []absURLMatcher
state stateFunc
ms matchState
matches [3]bool // track matches of the 3 prefixes
i int // last index in matches checked
w io.Writer
}
type stateFunc func(*contentlexer) stateFunc
type prefixRunes []rune
type prefixes struct {
pr []prefixRunes
curr prefixRunes // current prefix lookup table
i int // current index
// first rune in potential match
first rune
// match-state:
// none, whitespace, partial, full
ms matchState
type prefix struct {
r []rune
f func(l *contentlexer)
}
var prefixes = []*prefix{
&prefix{r: []rune{'s', 'r', 'c', '='}, f: checkCandidateSrc},
&prefix{r: []rune{'s', 'r', 'c', 's', 'e', 't', '='}, f: checkCandidateSrcset},
&prefix{r: []rune{'h', 'r', 'e', 'f', '='}, f: checkCandidateHref}}
type absURLMatcher struct {
prefix int
match []byte
quote []byte
replacementURL []byte
}
// match returns partial and full match for the prefix in play
// - it's a full match if all prefix runes has checked out in row
// - it's a partial match if it's on its way towards a full match
func (l *contentlexer) match(r rune) {
p := l.prefixLookup
if p.curr == nil {
// assumes prefixes all start off on a different rune
// works in this special case: href, src
p.i = 0
for _, pr := range p.pr {
if pr[p.i] == r {
fullMatch := len(p.pr) == 1
p.first = r
if !fullMatch {
p.curr = pr
l.prefixLookup.ms = matchStatePartial
} else {
l.prefixLookup.ms = matchStateFull
var found bool
// note, the prefixes can start off on the same foot, i.e.
// src and srcset.
if l.ms == matchStateWhitespace {
l.i = 0
for j, p := range prefixes {
if r == p.r[l.i] {
l.matches[j] = true
found = true
if l.checkMatchState(r, j) {
return
}
return
}
}
} else {
p.i++
if p.curr[p.i] == r {
fullMatch := len(p.curr) == p.i+1
if fullMatch {
p.curr = nil
l.prefixLookup.ms = matchStateFull
} else {
l.prefixLookup.ms = matchStatePartial
l.matches[j] = false
}
return
}
p.curr = nil
if !found {
l.ms = matchStateNone
}
return
}
l.prefixLookup.ms = matchStateNone
l.i++
for j, m := range l.matches {
// still a match?
if m {
if prefixes[j].r[l.i] == r {
found = true
if l.checkMatchState(r, j) {
return
}
} else {
l.matches[j] = false
}
}
}
if found {
return
}
l.ms = matchStateNone
}
func (l *contentlexer) checkMatchState(r rune, idx int) bool {
if r == '=' {
l.ms = matchStateFull
for k := range l.matches {
if k != idx {
l.matches[k] = false
}
}
return true
}
l.ms = matchStatePartial
return false
}
func (l *contentlexer) emit() {
@ -99,49 +128,108 @@ func (l *contentlexer) emit() {
l.start = l.pos
}
var mainPrefixRunes = []prefixRunes{{'s', 'r', 'c', '='}, {'h', 'r', 'e', 'f', '='}}
type absURLMatcher struct {
prefix int
match []byte
replacement []byte
}
func (a absURLMatcher) isSourceType() bool {
return a.prefix == matchPrefixSrc
}
func checkCandidate(l *contentlexer) {
isSource := l.prefixLookup.first == 's'
func checkCandidateSrc(l *contentlexer) {
for _, m := range l.matchers {
if !m.isSourceType() {
continue
}
l.replaceSimple(m)
}
}
if isSource && !m.isSourceType() || !isSource && m.isSourceType() {
func checkCandidateHref(l *contentlexer) {
for _, m := range l.matchers {
if m.isSourceType() {
continue
}
l.replaceSimple(m)
}
}
func checkCandidateSrcset(l *contentlexer) {
	// special case, not frequent (I think)
for _, m := range l.matchers {
if m.isSourceType() {
continue
}
if bytes.HasPrefix(l.content[l.pos:], m.match) {
// check for schemaless URLs
posAfter := l.pos + len(m.match)
if posAfter >= len(l.content) {
return
}
r, _ := utf8.DecodeRune(l.content[posAfter:])
if r == '/' {
// schemaless: skip
return
}
if l.pos > l.start {
l.emit()
}
l.pos += len(m.match)
l.w.Write(m.replacement)
l.start = l.pos
return
if !bytes.HasPrefix(l.content[l.pos:], m.match) {
continue
}
// check for schemaless URLs
posAfter := l.pos + len(m.match)
if posAfter >= len(l.content) {
return
}
r, _ := utf8.DecodeRune(l.content[posAfter:])
if r == '/' {
// schemaless: skip
continue
}
posLastQuote := bytes.Index(l.content[l.pos+1:], m.quote)
// safe guard
if posLastQuote < 0 || posLastQuote > 2000 {
return
}
if l.pos > l.start {
l.emit()
}
section := l.content[l.pos+len(m.quote) : l.pos+posLastQuote+1]
fields := bytes.Fields(section)
l.w.Write([]byte(m.quote))
for i, f := range fields {
if f[0] == '/' {
l.w.Write(m.replacementURL)
l.w.Write(f[1:])
} else {
l.w.Write(f)
}
if i < len(fields)-1 {
l.w.Write([]byte(" "))
}
}
l.w.Write(m.quote)
l.pos += len(section) + (len(m.quote) * 2)
l.start = l.pos
}
}
func (l *contentlexer) replaceSimple(m absURLMatcher) {
if !bytes.HasPrefix(l.content[l.pos:], m.match) {
return
}
// check for schemaless URLs
posAfter := l.pos + len(m.match)
if posAfter >= len(l.content) {
return
}
r, _ := utf8.DecodeRune(l.content[posAfter:])
if r == '/' {
// schemaless: skip
return
}
if l.pos > l.start {
l.emit()
}
l.pos += len(m.match)
l.w.Write(m.quote)
l.w.Write(m.replacementURL)
l.start = l.pos
}
func (l *contentlexer) replace() {
contentLength := len(l.content)
var r rune
@ -152,7 +240,7 @@ func (l *contentlexer) replace() {
break
}
var width int = 1
var width = 1
r = rune(l.content[l.pos])
if r >= utf8.RuneSelf {
r, width = utf8.DecodeRune(l.content[l.pos:])
@ -160,14 +248,24 @@ func (l *contentlexer) replace() {
l.width = width
l.pos += l.width
if r == ' ' {
l.prefixLookup.ms = matchStateWhitespace
} else if l.prefixLookup.ms != matchStateNone {
l.ms = matchStateWhitespace
} else if l.ms != matchStateNone {
l.match(r)
if l.prefixLookup.ms == matchStateFull {
checkCandidate(l)
if l.ms == matchStateFull {
var p *prefix
for i, m := range l.matches {
if m {
p = prefixes[i]
}
l.matches[i] = false
}
if p == nil {
panic("illegal state: curr is nil when state is full")
}
l.ms = matchStateNone
p.f(l)
}
}
}
// Done!
@ -177,15 +275,12 @@ func (l *contentlexer) replace() {
}
func doReplace(ct contentTransformer, matchers []absURLMatcher) {
lexer := &contentlexer{
content: ct.Content(),
w: ct,
prefixLookup: &prefixes{pr: mainPrefixRunes},
matchers: matchers}
content: ct.Content(),
w: ct,
matchers: matchers}
lexer.replace()
}
type absURLReplacer struct {
@ -195,7 +290,7 @@ type absURLReplacer struct {
func newAbsURLReplacer(baseURL string) *absURLReplacer {
u, _ := url.Parse(baseURL)
base := strings.TrimRight(u.String(), "/")
base := []byte(strings.TrimRight(u.String(), "/") + "/")
// HTML
dqHTMLMatch := []byte("\"/")
@ -205,23 +300,23 @@ func newAbsURLReplacer(baseURL string) *absURLReplacer {
dqXMLMatch := []byte("&#34;/")
sqXMLMatch := []byte("&#39;/")
dqHTML := []byte("\"" + base + "/")
sqHTML := []byte("'" + base + "/")
dqHTML := []byte("\"")
sqHTML := []byte("'")
dqXML := []byte("&#34;" + base + "/")
sqXML := []byte("&#39;" + base + "/")
dqXML := []byte("&#34;")
sqXML := []byte("&#39;")
return &absURLReplacer{
htmlMatchers: []absURLMatcher{
{matchPrefixSrc, dqHTMLMatch, dqHTML},
{matchPrefixSrc, sqHTMLMatch, sqHTML},
{matchPrefixHref, dqHTMLMatch, dqHTML},
{matchPrefixHref, sqHTMLMatch, sqHTML}},
{matchPrefixSrc, dqHTMLMatch, dqHTML, base},
{matchPrefixSrc, sqHTMLMatch, sqHTML, base},
{matchPrefixHref, dqHTMLMatch, dqHTML, base},
{matchPrefixHref, sqHTMLMatch, sqHTML, base}},
xmlMatchers: []absURLMatcher{
{matchPrefixSrc, dqXMLMatch, dqXML},
{matchPrefixSrc, sqXMLMatch, sqXML},
{matchPrefixHref, dqXMLMatch, dqXML},
{matchPrefixHref, sqXMLMatch, sqXML},
{matchPrefixSrc, dqXMLMatch, dqXML, base},
{matchPrefixSrc, sqXMLMatch, sqXML, base},
{matchPrefixHref, dqXMLMatch, dqXML, base},
{matchPrefixHref, sqXMLMatch, sqXML, base},
}}
}

View file

@ -25,8 +25,42 @@ const REPLACE_2 = "ᚠᛇᚻ ᛒᛦᚦ ᚠᚱᚩᚠᚢᚱ\nᚠᚱᚪ
// Issue: 816, schemaless links combined with others
const REPLACE_SCHEMALESS_HTML = `Pre. src='//schemaless' src='/normal' <a href="//schemaless">Schemaless</a>. <a href="/normal">normal</a>. Post.`
const REPLACE_SCHEMALESS_HTML_CORRECT = `Pre. src='//schemaless' src='http://base/normal' <a href="//schemaless">Schemaless</a>. <a href="http://base/normal">normal</a>. Post.`
const REPLACE_SCHEMALESS_XML = `Pre. src=&#34;//schemaless&#34; src=&#34;/normal&#34; <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;/normal&#39;>normal</a>. Post.`
const REPLACE_SCHEMALESS_XML_CORRECT = `Pre. src=&#34;//schemaless&#34; src=&#34;http://base/normal&#34; <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;http://base/normal&#39;>normal</a>. Post.`
const REPLACE_SCHEMALESS_XML = `Pre. src=&#39;//schemaless&#39; src=&#39;/normal&#39; <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;/normal&#39;>normal</a>. Post.`
const REPLACE_SCHEMALESS_XML_CORRECT = `Pre. src=&#39;//schemaless&#39; src=&#39;http://base/normal&#39; <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;http://base/normal&#39;>normal</a>. Post.`
// srcset=
const SRCSET_BASIC = `Pre. <img srcset="/img/small.jpg 200w /img/big.jpg 700w" alt="text" src="/img/foo.jpg">`
const SRCSET_BASIC_CORRECT = `Pre. <img srcset="http://base/img/small.jpg 200w http://base/img/big.jpg 700w" alt="text" src="http://base/img/foo.jpg">`
const SRCSET_SINGLE_QUOTE = `Pre. <img srcset='/img/small.jpg 200w /img/big.jpg 700w' alt="text" src="/img/foo.jpg"> POST.`
const SRCSET_SINGLE_QUOTE_CORRECT = `Pre. <img srcset='http://base/img/small.jpg 200w http://base/img/big.jpg 700w' alt="text" src="http://base/img/foo.jpg"> POST.`
const SRCSET_XML_BASIC = `Pre. <img srcset=&#34;/img/small.jpg 200w /img/big.jpg 700w&#34; alt=&#34;text&#34; src=&#34;/img/foo.jpg&#34;>`
const SRCSET_XML_BASIC_CORRECT = `Pre. <img srcset=&#34;http://base/img/small.jpg 200w http://base/img/big.jpg 700w&#34; alt=&#34;text&#34; src=&#34;http://base/img/foo.jpg&#34;>`
const SRCSET_XML_SINGLE_QUOTE = `Pre. <img srcset=&#34;/img/small.jpg 200w /img/big.jpg 700w&#34; alt=&#34;text&#34; src=&#34;/img/foo.jpg&#34;>`
const SRCSET_XML_SINGLE_QUOTE_CORRECT = `Pre. <img srcset=&#34;http://base/img/small.jpg 200w http://base/img/big.jpg 700w&#34; alt=&#34;text&#34; src=&#34;http://base/img/foo.jpg&#34;>`
const SRCSET_VARIATIONS = `Pre.
Missing start quote: <img srcset=/img/small.jpg 200w /img/big.jpg 700w" alt="text"> src='/img/foo.jpg'> FOO.
<img srcset='/img.jpg'>
schemaless: <img srcset='//img.jpg' src='//basic.jpg'>
schemaless2: <img srcset="//img.jpg" src="//basic.jpg2> POST
`
const SRCSET_VARIATIONS_CORRECT = `Pre.
Missing start quote: <img srcset=/img/small.jpg 200w /img/big.jpg 700w" alt="text"> src='http://base/img/foo.jpg'> FOO.
<img srcset='http://base/img.jpg'>
schemaless: <img srcset='//img.jpg' src='//basic.jpg'>
schemaless2: <img srcset="//img.jpg" src="//basic.jpg2> POST
`
const SRCSET_XML_VARIATIONS = `Pre.
Missing start quote: &lt;img srcset=/img/small.jpg 200w /img/big.jpg 700w&quot; alt=&quot;text&quot;&gt; src=&#39;/img/foo.jpg&#39;&gt; FOO.
&lt;img srcset=&#39;/img.jpg&#39;&gt;
schemaless: &lt;img srcset=&#39;//img.jpg&#39; src=&#39;//basic.jpg&#39;&gt;
schemaless2: &lt;img srcset=&quot;//img.jpg&quot; src=&quot;//basic.jpg2&gt; POST
`
const SRCSET_XML_VARIATIONS_CORRECT = `Pre.
Missing start quote: &lt;img srcset=/img/small.jpg 200w /img/big.jpg 700w&quot; alt=&quot;text&quot;&gt; src=&#39;http://base/img/foo.jpg&#39;&gt; FOO.
&lt;img srcset=&#39;http://base/img.jpg&#39;&gt;
schemaless: &lt;img srcset=&#39;//img.jpg&#39; src=&#39;//basic.jpg&#39;&gt;
schemaless2: &lt;img srcset=&quot;//img.jpg&quot; src=&quot;//basic.jpg2&gt; POST
`
var abs_url_bench_tests = []test{
{H5_JS_CONTENT_DOUBLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_DQ},
@ -46,6 +80,12 @@ var abs_url_tests = append(abs_url_bench_tests, append(sanity_tests, extra_tests
var extra_tests_xml = []test{{REPLACE_SCHEMALESS_XML, REPLACE_SCHEMALESS_XML_CORRECT}}
var xml_abs_url_tests = append(xml_abs_url_bench_tests, append(sanity_tests, extra_tests_xml...)...)
var srcset_tests = []test{{SRCSET_BASIC, SRCSET_BASIC_CORRECT}, {SRCSET_SINGLE_QUOTE, SRCSET_SINGLE_QUOTE_CORRECT}, {SRCSET_VARIATIONS, SRCSET_VARIATIONS_CORRECT}}
var srcset_xml_tests = []test{
{SRCSET_XML_BASIC, SRCSET_XML_BASIC_CORRECT},
{SRCSET_XML_SINGLE_QUOTE, SRCSET_XML_SINGLE_QUOTE_CORRECT},
{SRCSET_XML_VARIATIONS, SRCSET_XML_VARIATIONS_CORRECT}}
func TestChainZeroTransformers(t *testing.T) {
tr := NewChain()
in := new(bytes.Buffer)
@ -102,6 +142,21 @@ func TestAbsURL(t *testing.T) {
}
func TestAbsURLSrcSet(t *testing.T) {
absURL, _ := absURLFromURL("http://base")
tr := NewChain(absURL...)
apply(t.Errorf, tr, srcset_tests)
}
func TestAbsXMLURLSrcSet(t *testing.T) {
absURLInXML, _ := absURLInXMLFromURL("http://base")
tr := NewChain(absURLInXML...)
apply(t.Errorf, tr, srcset_xml_tests)
}
func BenchmarkXMLAbsURL(b *testing.B) {
absURLInXML, _ := absURLInXMLFromURL("http://base")
tr := NewChain(absURLInXML...)