Improve abs url replacement speed

This commit replaces the multiple `bytes.Contains` and `bytes.Replace` calls with a custom replacer that does one pass through the document and exploits the fact that there are two common prefixes we search for, `src=` and `href=`.

This is both faster and consumes less memory. There may be even better algos to use here, but we must leave some room for improvements for future versions.

This should also make it possible to solve #816.

```
benchmark              old ns/op     new ns/op     delta
BenchmarkAbsUrl        25795         22597         -12.40%
BenchmarkXmlAbsUrl     17187         11166         -35.03%

benchmark              old allocs     new allocs     delta
BenchmarkAbsUrl        60             33             -45.00%
BenchmarkXmlAbsUrl     30             16             -46.67%

benchmark              old bytes     new bytes     delta
BenchmarkAbsUrl        5844          4167          -28.70%
BenchmarkXmlAbsUrl     3754          2069          -44.89%
```

Fixes #894
This commit is contained in:
bep 2015-02-16 10:48:15 +01:00 committed by spf13
parent 27c03a6dd0
commit f1fec88c30
4 changed files with 357 additions and 53 deletions

2
.gitignore vendored
View file

@ -2,6 +2,8 @@ hugo
docs/public*
hugo.exe
*.test
*.prof
nohup.out
cover.out
*.swp
*.swo

View file

@ -1,64 +1,33 @@
package transform
import (
"bytes"
"net/url"
"strings"
"sync"
)
func AbsURL(absURL string) (trs []link, err error) {
var baseURL *url.URL
var absUrlInit sync.Once
var ar *absurlReplacer
if baseURL, err = url.Parse(absURL); err != nil {
return
// for performance reasons, we reuse the first baseUrl given
func initAbsurlReplacer(baseURL string) {
absUrlInit.Do(func() {
ar = newAbsurlReplacer(baseURL)
})
}
base := strings.TrimRight(baseURL.String(), "/")
func AbsURL(absURL string) (trs []link, err error) {
initAbsurlReplacer(absURL)
var (
srcdq = []byte(" src=\"" + base + "/")
hrefdq = []byte(" href=\"" + base + "/")
srcsq = []byte(" src='" + base + "/")
hrefsq = []byte(" href='" + base + "/")
)
trs = append(trs, func(content []byte) []byte {
content = guardReplace(content, []byte(" src=\"//"), []byte(" src=\"/"), srcdq)
content = guardReplace(content, []byte(" src='//"), []byte(" src='/"), srcsq)
content = guardReplace(content, []byte(" href=\"//"), []byte(" href=\"/"), hrefdq)
content = guardReplace(content, []byte(" href='//"), []byte(" href='/"), hrefsq)
return content
return ar.replaceInHtml(content)
})
return
}
func AbsURLInXML(absURL string) (trs []link, err error) {
var baseURL *url.URL
initAbsurlReplacer(absURL)
if baseURL, err = url.Parse(absURL); err != nil {
return
}
base := strings.TrimRight(baseURL.String(), "/")
var (
srcedq = []byte(" src="" + base + "/")
hrefedq = []byte(" href="" + base + "/")
srcesq = []byte(" src='" + base + "/")
hrefesq = []byte(" href='" + base + "/")
)
trs = append(trs, func(content []byte) []byte {
content = guardReplace(content, []byte(" src="//"), []byte(" src="/"), srcedq)
content = guardReplace(content, []byte(" src='//"), []byte(" src='/"), srcesq)
content = guardReplace(content, []byte(" href="//"), []byte(" href="/"), hrefedq)
content = guardReplace(content, []byte(" href='//"), []byte(" href='/"), hrefesq)
return content
return ar.replaceInXml(content)
})
return
}
func guardReplace(content, guard, match, replace []byte) []byte {
if !bytes.Contains(content, guard) {
content = bytes.Replace(content, match, replace, -1)
}
return content
}

325
transform/absurlreplacer.go Normal file
View file

@ -0,0 +1,325 @@
package transform
import (
"bytes"
bp "github.com/spf13/hugo/bufferpool"
"net/url"
"strings"
"sync"
"unicode/utf8"
)
// pos is a position (in bytes) within the content being lexed.
type pos int
// matchState records how far the lexer has come towards recognising one of
// the attribute prefixes.
type matchState int
const (
// matchStateNone is the zero value; lexReplacements does not attempt a
// prefix match while in this state.
matchStateNone matchState = iota
// matchStateWhitespace is set by lexReplacements when a space is read.
matchStateWhitespace
// matchStatePartial is set by match while a prefix has matched so far but
// is not yet complete.
matchStatePartial
// matchStateFull is set by match once every rune of a prefix has matched.
matchStateFull
)
// item is a single token produced by the contentlexer (see emit).
type item struct {
// typ tells whether this is plain text, a match or a guard.
typ itemType
// pos is the byte offset in the content where the token starts.
pos pos
// val is the token's bytes; emit sets it to a sub-slice of the lexer content.
val []byte
}
// itemType identifies the kind of token held by an item.
type itemType int
const (
// tText is any content that is neither a match nor a guard; doReplace
// writes it through unchanged.
tText itemType = iota
// matches
// Emitted by checkCandidate when a matcher's match bytes follow the prefix.
// The names encode src/href and double-quote (dq) / single-quote (sq).
tSrcdq
tHrefdq
tSrcsq
tHrefsq
// guards
// Emitted by checkCandidate when a matcher's guard bytes follow the prefix.
// NOTE(review): tGrcdq is presumably meant to read tGsrcdq — confirm before
// renaming, it is referenced by the matcher tables.
tGrcdq
tGhrefdq
tGsrcsq
tGhrefsq
)
// contentlexer walks content rune by rune and splits it into items: plain
// text plus the match and guard tokens described by matchers.
type contentlexer struct {
// content is the input being scanned.
content []byte
pos pos // input position
start pos // item start position
width pos // width of last element
// matchers are the candidates tried by checkCandidate after a full prefix match.
matchers []absurlMatcher
// state is the state function driven by runReplacer.
state stateFunc
// prefixLookup holds the incremental prefix-matching state used by match.
prefixLookup *prefixes
// items delivered to client
items []item
}
// stateFunc is a lexer state; it returns the next state to run, or nil to
// stop (see runReplacer).
type stateFunc func(*contentlexer) stateFunc
// prefixRunes is one prefix to look for, spelled out rune by rune.
type prefixRunes []rune
// prefixes is the incremental matching state shared between lexReplacements
// and match.
type prefixes struct {
// pr is the set of prefixes that can be matched.
pr []prefixRunes
curr prefixRunes // current prefix lookup table
i int // current index
// first rune in potential match
first rune
// match-state:
// none, whitespace, partial, full
ms matchState
}
// match feeds the rune r into the prefix matcher and records the outcome in
// l.prefixLookup.ms:
//   - matchStateFull when every rune of a prefix has checked out in a row,
//   - matchStatePartial when r keeps a prefix on its way towards a full match,
//   - matchStateNone otherwise.
//
// When no prefix is in progress, r is compared with the first rune of each
// known prefix. This assumes that all prefixes start with a different rune,
// which holds for the two in use here: "src=" and "href=".
func (l *contentlexer) match(r rune) {
	p := l.prefixLookup

	if p.curr != nil {
		// A prefix is in progress: r must be its next rune.
		p.i++
		if p.curr[p.i] != r {
			p.curr = nil
			p.ms = matchStateNone
			return
		}
		if p.i+1 == len(p.curr) {
			// Last rune of the prefix; clear curr so the next call starts over.
			p.curr = nil
			p.ms = matchStateFull
		} else {
			p.ms = matchStatePartial
		}
		return
	}

	// No prefix in progress: look for one starting with r.
	p.i = 0
	for _, pr := range p.pr {
		if pr[0] != r {
			continue
		}
		p.first = r
		// A prefix is complete after its first rune only if that prefix is a
		// single rune long. This must test the length of the prefix itself,
		// not the number of prefixes in the lookup table.
		if len(pr) == 1 {
			p.ms = matchStateFull
		} else {
			p.curr = pr
			p.ms = matchStatePartial
		}
		return
	}
	p.ms = matchStateNone
}
// emit appends the content between l.start and l.pos to l.items as a token
// of type t, then moves l.start up to l.pos so the next token begins there.
func (l *contentlexer) emit(t itemType) {
	token := item{
		typ: t,
		pos: l.start,
		val: l.content[l.start:l.pos],
	}
	l.items = append(l.items, token)
	l.start = l.pos
}
// mainPrefixRunes are the attribute prefixes the lexer searches for: "src="
// and "href=". match relies on each starting with a different rune.
var mainPrefixRunes = []prefixRunes{{'s', 'r', 'c', '='}, {'h', 'r', 'e', 'f', '='}}
// itemSlicePool hands out empty item slices with room for 8 items.
var itemSlicePool = &sync.Pool{
New: func() interface{} {
return make([]item, 0, 8)
},
}
// replace lexes content using the given matchers and returns the finished
// lexer; the resulting tokens are available in its items field.
//
// The item slice is allocated fresh on every call. It must not be released to
// itemSlicePool from here: the returned lexer still references the slice, so
// putting it back on return would let a concurrent caller pick up the same
// backing array and overwrite tokens the first caller is still reading.
func replace(content []byte, matchers []absurlMatcher) *contentlexer {
	lexer := &contentlexer{
		content:      content,
		items:        make([]item, 0, 8),
		prefixLookup: &prefixes{pr: mainPrefixRunes},
		matchers:     matchers,
	}
	lexer.runReplacer()
	return lexer
}
// runReplacer drives the lexer: starting from lexReplacements, it keeps
// invoking the current state function until one of them returns nil.
func (l *contentlexer) runReplacer() {
	l.state = lexReplacements
	for l.state != nil {
		l.state = l.state(l)
	}
}
// absurlMatcher describes one replacement rule that is tried right after a
// prefix (src= or href=) has been fully matched.
type absurlMatcher struct {
// replaceType is the token type emitted when match is found.
replaceType itemType
// guardType is the token type emitted when guard is found.
guardType itemType
// match is the byte sequence after the prefix that marks a replaceable URL.
match []byte
// guard is the byte sequence after the prefix that, once seen, makes
// doReplace leave every token of this rule untouched.
guard []byte
// replacement is written by doReplace in place of match.
replacement []byte
// guarded makes checkCandidate skip this rule when true.
guarded bool
}
// isSourceType reports whether the matcher applies to src attributes, i.e.
// its replace type is one of the two src token types.
func (a absurlMatcher) isSourceType() bool {
	switch a.replaceType {
	case tSrcdq, tSrcsq:
		return true
	default:
		return false
	}
}
// lexReplacements is the lexer's single state function. It reads the content
// one rune at a time; a space arms the prefix matcher, and every following
// rune is fed to match until the prefix either completes (in which case
// checkCandidate looks for a match or guard right after it) or fails.
// Whatever remains unemitted at the end is emitted as text.
//
// It always returns nil, which stops runReplacer.
func lexReplacements(l *contentlexer) stateFunc {
	contentLength := len(l.content)

	for int(l.pos) < contentLength {
		// Fast path for single-byte runes; decode only when needed.
		width := 1
		r := rune(l.content[l.pos])
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRune(l.content[l.pos:])
		}
		l.width = pos(width)
		l.pos += l.width

		if r == ' ' {
			l.prefixLookup.ms = matchStateWhitespace
			// A space ends any half-matched prefix. Without this reset the
			// runes after the space would be compared against the remainder
			// of the old prefix, so " s rc=" would be taken for "src=" and
			// the "src=" in " h src=" would be missed.
			l.prefixLookup.curr = nil
			continue
		}

		if l.prefixLookup.ms != matchStateNone {
			l.match(r)
			if l.prefixLookup.ms == matchStateFull {
				checkCandidate(l)
			}
		}
	}
	l.width = 0

	// Done!
	if l.pos > l.start {
		l.emit(tText)
	}
	return nil
}
// checkCandidate is called right after a prefix (src= or href=) has been
// fully matched. It looks at the content following the prefix and, for the
// first applicable matcher whose guard or match bytes are found there,
// emits any pending text followed by a guard or match token.
//
// Matchers are only read here, never modified: the matcher slice is shared
// between calls, so recording a guard hit on a matcher would leak state from
// one document into the next. Guard bookkeeping is done by doReplace from
// the emitted tokens instead.
func checkCandidate(l *contentlexer) {
	isSource := l.prefixLookup.first == 's'
	rest := l.content[l.pos:]

	for i := range l.matchers {
		m := &l.matchers[i]

		// Skip disabled matchers and those for the other attribute.
		if m.guarded || m.isSourceType() != isSource {
			continue
		}

		switch {
		case bytes.HasPrefix(rest, m.guard):
			if l.pos > l.start {
				l.emit(tText)
			}
			l.pos += pos(len(m.guard))
			l.emit(m.guardType)
			return
		case bytes.HasPrefix(rest, m.match):
			if l.pos > l.start {
				l.emit(tText)
			}
			l.pos += pos(len(m.match))
			l.emit(m.replaceType)
			return
		}
	}
}
// doReplace lexes content with the given matchers and returns a new byte
// slice in which every match token is replaced by its matcher's replacement,
// unless a guard token of the same matcher occurs anywhere in the content;
// in that case the matcher's tokens are written through unchanged.
//
// The result is a copy that the caller owns. The work buffer comes from the
// buffer pool and is returned to it when this function exits, so its bytes
// must not be handed out directly.
func doReplace(content []byte, matchers []absurlMatcher) []byte {
	b := bp.GetBuffer()
	defer bp.PutBuffer(b)

	guards := make([]bool, len(matchers))
	replaced := replace(content, matchers)

	// first pass: check guards
	for _, token := range replaced.items {
		if token.typ == tText {
			continue
		}
		for i := range matchers {
			if token.typ == matchers[i].guardType {
				guards[i] = true
				break
			}
		}
	}

	// second pass: do replacements for non-guarded tokens
	for _, token := range replaced.items {
		if token.typ == tText {
			b.Write(token.val)
			continue
		}
		for i := range matchers {
			m := &matchers[i]
			switch {
			case token.typ == m.replaceType && !guards[i]:
				b.Write(m.replacement)
			case token.typ == m.replaceType || token.typ == m.guardType:
				b.Write(token.val)
			}
		}
	}

	// Detach the result from the pooled buffer before it is released.
	return append([]byte(nil), b.Bytes()...)
}
// absurlReplacer holds the matcher tables used to rewrite URLs in HTML and
// in XML content (see replaceInHtml and replaceInXml).
type absurlReplacer struct {
// htmlMatchers is passed to doReplace by replaceInHtml.
htmlMatchers []absurlMatcher
// xmlMatchers is passed to doReplace by replaceInXml.
xmlMatchers []absurlMatcher
}
func newAbsurlReplacer(baseUrl string) *absurlReplacer {
u, _ := url.Parse(baseUrl)
base := strings.TrimRight(u.String(), "/")
// HTML
dqHtmlMatch := []byte("\"/")
sqHtmlMatch := []byte("'/")
dqGuard := []byte("\"//")
sqGuard := []byte("'//")
// XML
dqXmlMatch := []byte(""/")
sqXmlMatch := []byte("'/")
dqXmlGuard := []byte(""//")
sqXmlGuard := []byte("'//")
dqHtml := []byte("\"" + base + "/")
sqHtml := []byte("'" + base + "/")
dqXml := []byte(""" + base + "/")
sqXml := []byte("'" + base + "/")
return &absurlReplacer{htmlMatchers: []absurlMatcher{
{tSrcdq, tGrcdq, dqHtmlMatch, dqGuard, dqHtml, false},
{tSrcsq, tGsrcsq, sqHtmlMatch, sqGuard, sqHtml, false},
{tHrefdq, tGhrefdq, dqHtmlMatch, dqGuard, dqHtml, false},
{tHrefsq, tGhrefsq, sqHtmlMatch, sqGuard, sqHtml, false}},
xmlMatchers: []absurlMatcher{
{tSrcdq, tGrcdq, dqXmlMatch, dqXmlGuard, dqXml, false},
{tSrcsq, tGsrcsq, sqXmlMatch, sqXmlGuard, sqXml, false},
{tHrefdq, tGhrefdq, dqXmlMatch, dqXmlGuard, dqXml, false},
{tHrefsq, tGhrefsq, sqXmlMatch, sqXmlGuard, sqXml, false},
}}
}
// replaceInHtml runs doReplace over content with the HTML matcher table and
// returns the result.
func (au *absurlReplacer) replaceInHtml(content []byte) []byte {
	matchers := au.htmlMatchers
	return doReplace(content, matchers)
}
// replaceInXml runs doReplace over content with the XML matcher table and
// returns the result.
func (au *absurlReplacer) replaceInXml(content []byte) []byte {
	matchers := au.xmlMatchers
	return doReplace(content, matchers)
}

View file

@ -14,21 +14,29 @@ const CORRECT_OUTPUT_SRC_HREF_DQ = "<!DOCTYPE html><html><head><script src=\"foo
const CORRECT_OUTPUT_SRC_HREF_SQ = "<!DOCTYPE html><html><head><script src='foobar.js'></script><script src='http://base/barfoo.js'></script></head><body><nav><h1>title</h1></nav><article>content <a href='foobar'>foobar</a>. <a href='http://base/foobar'>Follow up</a></article></body></html>"
const H5_XML_CONTENT_ABS_URL = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\" ?><feed xmlns=\"http://www.w3.org/2005/Atom\"><entry><content type=\"html\">&lt;p&gt;&lt;a href=&#34;/foobar&#34;&gt;foobar&lt;/a&gt;&lt;/p&gt; &lt;p&gt;A video: &lt;iframe src=&#39;/foo&#39;&gt;&lt;/iframe&gt;&lt;/p&gt;</content></entry></feed>"
const CORRECT_OUTPUT_SRC_HREF_IN_XML = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\" ?><feed xmlns=\"http://www.w3.org/2005/Atom\"><entry><content type=\"html\">&lt;p&gt;&lt;a href=&#34;http://xml/foobar&#34;&gt;foobar&lt;/a&gt;&lt;/p&gt; &lt;p&gt;A video: &lt;iframe src=&#39;http://xml/foo&#39;&gt;&lt;/iframe&gt;&lt;/p&gt;</content></entry></feed>"
const CORRECT_OUTPUT_SRC_HREF_IN_XML = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\" ?><feed xmlns=\"http://www.w3.org/2005/Atom\"><entry><content type=\"html\">&lt;p&gt;&lt;a href=&#34;http://base/foobar&#34;&gt;foobar&lt;/a&gt;&lt;/p&gt; &lt;p&gt;A video: &lt;iframe src=&#39;http://base/foo&#39;&gt;&lt;/iframe&gt;&lt;/p&gt;</content></entry></feed>"
const H5_XML_CONTENT_GUARDED = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\" ?><feed xmlns=\"http://www.w3.org/2005/Atom\"><entry><content type=\"html\">&lt;p&gt;&lt;a href=&#34;//foobar&#34;&gt;foobar&lt;/a&gt;&lt;/p&gt; &lt;p&gt;A video: &lt;iframe src=&#39;//foo&#39;&gt;&lt;/iframe&gt;&lt;/p&gt;</content></entry></feed>"
var abs_url_tests = []test{
// additional sanity tests for replacements testing
const REPLACE_1 = "No replacements."
const REPLACE_2 = "ᚠᛇᚻ ᛒᛦᚦ ᚠᚱᚩᚠᚢᚱ\nᚠᚱᚪ ᚷᛖᚻᚹᛦᛚᚳᚢᛗ"
var abs_url_bench_tests = []test{
{H5_JS_CONTENT_DOUBLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_DQ},
{H5_JS_CONTENT_SINGLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_SQ},
{H5_JS_CONTENT_ABS_URL, H5_JS_CONTENT_ABS_URL},
{H5_JS_CONTENT_ABS_URL_SCHEMALESS, H5_JS_CONTENT_ABS_URL_SCHEMALESS},
}
var xml_abs_url_tests = []test{
var xml_abs_url_bench_tests = []test{
{H5_XML_CONTENT_ABS_URL, CORRECT_OUTPUT_SRC_HREF_IN_XML},
{H5_XML_CONTENT_GUARDED, H5_XML_CONTENT_GUARDED},
}
var sanity_tests = []test{{REPLACE_1, REPLACE_1}, {REPLACE_2, REPLACE_2}}
var abs_url_tests = append(abs_url_bench_tests, sanity_tests...)
var xml_abs_url_tests = append(xml_abs_url_bench_tests, sanity_tests...)
func TestChainZeroTransformers(t *testing.T) {
tr := NewChain()
in := new(bytes.Buffer)
@ -44,7 +52,7 @@ func BenchmarkAbsUrl(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
apply(b.Errorf, tr, abs_url_tests)
apply(b.Errorf, tr, abs_url_bench_tests)
}
}
@ -57,17 +65,17 @@ func TestAbsUrl(t *testing.T) {
}
func BenchmarkXmlAbsUrl(b *testing.B) {
absURLInXML, _ := AbsURLInXML("http://xml")
absURLInXML, _ := AbsURLInXML("http://base")
tr := NewChain(absURLInXML...)
b.ResetTimer()
for i := 0; i < b.N; i++ {
apply(b.Errorf, tr, xml_abs_url_tests)
apply(b.Errorf, tr, xml_abs_url_bench_tests)
}
}
func TestXMLAbsUrl(t *testing.T) {
absURLInXML, _ := AbsURLInXML("http://xml")
absURLInXML, _ := AbsURLInXML("http://base")
tr := NewChain(absURLInXML...)
apply(t.Errorf, tr, xml_abs_url_tests)
}