Replace the MD5 hashing of images with xxHash

Note that we only use this for change detection.

The previous implementation invoked `MD5FromReaderFast`, which created an MD5 hash from eight 64-byte chunks of the file, which is obviously very fast. The new implementation creates the hash from the entire file and ... seems to be even more effective:

```
name          old time/op    new time/op    delta
HashImage-10    9.45µs ±21%   10.89µs ± 1%     ~     (p=0.343 n=4+4)

name          old alloc/op   new alloc/op   delta
HashImage-10      144B ± 0%        8B ± 0%  -94.44%  (p=0.029 n=4+4)

name          old allocs/op  new allocs/op  delta
HashImage-10      4.00 ± 0%      1.00 ± 0%  -75.00%  (p=0.029 n=4+4)
```
This commit is contained in:
Bjørn Erik Pedersen 2024-07-30 12:52:54 +02:00
parent 8b5d796989
commit d5eda13cb2
7 changed files with 180 additions and 29 deletions

86
common/hashing/hashing.go Normal file
View file

@ -0,0 +1,86 @@
// Copyright 2024 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package hashing provides common hashing utilities.
package hashing
import (
"encoding/hex"
"io"
"sync"
"github.com/cespare/xxhash/v2"
)
// XXHashFromReader streams r into a pooled xxHash digest.
//
// It returns the 64-bit hash of the data, the byte count reported by io.Copy,
// and any error from copying. On error both numeric results are zero.
func XXHashFromReader(r io.ReadSeeker) (uint64, int64, error) {
	hasher := getXxHashReadFrom()
	// Hand the hasher back to the pool once the sum has been taken.
	defer putXxHashReadFrom(hasher)

	written, copyErr := io.Copy(hasher, r)
	if copyErr != nil {
		return 0, 0, copyErr
	}

	sum := hasher.Sum64()
	return sum, written, nil
}
// XXHashFromString calculates the xxHash for the given string.
//
// The returned error is always nil; it is kept in the signature so that
// existing callers continue to compile.
func XXHashFromString(s string) (uint64, error) {
	// Sum64String hashes the string in one shot, without constructing an
	// intermediate Digest.
	return xxhash.Sum64String(s), nil
}
// XxHashFromStringHexEncoded hashes f with xxHash and returns the digest
// bytes encoded as a hexadecimal string.
func XxHashFromStringHexEncoded(f string) string {
	digest := xxhash.New()
	digest.WriteString(f)
	return hex.EncodeToString(digest.Sum(nil))
}
// xxhashReadFrom pairs an xxHash digest with a reusable read buffer.
// Its ReadFrom method fills buff from a reader and writes each chunk
// into the embedded digest.
type xxhashReadFrom struct {
	// buff is the scratch buffer that ReadFrom reads into.
	buff []byte
	// Digest is embedded, so its methods (Write, Sum64, Reset, ...) are
	// promoted to xxhashReadFrom.
	*xxhash.Digest
}
// ReadFrom implements io.ReaderFrom. It reads from r into the reusable
// buffer until r reports an error, writing every chunk into the digest.
//
// It returns the total number of bytes read from r. io.EOF marks normal
// termination and is not reported as an error; any other read error is
// returned together with the number of bytes consumed so far.
func (x *xxhashReadFrom) ReadFrom(r io.Reader) (int64, error) {
	var total int64
	for {
		n, err := r.Read(x.buff)
		if n > 0 {
			// Accumulate across iterations: returning only the last n
			// would report 0 for a reader that ends with (0, io.EOF).
			total += int64(n)
			x.Digest.Write(x.buff[:n])
		}
		if err != nil {
			if err == io.EOF {
				err = nil
			}
			return total, err
		}
	}
}
// xXhashReadFromPool holds reusable xxhashReadFrom values. A new value
// comes with a fresh digest and a 48 KiB read buffer.
var xXhashReadFromPool = sync.Pool{
	New: func() any {
		const bufSize = 48 * 1024
		return &xxhashReadFrom{
			buff:   make([]byte, bufSize),
			Digest: xxhash.New(),
		}
	},
}
// getXxHashReadFrom takes a hasher from xXhashReadFromPool.
func getXxHashReadFrom() *xxhashReadFrom {
	pooled := xXhashReadFromPool.Get()
	return pooled.(*xxhashReadFrom)
}
// putXxHashReadFrom resets the digest state of h and returns it to
// xXhashReadFromPool for reuse.
func putXxHashReadFrom(h *xxhashReadFrom) {
	// Reset is promoted from the embedded digest; call it explicitly here.
	h.Digest.Reset()
	xXhashReadFromPool.Put(h)
}

View file

@ -0,0 +1,79 @@
// Copyright 2024 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package hashing
import (
"strings"
"testing"
"github.com/cespare/xxhash/v2"
qt "github.com/frankban/quicktest"
)
// TestXxHashFromReader checks that XXHashFromReader returns the expected
// hash and byte count for a small in-memory reader.
func TestXxHashFromReader(t *testing.T) {
	c := qt.New(t)

	const input = "Hello World"
	sum, n, err := XXHashFromReader(strings.NewReader(input))

	c.Assert(err, qt.IsNil)
	c.Assert(n, qt.Equals, int64(len(input)))
	c.Assert(sum, qt.Equals, uint64(7148569436472236994))
}
// TestXxHashFromString checks XXHashFromString against a known hash value.
func TestXxHashFromString(t *testing.T) {
	c := qt.New(t)

	sum, err := XXHashFromString("Hello World")

	c.Assert(err, qt.IsNil)
	c.Assert(sum, qt.Equals, uint64(7148569436472236994))
}
// TestXxHashFromStringHexEncoded checks the hex encoded hash against an
// externally computed reference value.
func TestXxHashFromStringHexEncoded(t *testing.T) {
	c := qt.New(t)

	const input = "The quick brown fox jumps over the lazy dog"
	hexSum := XxHashFromStringHexEncoded(input)

	// Facit: https://asecuritysite.com/encryption/xxhash?val=The%20quick%20brown%20fox%20jumps%20over%20the%20lazy%20dog
	c.Assert(hexSum, qt.Equals, "0b242d361fda71bc")
}
// BenchmarkXXHashFromReader measures XXHashFromReader on a small reader,
// rewinding the reader after every iteration.
func BenchmarkXXHashFromReader(b *testing.B) {
	input := strings.NewReader("Hello World")
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		XXHashFromReader(input)
		// Rewind so the next iteration hashes the same data.
		input.Seek(0, 0)
	}
}
// BenchmarkXXHashFromString measures XXHashFromString on a short string.
func BenchmarkXXHashFromString(b *testing.B) {
	const input = "Hello World"
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		XXHashFromString(input)
	}
}
// BenchmarkXXHashFromStringHexEncoded measures XxHashFromStringHexEncoded
// on a short sentence.
func BenchmarkXXHashFromStringHexEncoded(b *testing.B) {
	const input = "The quick brown fox jumps over the lazy dog"
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		XxHashFromStringHexEncoded(input)
	}
}
// xxHashFromString returns the 64-bit xxHash of f using a fresh digest.
func xxHashFromString(f string) uint64 {
	digest := xxhash.New()
	digest.WriteString(f)
	sum := digest.Sum64()
	return sum
}

View file

@ -27,12 +27,11 @@ import (
"unicode"
"unicode/utf8"
"github.com/cespare/xxhash/v2"
bp "github.com/gohugoio/hugo/bufferpool"
"github.com/spf13/afero"
"github.com/jdkato/prose/transform"
bp "github.com/gohugoio/hugo/bufferpool"
)
// FilePathSeparator as defined by os.Separator.
@ -258,13 +257,7 @@ func SliceToLower(s []string) []string {
return l
}
// XxHashString takes a string and returns its xxHash hash.
func XxHashString(f string) string {
h := xxhash.New()
h.WriteString(f)
hash := h.Sum(nil)
return hex.EncodeToString(hash)
}
// XXHashFromReader creates a xxHash hash from the given reader.
// MD5String takes a string and returns its MD5 hash.
func MD5String(f string) string {

View file

@ -493,7 +493,7 @@ func (i *imageResource) relTargetPathFromConfig(conf images.ImageConfig) interna
}
h := i.hash()
idStr := fmt.Sprintf("_hu%s_%d", h, i.size())
idStr := fmt.Sprintf("_hu%d_%d", h, i.size())
// Do not change for no good reason.
const md5Threshold = 100

View file

@ -26,6 +26,7 @@ import (
"github.com/gohugoio/hugo/identity"
"github.com/gohugoio/hugo/resources/internal"
"github.com/gohugoio/hugo/common/hashing"
"github.com/gohugoio/hugo/common/herrors"
"github.com/gohugoio/hugo/common/paths"
@ -307,7 +308,7 @@ type fileInfo interface {
}
type hashProvider interface {
hash() string
hash() uint64
}
var _ resource.StaleInfo = (*StaleValue[any])(nil)
@ -403,7 +404,7 @@ func (l *genericResource) size() int64 {
return l.h.size
}
func (l *genericResource) hash() string {
func (l *genericResource) hash() uint64 {
if err := l.h.init(l); err != nil {
panic(err)
}
@ -628,7 +629,7 @@ type targetPather interface {
}
type resourceHash struct {
value string
value uint64
size int64
initOnce sync.Once
}
@ -636,7 +637,7 @@ type resourceHash struct {
func (r *resourceHash) init(l hugio.ReadSeekCloserProvider) error {
var initErr error
r.initOnce.Do(func() {
var hash string
var hash uint64
var size int64
f, err := l.ReadSeekCloser()
if err != nil {
@ -656,6 +657,6 @@ func (r *resourceHash) init(l hugio.ReadSeekCloserProvider) error {
return initErr
}
func hashImage(r io.ReadSeeker) (string, int64, error) {
return helpers.MD5FromReaderFast(r)
func hashImage(r io.ReadSeeker) (uint64, int64, error) {
return hashing.XXHashFromReader(r)
}

View file

@ -16,10 +16,9 @@ package hash
import (
"context"
"encoding/hex"
"hash/fnv"
"github.com/cespare/xxhash/v2"
"github.com/gohugoio/hugo/common/hashing"
"github.com/gohugoio/hugo/deps"
"github.com/gohugoio/hugo/tpl/internal"
"github.com/spf13/cast"
@ -51,14 +50,7 @@ func (ns *Namespace) XxHash(v any) (string, error) {
return "", err
}
hasher := xxhash.New()
_, err = hasher.WriteString(conv)
if err != nil {
return "", err
}
hash := hasher.Sum(nil)
return hex.EncodeToString(hash), nil
return hashing.XxHashFromStringHexEncoded(conv), nil
}
const name = "hash"

View file

@ -18,12 +18,12 @@ import (
"fmt"
"strings"
"github.com/gohugoio/hugo/helpers"
htmltemplate "github.com/gohugoio/hugo/tpl/internal/go_templates/htmltemplate"
texttemplate "github.com/gohugoio/hugo/tpl/internal/go_templates/texttemplate"
"github.com/gohugoio/hugo/tpl/internal/go_templates/texttemplate/parse"
"github.com/gohugoio/hugo/common/hashing"
"github.com/gohugoio/hugo/common/maps"
"github.com/gohugoio/hugo/tpl"
"github.com/mitchellh/mapstructure"
@ -254,7 +254,7 @@ func (c *templateContext) handleDefer(withNode *parse.WithNode) {
c.err = errors.New("resources.PostProcess cannot be used in a deferred template")
return
}
innerHash := helpers.XxHashString(s)
innerHash := hashing.XxHashFromStringHexEncoded(s)
deferredID := tpl.HugoDeferredTemplatePrefix + innerHash
c.deferNodes[deferredID] = inner