mirror of
https://github.com/gohugoio/hugo.git
synced 2024-11-07 20:30:36 -05:00
resource: Use MD5 to identify image files
But only a set of byte chunks spread around in the image file to calculate the fingerprint, which is much faster than reading the whole file: ```bash BenchmarkMD5FromFileFast/full=false-4 300000 4356 ns/op 240 B/op 5 allocs/op BenchmarkMD5FromFileFast/full=true-4 30000 42899 ns/op 32944 B/op 5 allocs/op ``` Fixes #4186
This commit is contained in:
parent
7e76a6fd3b
commit
e50a8c7a14
7 changed files with 176 additions and 26 deletions
6
Gopkg.lock
generated
6
Gopkg.lock
generated
|
@ -240,8 +240,8 @@
|
|||
".",
|
||||
"mem"
|
||||
]
|
||||
revision = "8d919cbe7e2627e417f3e45c3c0e489a5b7e2536"
|
||||
version = "v1.0.0"
|
||||
revision = "ec3a3111d1e1bdff38a61e09d5a5f5e974905611"
|
||||
version = "v1.0.1"
|
||||
|
||||
[[projects]]
|
||||
name = "github.com/spf13/cast"
|
||||
|
@ -369,6 +369,6 @@
|
|||
[solve-meta]
|
||||
analyzer-name = "dep"
|
||||
analyzer-version = 1
|
||||
inputs-digest = "2d9c34c260bc26814a0635c93009daeb9d8ffa56c29c0cff6827ae2d3e9ef96d"
|
||||
inputs-digest = "7259b4caf8e75db0b809f06d4897dc870261252e3aecd68ea1348c87a5da9d50"
|
||||
solver-name = "gps-cdcl"
|
||||
solver-version = 1
|
||||
|
|
|
@ -82,7 +82,7 @@
|
|||
|
||||
[[constraint]]
|
||||
name = "github.com/spf13/afero"
|
||||
version = "1.0.0"
|
||||
version = "1.0.1"
|
||||
|
||||
[[constraint]]
|
||||
name = "github.com/spf13/cast"
|
||||
|
|
|
@ -26,6 +26,8 @@ import (
|
|||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/spf13/afero"
|
||||
|
||||
"github.com/jdkato/prose/transform"
|
||||
|
||||
bp "github.com/gohugoio/hugo/bufferpool"
|
||||
|
@ -372,6 +374,57 @@ func MD5String(f string) string {
|
|||
return hex.EncodeToString(h.Sum([]byte{}))
|
||||
}
|
||||
|
||||
// MD5FromFileFast creates a MD5 hash from the given file. It only reads parts of
|
||||
// the file for speed, so don't use it if the files are very subtly different.
|
||||
// It will not close the file.
|
||||
func MD5FromFileFast(f afero.File) (string, error) {
|
||||
const (
|
||||
// Do not change once set in stone!
|
||||
maxChunks = 8
|
||||
peekSize = 64
|
||||
seek = 2048
|
||||
)
|
||||
|
||||
h := md5.New()
|
||||
buff := make([]byte, peekSize)
|
||||
|
||||
for i := 0; i < maxChunks; i++ {
|
||||
if i > 0 {
|
||||
_, err := f.Seek(seek, 0)
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
|
||||
_, err := io.ReadAtLeast(f, buff, peekSize)
|
||||
if err != nil {
|
||||
if err == io.EOF || err == io.ErrUnexpectedEOF {
|
||||
h.Write(buff)
|
||||
break
|
||||
}
|
||||
return "", err
|
||||
}
|
||||
h.Write(buff)
|
||||
}
|
||||
|
||||
h.Write(buff)
|
||||
|
||||
return hex.EncodeToString(h.Sum(nil)), nil
|
||||
}
|
||||
|
||||
// MD5FromFile creates a MD5 hash from the given file.
|
||||
// It will not close the file.
|
||||
func MD5FromFile(f afero.File) (string, error) {
|
||||
h := md5.New()
|
||||
if _, err := io.Copy(h, f); err != nil {
|
||||
return "", nil
|
||||
}
|
||||
return hex.EncodeToString(h.Sum(nil)), nil
|
||||
}
|
||||
|
||||
// IsWhitespace determines if the given rune is whitespace.
|
||||
func IsWhitespace(r rune) bool {
|
||||
return r == ' ' || r == '\t' || r == '\n' || r == '\r'
|
||||
|
|
|
@ -14,10 +14,12 @@
|
|||
package helpers
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/spf13/afero"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
@ -270,3 +272,91 @@ func TestToLowerMap(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFastMD5FromFile(t *testing.T) {
|
||||
fs := afero.NewMemMapFs()
|
||||
|
||||
if err := afero.WriteFile(fs, "small.txt", []byte("abc"), 0777); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := afero.WriteFile(fs, "small2.txt", []byte("abd"), 0777); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := afero.WriteFile(fs, "bigger.txt", []byte(strings.Repeat("a bc d e", 100)), 0777); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := afero.WriteFile(fs, "bigger2.txt", []byte(strings.Repeat("c d e f g", 100)), 0777); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
req := require.New(t)
|
||||
|
||||
sf1, err := fs.Open("small.txt")
|
||||
req.NoError(err)
|
||||
sf2, err := fs.Open("small2.txt")
|
||||
req.NoError(err)
|
||||
|
||||
bf1, err := fs.Open("bigger.txt")
|
||||
req.NoError(err)
|
||||
bf2, err := fs.Open("bigger2.txt")
|
||||
req.NoError(err)
|
||||
|
||||
defer sf1.Close()
|
||||
defer sf2.Close()
|
||||
defer bf1.Close()
|
||||
defer bf2.Close()
|
||||
|
||||
m1, err := MD5FromFileFast(sf1)
|
||||
req.NoError(err)
|
||||
req.Equal("308d8a1127b46524b51507424071c22c", m1)
|
||||
|
||||
m2, err := MD5FromFileFast(sf2)
|
||||
req.NoError(err)
|
||||
req.NotEqual(m1, m2)
|
||||
|
||||
m3, err := MD5FromFileFast(bf1)
|
||||
req.NoError(err)
|
||||
req.NotEqual(m2, m3)
|
||||
|
||||
m4, err := MD5FromFileFast(bf2)
|
||||
req.NoError(err)
|
||||
req.NotEqual(m3, m4)
|
||||
|
||||
m5, err := MD5FromFile(bf2)
|
||||
req.NoError(err)
|
||||
req.NotEqual(m4, m5)
|
||||
}
|
||||
|
||||
func BenchmarkMD5FromFileFast(b *testing.B) {
|
||||
fs := afero.NewMemMapFs()
|
||||
|
||||
for _, full := range []bool{false, true} {
|
||||
b.Run(fmt.Sprintf("full=%t", full), func(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
b.StopTimer()
|
||||
if err := afero.WriteFile(fs, "file.txt", []byte(strings.Repeat("1234567890", 2000)), 0777); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
f, err := fs.Open("file.txt")
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
b.StartTimer()
|
||||
if full {
|
||||
if _, err := MD5FromFile(f); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
} else {
|
||||
if _, err := MD5FromFileFast(f); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
f.Close()
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -112,6 +112,8 @@ type Image struct {
|
|||
|
||||
imaging *Imaging
|
||||
|
||||
hash string
|
||||
|
||||
*genericResource
|
||||
}
|
||||
|
||||
|
@ -129,6 +131,7 @@ func (i *Image) Height() int {
|
|||
func (i *Image) WithNewBase(base string) Resource {
|
||||
return &Image{
|
||||
imaging: i.imaging,
|
||||
hash: i.hash,
|
||||
genericResource: i.genericResource.WithNewBase(base).(*genericResource)}
|
||||
}
|
||||
|
||||
|
@ -490,6 +493,7 @@ func (i *Image) clone() *Image {
|
|||
|
||||
return &Image{
|
||||
imaging: i.imaging,
|
||||
hash: i.hash,
|
||||
genericResource: &g}
|
||||
}
|
||||
|
||||
|
@ -497,20 +501,11 @@ func (i *Image) setBasePath(conf imageConfig) {
|
|||
i.rel = i.filenameFromConfig(conf)
|
||||
}
|
||||
|
||||
// We need to set this to something static during tests.
|
||||
var fiModTimeFunc = func(fi os.FileInfo) int64 {
|
||||
return fi.ModTime().Unix()
|
||||
}
|
||||
|
||||
func (i *Image) filenameFromConfig(conf imageConfig) string {
|
||||
p1, p2 := helpers.FileAndExt(i.rel)
|
||||
sizeModeStr := fmt.Sprintf("_S%d_T%d", i.osFileInfo.Size(), fiModTimeFunc(i.osFileInfo))
|
||||
// On scaling an already scaled image, we get the file info from the original.
|
||||
// Repeating the same info in the filename makes it stuttery for no good reason.
|
||||
if strings.Contains(p1, sizeModeStr) {
|
||||
sizeModeStr = ""
|
||||
}
|
||||
idStr := fmt.Sprintf("_H%s_%d", i.hash, i.osFileInfo.Size())
|
||||
|
||||
// Do not change for no good reason.
|
||||
const md5Threshold = 100
|
||||
|
||||
key := conf.key()
|
||||
|
@ -518,12 +513,16 @@ func (i *Image) filenameFromConfig(conf imageConfig) string {
|
|||
// It is useful to have the key in clear text, but when nesting transforms, it
|
||||
// can easily be too long to read, and maybe even too long
|
||||
// for the different OSes to handle.
|
||||
if len(p1)+len(sizeModeStr)+len(p2) > md5Threshold {
|
||||
if len(p1)+len(idStr)+len(p2) > md5Threshold {
|
||||
key = helpers.MD5String(p1 + key + p2)
|
||||
p1 = p1[:strings.Index(p1, "_S")]
|
||||
p1 = p1[:strings.Index(p1, "_H")]
|
||||
} else if strings.Contains(p1, idStr) {
|
||||
// On scaling an already scaled image, we get the file info from the original.
|
||||
// Repeating the same info in the filename makes it stuttery for no good reason.
|
||||
idStr = ""
|
||||
}
|
||||
|
||||
return fmt.Sprintf("%s%s_%s%s", p1, sizeModeStr, key, p2)
|
||||
return fmt.Sprintf("%s%s_%s%s", p1, idStr, key, p2)
|
||||
}
|
||||
|
||||
func decodeImaging(m map[string]interface{}) (Imaging, error) {
|
||||
|
|
|
@ -15,7 +15,6 @@ package resource
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
@ -52,9 +51,6 @@ func TestParseImageConfig(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestImageTransform(t *testing.T) {
|
||||
fiModTimeFunc = func(fi os.FileInfo) int64 {
|
||||
return int64(10111213)
|
||||
}
|
||||
|
||||
assert := require.New(t)
|
||||
|
||||
|
@ -86,13 +82,13 @@ func TestImageTransform(t *testing.T) {
|
|||
assert.Equal(200, resizedAndRotated.Height())
|
||||
assertFileCache(assert, image.spec.Fs, resizedAndRotated.RelPermalink(), 125, 200)
|
||||
|
||||
assert.Equal("/a/sunset_S90587_T10111213_300x200_resize_q75_box_center.jpg", resized.RelPermalink())
|
||||
assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_300x200_resize_q75_box_center.jpg", resized.RelPermalink())
|
||||
assert.Equal(300, resized.Width())
|
||||
assert.Equal(200, resized.Height())
|
||||
|
||||
fitted, err := resized.Fit("50x50")
|
||||
assert.NoError(err)
|
||||
assert.Equal("/a/sunset_S90587_T10111213_300x200_resize_q75_box_center_50x50_fit_q75_box_center.jpg", fitted.RelPermalink())
|
||||
assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_9b37eba4e4e6ea0cc56a59bb5aa98143.jpg", fitted.RelPermalink())
|
||||
assert.Equal(50, fitted.Width())
|
||||
assert.Equal(31, fitted.Height())
|
||||
|
||||
|
@ -100,13 +96,13 @@ func TestImageTransform(t *testing.T) {
|
|||
fittedAgain, _ := fitted.Fit("10x20")
|
||||
fittedAgain, err = fittedAgain.Fit("10x20")
|
||||
assert.NoError(err)
|
||||
assert.Equal("/a/sunset_f1fb715a17c42d5d4602a1870424d590.jpg", fittedAgain.RelPermalink())
|
||||
assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_9a8be1402216c385e0dfd73e267c6827.jpg", fittedAgain.RelPermalink())
|
||||
assert.Equal(10, fittedAgain.Width())
|
||||
assert.Equal(6, fittedAgain.Height())
|
||||
|
||||
filled, err := image.Fill("200x100 bottomLeft")
|
||||
assert.NoError(err)
|
||||
assert.Equal("/a/sunset_S90587_T10111213_200x100_fill_q75_box_bottomleft.jpg", filled.RelPermalink())
|
||||
assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_200x100_fill_q75_box_bottomleft.jpg", filled.RelPermalink())
|
||||
assert.Equal(200, filled.Width())
|
||||
assert.Equal(100, filled.Height())
|
||||
assertFileCache(assert, image.spec.Fs, filled.RelPermalink(), 200, 100)
|
||||
|
|
|
@ -153,7 +153,19 @@ func (r *Spec) newResource(
|
|||
gr := r.newGenericResource(linker, fi, absPublishDir, absSourceFilename, filepath.ToSlash(relTargetFilename), mimeType)
|
||||
|
||||
if mimeType == "image" {
|
||||
f, err := r.Fs.Source.Open(absSourceFilename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
hash, err := helpers.MD5FromFileFast(f)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &Image{
|
||||
hash: hash,
|
||||
imaging: r.imaging,
|
||||
genericResource: gr}, nil
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue