2018-12-20 19:22:03 +00:00
// Copyright 2019 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
2022-03-17 21:03:27 +00:00
//go:build !nodeploy
2020-10-23 07:03:41 +00:00
// +build !nodeploy
2018-12-20 19:22:03 +00:00
package deploy
import (
"bytes"
"compress/gzip"
"context"
"crypto/md5"
2022-04-05 08:42:54 +00:00
"encoding/hex"
all: Rework page store, add a dynacache, improve partial rebuilds, and some general spring cleaning
There are some breaking changes in this commit, see #11455.
Closes #11455
Closes #11549
This fixes a set of bugs (see issue list) and it is also paying some technical debt accumulated over the years. We now build with Staticcheck enabled in the CI build.
The performance should be about the same as before for regular sized Hugo sites, but it should perform and scale much better to larger data sets, as objects that use lots of memory (e.g. rendered Markdown, big JSON files read into maps with transform.Unmarshal etc.) will now get automatically garbage collected if needed. Performance on partial rebuilds when running the server in fast render mode should be the same, but the change detection should be much more accurate.
A list of the notable new features:
* A new dependency tracker that covers (almost) all of Hugo's API and is used to do fine grained partial rebuilds when running the server.
* A new and simpler tree document store which allows fast lookups and prefix-walking in all dimensions (e.g. language) concurrently.
* You can now configure an upper memory limit allowing for much larger data sets and/or running on lower specced PCs.
We have lifted the "no resources in sub folders" restriction for branch bundles (e.g. sections).
Memory Limit
* Hugo will, by default, set aside a quarter of the total system memory, but you can set this via the OS environment variable HUGO_MEMORYLIMIT (in gigabytes). This is backed by a partitioned LRU cache used throughout Hugo. The cache gets dynamically resized in low memory situations, allowing Go's Garbage Collector to free the memory.
New Dependency Tracker: Hugo has had a rule based coarse grained approach to server rebuilds that has worked mostly pretty well, but there have been some surprises (e.g. stale content). This is now revamped with a new dependency tracker that can quickly calculate the delta given a changed resource (e.g. a content file, template, JS file etc.). This handles transitive relations, e.g. $page -> js.Build -> JS import, or $page1.Content -> render hook -> site.GetPage -> $page2.Title, or $page1.Content -> shortcode -> partial -> site.RegularPages -> $page2.Content -> shortcode ..., and should also handle changes to aggregated values (e.g. site.Lastmod) effectively.
This covers all of Hugo's API with 2 known exceptions (a list that may not be fully exhaustive):
Changes to files loaded with template func os.ReadFile may not be handled correctly. We recommend loading resources with resources.Get
Changes to Hugo objects (e.g. Page) passed in the template context to lang.Translate may not be detected correctly. We recommend having simple i18n templates without too much data context passed in other than simple types such as strings and numbers.
Note that the cachebuster configuration (when A changes then rebuild B) works well with the above, but we recommend that you revise that configuration, as it in most situations should not be needed. One example where it is still needed is with TailwindCSS and using changes to hugo_stats.json to trigger new CSS rebuilds.
Document Store: Previously, a little simplified, we split the document store (where we store pages and resources) in a tree per language. This worked pretty well, but the structure made some operations harder than they needed to be. We have now restructured it into one Radix tree for all languages. Internally the language is considered to be a dimension of that tree, and the tree can be viewed in all dimensions concurrently. This makes some operations re. language simpler (e.g. finding translations is just a slice range), but the idea is that it should also be relatively inexpensive to add more dimensions if needed (e.g. role).
Fixes #10169
Fixes #10364
Fixes #10482
Fixes #10630
Fixes #10656
Fixes #10694
Fixes #10918
Fixes #11262
Fixes #11439
Fixes #11453
Fixes #11457
Fixes #11466
Fixes #11540
Fixes #11551
Fixes #11556
Fixes #11654
Fixes #11661
Fixes #11663
Fixes #11664
Fixes #11669
Fixes #11671
Fixes #11807
Fixes #11808
Fixes #11809
Fixes #11815
Fixes #11840
Fixes #11853
Fixes #11860
Fixes #11883
Fixes #11904
Fixes #7388
Fixes #7425
Fixes #7436
Fixes #7544
Fixes #7882
Fixes #7960
Fixes #8255
Fixes #8307
Fixes #8863
Fixes #8927
Fixes #9192
Fixes #9324
2023-12-24 18:11:05 +00:00
"errors"
2018-12-20 19:22:03 +00:00
"fmt"
"io"
"mime"
"os"
"path/filepath"
2019-05-03 16:30:46 +00:00
"regexp"
2018-12-20 19:22:03 +00:00
"runtime"
2019-05-03 16:30:46 +00:00
"sort"
2018-12-20 19:22:03 +00:00
"strings"
"sync"
"github.com/dustin/go-humanize"
2020-02-27 06:26:05 +00:00
"github.com/gobwas/glob"
2023-06-16 06:17:42 +00:00
"github.com/gohugoio/hugo/common/loggers"
2018-12-20 19:22:03 +00:00
"github.com/gohugoio/hugo/config"
2024-02-07 17:24:02 +00:00
"github.com/gohugoio/hugo/deploy/deployconfig"
2020-08-03 17:06:18 +00:00
"github.com/gohugoio/hugo/media"
2018-12-20 19:22:03 +00:00
"github.com/spf13/afero"
2019-05-03 20:28:35 +00:00
"golang.org/x/text/unicode/norm"
2018-12-20 19:22:03 +00:00
"gocloud.dev/blob"
2019-09-13 12:53:36 +00:00
_ "gocloud.dev/blob/fileblob" // import
_ "gocloud.dev/blob/gcsblob" // import
_ "gocloud.dev/blob/s3blob" // import
2021-04-29 03:32:19 +00:00
"gocloud.dev/gcerrors"
2018-12-20 19:22:03 +00:00
)
// Deployer supports deploying the site to target cloud providers.
type Deployer struct {
localFs afero . Fs
2019-05-03 23:38:05 +00:00
bucket * blob . Bucket
2018-12-20 19:22:03 +00:00
2023-01-04 17:24:36 +00:00
mediaTypes media . Types // Hugo's MediaType to guess ContentType
2023-06-16 06:17:42 +00:00
quiet bool // true reduces STDOUT // TODO(bep) remove, this is a global feature.
2023-01-04 17:24:36 +00:00
2024-02-07 17:24:02 +00:00
cfg deployconfig . DeployConfig
2023-06-16 06:17:42 +00:00
logger loggers . Logger
2023-01-04 17:24:36 +00:00
2024-02-07 17:24:02 +00:00
target * deployconfig . Target // the target to deploy to
2019-05-03 23:38:05 +00:00
// For tests...
summary deploySummary // summary of latest Deploy results
}
// deploySummary holds the counts recorded by the latest Deploy.
type deploySummary struct {
	NumLocal   int // local files found
	NumRemote  int // remote files found
	NumUploads int // files identified for upload
	NumDeletes int // files identified for deletion (0 when deletes are skipped)
}
2022-04-05 08:42:54 +00:00
// metaMD5Hash is the blob metadata key under which the hex-encoded MD5 hash
// of the uploaded content is stored.
const metaMD5Hash = "md5chksum"
2018-12-20 19:22:03 +00:00
// New constructs a new *Deployer.
2023-06-16 06:17:42 +00:00
func New ( cfg config . AllProvider , logger loggers . Logger , localFs afero . Fs ) ( * Deployer , error ) {
2024-02-07 17:24:02 +00:00
dcfg := cfg . GetConfigSection ( deployconfig . DeploymentConfigKey ) . ( deployconfig . DeployConfig )
2023-01-04 17:24:36 +00:00
targetName := dcfg . Target
2018-12-20 19:22:03 +00:00
2019-05-31 17:10:38 +00:00
if len ( dcfg . Targets ) == 0 {
return nil , errors . New ( "no deployment targets found" )
}
2023-01-04 17:24:36 +00:00
mediaTypes := cfg . GetConfigSection ( "mediaTypes" ) . ( media . Types )
2019-05-31 17:10:38 +00:00
2018-12-20 19:22:03 +00:00
// Find the target to deploy to.
2024-02-07 17:24:02 +00:00
var tgt * deployconfig . Target
2019-05-31 17:10:38 +00:00
if targetName == "" {
// Default to the first target.
tgt = dcfg . Targets [ 0 ]
} else {
for _ , t := range dcfg . Targets {
if t . Name == targetName {
tgt = t
}
}
if tgt == nil {
return nil , fmt . Errorf ( "deployment target %q not found" , targetName )
2018-12-20 19:22:03 +00:00
}
}
2020-08-03 17:06:18 +00:00
2018-12-20 19:22:03 +00:00
return & Deployer {
2023-01-04 17:24:36 +00:00
localFs : localFs ,
target : tgt ,
quiet : cfg . BuildExpired ( ) ,
mediaTypes : mediaTypes ,
cfg : dcfg ,
2018-12-20 19:22:03 +00:00
} , nil
}
2019-05-03 23:38:05 +00:00
func ( d * Deployer ) openBucket ( ctx context . Context ) ( * blob . Bucket , error ) {
if d . bucket != nil {
return d . bucket , nil
}
2023-06-16 06:17:42 +00:00
d . logger . Printf ( "Deploying to target %q (%s)\n" , d . target . Name , d . target . URL )
2019-05-03 23:38:05 +00:00
return blob . OpenBucket ( ctx , d . target . URL )
}
2018-12-20 19:22:03 +00:00
// Deploy deploys the site to a target.
func ( d * Deployer ) Deploy ( ctx context . Context ) error {
2023-06-16 06:17:42 +00:00
if d . logger == nil {
d . logger = loggers . NewDefault ( )
}
2019-05-03 23:38:05 +00:00
bucket , err := d . openBucket ( ctx )
2018-12-20 19:22:03 +00:00
if err != nil {
return err
}
2023-01-04 17:24:36 +00:00
if d . cfg . Workers <= 0 {
d . cfg . Workers = 10
}
2018-12-20 19:22:03 +00:00
// Load local files from the source directory.
2020-02-27 06:26:05 +00:00
var include , exclude glob . Glob
if d . target != nil {
2024-02-07 17:24:02 +00:00
include , exclude = d . target . IncludeGlob , d . target . ExcludeGlob
2020-02-27 06:26:05 +00:00
}
2023-06-16 06:17:42 +00:00
local , err := d . walkLocal ( d . localFs , d . cfg . Matchers , include , exclude , d . mediaTypes )
2018-12-20 19:22:03 +00:00
if err != nil {
return err
}
2023-06-16 06:17:42 +00:00
d . logger . Infof ( "Found %d local files.\n" , len ( local ) )
2019-05-03 23:38:05 +00:00
d . summary . NumLocal = len ( local )
2018-12-20 19:22:03 +00:00
// Load remote files from the target.
2023-06-16 06:17:42 +00:00
remote , err := d . walkRemote ( ctx , bucket , include , exclude )
2018-12-20 19:22:03 +00:00
if err != nil {
return err
}
2023-06-16 06:17:42 +00:00
d . logger . Infof ( "Found %d remote files.\n" , len ( remote ) )
2019-05-03 23:38:05 +00:00
d . summary . NumRemote = len ( remote )
2018-12-20 19:22:03 +00:00
// Diff local vs remote to see what changes need to be applied.
2023-06-16 06:17:42 +00:00
uploads , deletes := d . findDiffs ( local , remote , d . cfg . Force )
2019-05-03 23:38:05 +00:00
d . summary . NumUploads = len ( uploads )
d . summary . NumDeletes = len ( deletes )
2018-12-20 19:22:03 +00:00
if len ( uploads ) + len ( deletes ) == 0 {
if ! d . quiet {
2023-06-16 06:17:42 +00:00
d . logger . Println ( "No changes required." )
2018-12-20 19:22:03 +00:00
}
return nil
}
if ! d . quiet {
2023-06-16 06:17:42 +00:00
d . logger . Println ( summarizeChanges ( uploads , deletes ) )
2018-12-20 19:22:03 +00:00
}
// Ask for confirmation before proceeding.
2023-01-04 17:24:36 +00:00
if d . cfg . Confirm && ! d . cfg . DryRun {
2018-12-20 19:22:03 +00:00
fmt . Printf ( "Continue? (Y/n) " )
var confirm string
if _ , err := fmt . Scanln ( & confirm ) ; err != nil {
return err
}
if confirm != "" && confirm [ 0 ] != 'y' && confirm [ 0 ] != 'Y' {
return errors . New ( "aborted" )
}
}
2019-05-03 16:30:46 +00:00
// Order the uploads. They are organized in groups; all uploads in a group
// must be complete before moving on to the next group.
2024-02-07 17:24:02 +00:00
uploadGroups := applyOrdering ( d . cfg . Ordering , uploads )
2019-05-03 16:30:46 +00:00
2023-01-04 17:24:36 +00:00
nParallel := d . cfg . Workers
2018-12-20 19:22:03 +00:00
var errs [ ] error
var errMu sync . Mutex // protects errs
2019-05-03 16:30:46 +00:00
for _ , uploads := range uploadGroups {
// Short-circuit for an empty group.
if len ( uploads ) == 0 {
2018-12-20 19:22:03 +00:00
continue
}
2019-05-03 16:30:46 +00:00
// Within the group, apply uploads in parallel.
sem := make ( chan struct { } , nParallel )
for _ , upload := range uploads {
2023-01-04 17:24:36 +00:00
if d . cfg . DryRun {
2019-05-03 16:30:46 +00:00
if ! d . quiet {
2023-06-16 06:17:42 +00:00
d . logger . Printf ( "[DRY RUN] Would upload: %v\n" , upload )
2019-05-03 16:30:46 +00:00
}
continue
2018-12-20 19:22:03 +00:00
}
2019-05-03 16:30:46 +00:00
sem <- struct { } { }
go func ( upload * fileToUpload ) {
2023-06-16 06:17:42 +00:00
if err := d . doSingleUpload ( ctx , bucket , upload ) ; err != nil {
2019-05-03 16:30:46 +00:00
errMu . Lock ( )
defer errMu . Unlock ( )
errs = append ( errs , err )
}
<- sem
} ( upload )
}
// Wait for all uploads in the group to finish.
for n := nParallel ; n > 0 ; n -- {
sem <- struct { } { }
}
2018-12-20 19:22:03 +00:00
}
2023-01-04 17:24:36 +00:00
if d . cfg . MaxDeletes != - 1 && len ( deletes ) > d . cfg . MaxDeletes {
2023-06-16 06:17:42 +00:00
d . logger . Warnf ( "Skipping %d deletes because it is more than --maxDeletes (%d). If this is expected, set --maxDeletes to a larger number, or -1 to disable this check.\n" , len ( deletes ) , d . cfg . MaxDeletes )
2019-05-03 23:38:05 +00:00
d . summary . NumDeletes = 0
2018-12-20 19:22:03 +00:00
} else {
2019-05-03 16:30:46 +00:00
// Apply deletes in parallel.
sort . Slice ( deletes , func ( i , j int ) bool { return deletes [ i ] < deletes [ j ] } )
sem := make ( chan struct { } , nParallel )
2018-12-20 19:22:03 +00:00
for _ , del := range deletes {
2023-01-04 17:24:36 +00:00
if d . cfg . DryRun {
2018-12-20 19:22:03 +00:00
if ! d . quiet {
2023-06-16 06:17:42 +00:00
d . logger . Printf ( "[DRY RUN] Would delete %s\n" , del )
2018-12-20 19:22:03 +00:00
}
continue
}
sem <- struct { } { }
go func ( del string ) {
2023-06-16 06:17:42 +00:00
d . logger . Infof ( "Deleting %s...\n" , del )
2018-12-20 19:22:03 +00:00
if err := bucket . Delete ( ctx , del ) ; err != nil {
2021-04-29 03:32:19 +00:00
if gcerrors . Code ( err ) == gcerrors . NotFound {
2023-06-16 06:17:42 +00:00
d . logger . Warnf ( "Failed to delete %q because it wasn't found: %v" , del , err )
2021-04-29 03:32:19 +00:00
} else {
errMu . Lock ( )
defer errMu . Unlock ( )
errs = append ( errs , err )
}
2018-12-20 19:22:03 +00:00
}
<- sem
} ( del )
}
2019-05-03 16:30:46 +00:00
// Wait for all deletes to finish.
for n := nParallel ; n > 0 ; n -- {
sem <- struct { } { }
}
2018-12-20 19:22:03 +00:00
}
2023-01-04 17:24:36 +00:00
2018-12-20 19:22:03 +00:00
if len ( errs ) > 0 {
if ! d . quiet {
2023-06-16 06:17:42 +00:00
d . logger . Printf ( "Encountered %d errors.\n" , len ( errs ) )
2018-12-20 19:22:03 +00:00
}
return errs [ 0 ]
}
if ! d . quiet {
2023-06-16 06:17:42 +00:00
d . logger . Println ( "Success!" )
2018-12-20 19:22:03 +00:00
}
2023-01-04 17:24:36 +00:00
if d . cfg . InvalidateCDN {
2019-10-03 12:46:27 +00:00
if d . target . CloudFrontDistributionID != "" {
2023-01-04 17:24:36 +00:00
if d . cfg . DryRun {
2020-10-27 19:41:15 +00:00
if ! d . quiet {
2023-06-16 06:17:42 +00:00
d . logger . Printf ( "[DRY RUN] Would invalidate CloudFront CDN with ID %s\n" , d . target . CloudFrontDistributionID )
2020-10-27 19:41:15 +00:00
}
} else {
2023-06-16 06:17:42 +00:00
d . logger . Println ( "Invalidating CloudFront CDN..." )
2023-07-08 12:00:45 +00:00
if err := InvalidateCloudFront ( ctx , d . target ) ; err != nil {
2023-06-16 06:17:42 +00:00
d . logger . Printf ( "Failed to invalidate CloudFront CDN: %v\n" , err )
2020-10-27 19:41:15 +00:00
return err
}
2019-10-03 12:46:27 +00:00
}
}
if d . target . GoogleCloudCDNOrigin != "" {
2023-01-04 17:24:36 +00:00
if d . cfg . DryRun {
2020-10-27 19:41:15 +00:00
if ! d . quiet {
2023-06-16 06:17:42 +00:00
d . logger . Printf ( "[DRY RUN] Would invalidate Google Cloud CDN with origin %s\n" , d . target . GoogleCloudCDNOrigin )
2020-10-27 19:41:15 +00:00
}
} else {
2023-06-16 06:17:42 +00:00
d . logger . Println ( "Invalidating Google Cloud CDN..." )
2020-10-27 19:41:15 +00:00
if err := InvalidateGoogleCloudCDN ( ctx , d . target . GoogleCloudCDNOrigin ) ; err != nil {
2023-06-16 06:17:42 +00:00
d . logger . Printf ( "Failed to invalidate Google Cloud CDN: %v\n" , err )
2020-10-27 19:41:15 +00:00
return err
}
2019-10-03 12:46:27 +00:00
}
2019-05-01 20:25:06 +00:00
}
2023-06-16 06:17:42 +00:00
d . logger . Println ( "Success!" )
2019-05-01 20:25:06 +00:00
}
2018-12-20 19:22:03 +00:00
return nil
}
// summarizeChanges creates a text description of the proposed changes: the
// number of uploads with their combined upload size, and the number of
// deletes.
func summarizeChanges(uploads []*fileToUpload, deletes []string) string {
	var total int64
	for i := range uploads {
		total += uploads[i].Local.UploadSize
	}
	size := humanize.Bytes(uint64(total))
	return fmt.Sprintf("Identified %d file(s) to upload, totaling %s, and %d file(s) to delete.", len(uploads), size, len(deletes))
}
// doSingleUpload executes a single file upload.
2023-06-16 06:17:42 +00:00
func ( d * Deployer ) doSingleUpload ( ctx context . Context , bucket * blob . Bucket , upload * fileToUpload ) error {
d . logger . Infof ( "Uploading %v...\n" , upload )
2018-12-20 19:22:03 +00:00
opts := & blob . WriterOptions {
CacheControl : upload . Local . CacheControl ( ) ,
ContentEncoding : upload . Local . ContentEncoding ( ) ,
ContentType : upload . Local . ContentType ( ) ,
2022-04-05 08:42:54 +00:00
Metadata : map [ string ] string { metaMD5Hash : hex . EncodeToString ( upload . Local . MD5 ( ) ) } ,
2018-12-20 19:22:03 +00:00
}
2019-05-03 23:38:05 +00:00
w , err := bucket . NewWriter ( ctx , upload . Local . SlashPath , opts )
2018-12-20 19:22:03 +00:00
if err != nil {
return err
}
2019-05-03 23:38:05 +00:00
r , err := upload . Local . Reader ( )
if err != nil {
return err
}
defer r . Close ( )
_ , err = io . Copy ( w , r )
2018-12-20 19:22:03 +00:00
if err != nil {
return err
}
if err := w . Close ( ) ; err != nil {
return err
}
return nil
}
// localFile represents a local file from the source. Use newLocalFile to
// construct one.
type localFile struct {
2019-05-03 23:38:05 +00:00
// NativePath is the native path to the file (using file.Separator).
NativePath string
// SlashPath is NativePath converted to use /.
SlashPath string
2018-12-20 19:22:03 +00:00
// UploadSize is the size of the content to be uploaded. It may not
// be the same as the local file size if the content will be
// gzipped before upload.
UploadSize int64
2020-08-03 17:06:18 +00:00
fs afero . Fs
2024-02-07 17:24:02 +00:00
matcher * deployconfig . Matcher
2020-08-03 17:06:18 +00:00
md5 [ ] byte // cache
gzipped bytes . Buffer // cached of gzipped contents if gzipping
mediaTypes media . Types
2018-12-20 19:22:03 +00:00
}
// newLocalFile initializes a *localFile.
2024-02-07 17:24:02 +00:00
func newLocalFile ( fs afero . Fs , nativePath , slashpath string , m * deployconfig . Matcher , mt media . Types ) ( * localFile , error ) {
2019-05-03 23:38:05 +00:00
f , err := fs . Open ( nativePath )
2018-12-20 19:22:03 +00:00
if err != nil {
return nil , err
}
2019-05-03 23:38:05 +00:00
defer f . Close ( )
lf := & localFile {
NativePath : nativePath ,
SlashPath : slashpath ,
fs : fs ,
matcher : m ,
2020-08-03 17:06:18 +00:00
mediaTypes : mt ,
2018-12-20 19:22:03 +00:00
}
if m != nil && m . Gzip {
2019-05-03 23:38:05 +00:00
// We're going to gzip the content. Do it once now, and cache the result
// in gzipped. The UploadSize is the size of the gzipped content.
gz := gzip . NewWriter ( & lf . gzipped )
if _ , err := io . Copy ( gz , f ) ; err != nil {
return nil , err
}
if err := gz . Close ( ) ; err != nil {
return nil , err
}
lf . UploadSize = int64 ( lf . gzipped . Len ( ) )
} else {
// Raw content. Just get the UploadSize.
info , err := f . Stat ( )
if err != nil {
return nil , err
}
lf . UploadSize = info . Size ( )
}
return lf , nil
}
// Reader returns an io.ReadCloser for reading the content to be uploaded.
// The caller must call Close on the returned ReaderCloser.
// The reader content may not be the same as the local file content due to
// gzipping.
func ( lf * localFile ) Reader ( ) ( io . ReadCloser , error ) {
if lf . matcher != nil && lf . matcher . Gzip {
// We've got the gzipped contents cached in gzipped.
// Note: we can't use lf.gzipped directly as a Reader, since we it discards
// data after it is read, and we may read it more than once.
2023-02-18 22:43:26 +00:00
return io . NopCloser ( bytes . NewReader ( lf . gzipped . Bytes ( ) ) ) , nil
2019-05-03 23:38:05 +00:00
}
// Not expected to fail since we did it successfully earlier in newLocalFile,
// but could happen due to changes in the underlying filesystem.
return lf . fs . Open ( lf . NativePath )
2018-12-20 19:22:03 +00:00
}
// CacheControl returns the Cache-Control header to use for lf, taken from its
// matcher, or the empty string when lf has no matcher.
func (lf *localFile) CacheControl() string {
	if m := lf.matcher; m != nil {
		return m.CacheControl
	}
	return ""
}
// ContentEncoding returns the Content-Encoding header to use for lf: "gzip"
// when the matcher asks for gzipping, otherwise the matcher's configured
// Content-Encoding. It is empty when lf has no matcher.
func (lf *localFile) ContentEncoding() string {
	m := lf.matcher
	switch {
	case m == nil:
		return ""
	case m.Gzip:
		return "gzip"
	default:
		return m.ContentEncoding
	}
}
// ContentType returns the Content-Type header to use for lf.
// It first checks if there's a Content-Type header configured via a matching
// matcher; if not, it tries to generate one based on the filename extension.
// If this fails, the Content-Type will be the empty string. In this case, Go
// Cloud will automatically try to infer a Content-Type based on the file
// content.
func ( lf * localFile ) ContentType ( ) string {
if lf . matcher != nil && lf . matcher . ContentType != "" {
return lf . matcher . ContentType
}
2020-08-03 17:06:18 +00:00
ext := filepath . Ext ( lf . NativePath )
2021-03-11 08:18:01 +00:00
if mimeType , _ , found := lf . mediaTypes . GetFirstBySuffix ( strings . TrimPrefix ( ext , "." ) ) ; found {
2023-01-04 17:24:36 +00:00
return mimeType . Type
2020-08-03 17:06:18 +00:00
}
return mime . TypeByExtension ( ext )
2018-12-20 19:22:03 +00:00
}
// Force reports whether the file should be re-uploaded unconditionally, as
// requested by its matcher. It is false when lf has no matcher.
func (lf *localFile) Force() bool {
	if lf.matcher == nil {
		return false
	}
	return lf.matcher.Force
}
// MD5 returns an MD5 hash of the content to be uploaded.
func ( lf * localFile ) MD5 ( ) [ ] byte {
if len ( lf . md5 ) > 0 {
return lf . md5
}
2019-05-03 23:38:05 +00:00
h := md5 . New ( )
r , err := lf . Reader ( )
2018-12-20 19:22:03 +00:00
if err != nil {
return nil
}
2019-05-03 23:38:05 +00:00
defer r . Close ( )
2018-12-20 19:22:03 +00:00
if _ , err := io . Copy ( h , r ) ; err != nil {
return nil
}
lf . md5 = h . Sum ( nil )
return lf . md5
}
2020-05-13 23:20:52 +00:00
// knownHiddenDirectory reports whether name is one of the well known hidden
// directories.
func knownHiddenDirectory(name string) bool {
	switch name {
	case ".well-known":
		return true
	default:
		return false
	}
}
2019-05-03 23:38:05 +00:00
// walkLocal walks the source directory and returns a flat list of files,
// using localFile.SlashPath as the map keys.
2024-02-07 17:24:02 +00:00
func ( d * Deployer ) walkLocal ( fs afero . Fs , matchers [ ] * deployconfig . Matcher , include , exclude glob . Glob , mediaTypes media . Types ) ( map [ string ] * localFile , error ) {
2018-12-20 19:22:03 +00:00
retval := map [ string ] * localFile { }
err := afero . Walk ( fs , "" , func ( path string , info os . FileInfo , err error ) error {
if err != nil {
return err
}
if info . IsDir ( ) {
// Skip hidden directories.
if path != "" && strings . HasPrefix ( info . Name ( ) , "." ) {
2020-05-13 23:20:52 +00:00
// Except for specific hidden directories
if ! knownHiddenDirectory ( info . Name ( ) ) {
return filepath . SkipDir
}
2018-12-20 19:22:03 +00:00
}
return nil
}
// .DS_Store is an internal MacOS attribute file; skip it.
if info . Name ( ) == ".DS_Store" {
return nil
}
// When a file system is HFS+, its filepath is in NFD form.
if runtime . GOOS == "darwin" {
path = norm . NFC . String ( path )
}
2020-02-27 06:26:05 +00:00
// Check include/exclude matchers.
2019-05-03 23:38:05 +00:00
slashpath := filepath . ToSlash ( path )
2020-02-27 06:26:05 +00:00
if include != nil && ! include . Match ( slashpath ) {
2023-06-16 06:17:42 +00:00
d . logger . Infof ( " dropping %q due to include\n" , slashpath )
2020-02-27 06:26:05 +00:00
return nil
}
if exclude != nil && exclude . Match ( slashpath ) {
2023-06-16 06:17:42 +00:00
d . logger . Infof ( " dropping %q due to exclude\n" , slashpath )
2020-02-27 06:26:05 +00:00
return nil
}
// Find the first matching matcher (if any).
2024-02-07 17:24:02 +00:00
var m * deployconfig . Matcher
2018-12-20 19:22:03 +00:00
for _ , cur := range matchers {
2019-05-03 23:38:05 +00:00
if cur . Matches ( slashpath ) {
2018-12-20 19:22:03 +00:00
m = cur
break
}
}
2020-08-03 17:06:18 +00:00
lf , err := newLocalFile ( fs , path , slashpath , m , mediaTypes )
2018-12-20 19:22:03 +00:00
if err != nil {
return err
}
2019-05-03 23:38:05 +00:00
retval [ lf . SlashPath ] = lf
2018-12-20 19:22:03 +00:00
return nil
} )
if err != nil {
return nil , err
}
return retval , nil
}
// walkRemote walks the target bucket and returns a flat list.
2023-06-16 06:17:42 +00:00
func ( d * Deployer ) walkRemote ( ctx context . Context , bucket * blob . Bucket , include , exclude glob . Glob ) ( map [ string ] * blob . ListObject , error ) {
2018-12-20 19:22:03 +00:00
retval := map [ string ] * blob . ListObject { }
iter := bucket . List ( nil )
for {
obj , err := iter . Next ( ctx )
if err == io . EOF {
break
}
if err != nil {
return nil , err
}
2020-03-08 16:35:32 +00:00
// Check include/exclude matchers.
if include != nil && ! include . Match ( obj . Key ) {
2023-06-16 06:17:42 +00:00
d . logger . Infof ( " remote dropping %q due to include\n" , obj . Key )
2020-03-08 16:35:32 +00:00
continue
}
if exclude != nil && exclude . Match ( obj . Key ) {
2023-06-16 06:17:42 +00:00
d . logger . Infof ( " remote dropping %q due to exclude\n" , obj . Key )
2020-03-08 16:35:32 +00:00
continue
}
2022-04-05 08:42:54 +00:00
// If the remote didn't give us an MD5, use remote attributes MD5, if that doesn't exist compute one.
2019-05-03 20:28:35 +00:00
// This can happen for some providers (e.g., fileblob, which uses the
// local filesystem), but not for the most common Cloud providers
// (S3, GCS, Azure). Although, it can happen for S3 if the blob was uploaded
// via a multi-part upload.
// Although it's unfortunate to have to read the file, it's likely better
// than assuming a delta and re-uploading it.
if len ( obj . MD5 ) == 0 {
2022-04-05 08:42:54 +00:00
var attrMD5 [ ] byte
attrs , err := bucket . Attributes ( ctx , obj . Key )
2019-05-03 20:28:35 +00:00
if err == nil {
2022-04-05 08:42:54 +00:00
md5String , exists := attrs . Metadata [ metaMD5Hash ]
if exists {
attrMD5 , _ = hex . DecodeString ( md5String )
2019-05-03 20:28:35 +00:00
}
2022-04-05 08:42:54 +00:00
}
if len ( attrMD5 ) == 0 {
r , err := bucket . NewReader ( ctx , obj . Key , nil )
if err == nil {
h := md5 . New ( )
if _ , err := io . Copy ( h , r ) ; err == nil {
obj . MD5 = h . Sum ( nil )
}
r . Close ( )
}
} else {
obj . MD5 = attrMD5
2019-05-03 20:28:35 +00:00
}
}
2018-12-20 19:22:03 +00:00
retval [ obj . Key ] = obj
}
return retval , nil
}
// uploadReason is an enum of reasons why a file must be uploaded.
type uploadReason string

const (
	// reasonUnknown is the initial value before a reason has been determined.
	reasonUnknown uploadReason = "unknown"
	// reasonNotFound: the file does not exist at the target.
	reasonNotFound uploadReason = "not found at target"
	// reasonForce: a re-upload was forced, globally or by the file's matcher.
	reasonForce uploadReason = "--force"
	// reasonSize: the upload size differs from the remote size.
	reasonSize uploadReason = "size differs"
	// reasonMD5Differs: the local and remote MD5 hashes differ.
	reasonMD5Differs uploadReason = "md5 differs"
	// reasonMD5Missing: no MD5 hash is available for the remote file.
	reasonMD5Missing uploadReason = "remote md5 missing"
)
// fileToUpload represents a single local file that should be uploaded to
// the target.
type fileToUpload struct {
Local * localFile
Reason uploadReason
}
func ( u * fileToUpload ) String ( ) string {
details := [ ] string { humanize . Bytes ( uint64 ( u . Local . UploadSize ) ) }
if s := u . Local . CacheControl ( ) ; s != "" {
details = append ( details , fmt . Sprintf ( "Cache-Control: %q" , s ) )
}
if s := u . Local . ContentEncoding ( ) ; s != "" {
details = append ( details , fmt . Sprintf ( "Content-Encoding: %q" , s ) )
}
if s := u . Local . ContentType ( ) ; s != "" {
details = append ( details , fmt . Sprintf ( "Content-Type: %q" , s ) )
}
2019-05-03 23:38:05 +00:00
return fmt . Sprintf ( "%s (%s): %v" , u . Local . SlashPath , strings . Join ( details , ", " ) , u . Reason )
2018-12-20 19:22:03 +00:00
}
// findDiffs diffs localFiles vs remoteFiles to see what changes should be
// applied to the remote target. It returns a slice of *fileToUpload and a
// slice of paths for files to delete.
2023-06-16 06:17:42 +00:00
func ( d * Deployer ) findDiffs ( localFiles map [ string ] * localFile , remoteFiles map [ string ] * blob . ListObject , force bool ) ( [ ] * fileToUpload , [ ] string ) {
2018-12-20 19:22:03 +00:00
var uploads [ ] * fileToUpload
var deletes [ ] string
found := map [ string ] bool { }
for path , lf := range localFiles {
upload := false
reason := reasonUnknown
if remoteFile , ok := remoteFiles [ path ] ; ok {
// The file exists in remote. Let's see if we need to upload it anyway.
// TODO: We don't register a diff if the metadata (e.g., Content-Type
// header) has changed. This would be difficult/expensive to detect; some
// providers return metadata along with their "List" result, but others
// (notably AWS S3) do not, so gocloud.dev's blob.Bucket doesn't expose
// it in the list result. It would require a separate request per blob
// to fetch. At least for now, we work around this by documenting it and
// providing a "force" flag (to re-upload everything) and a "force" bool
// per matcher (to re-upload all files in a matcher whose headers may have
// changed).
// Idea: extract a sample set of 1 file per extension + 1 file per matcher
// and check those files?
if force {
upload = true
reason = reasonForce
} else if lf . Force ( ) {
upload = true
reason = reasonForce
} else if lf . UploadSize != remoteFile . Size {
upload = true
reason = reasonSize
} else if len ( remoteFile . MD5 ) == 0 {
2019-05-03 20:28:35 +00:00
// This shouldn't happen unless the remote didn't give us an MD5 hash
// from List, AND we failed to compute one by reading the remote file.
// Default to considering the files different.
2018-12-20 19:22:03 +00:00
upload = true
reason = reasonMD5Missing
} else if ! bytes . Equal ( lf . MD5 ( ) , remoteFile . MD5 ) {
upload = true
reason = reasonMD5Differs
}
found [ path ] = true
} else {
// The file doesn't exist in remote.
upload = true
reason = reasonNotFound
}
if upload {
2023-06-16 06:17:42 +00:00
d . logger . Debugf ( "%s needs to be uploaded: %v\n" , path , reason )
2018-12-20 19:22:03 +00:00
uploads = append ( uploads , & fileToUpload { lf , reason } )
} else {
2023-06-16 06:17:42 +00:00
d . logger . Debugf ( "%s exists at target and does not need to be uploaded" , path )
2018-12-20 19:22:03 +00:00
}
}
// Remote files that weren't found locally should be deleted.
for path := range remoteFiles {
if ! found [ path ] {
deletes = append ( deletes , path )
}
}
return uploads , deletes
}
2019-05-03 16:30:46 +00:00
// applyOrdering returns an ordered slice of slices of uploads.
//
// The returned slice will have length len(ordering)+1.
//
// The subslice at index i, for i = 0 ... len(ordering)-1, will have all of the
2019-05-03 23:38:05 +00:00
// uploads whose Local.SlashPath matched the regex at ordering[i] (but not any
2019-05-03 16:30:46 +00:00
// previous ordering regex).
// The subslice at index len(ordering) will have the remaining uploads that
// didn't match any ordering regex.
//
2019-05-03 23:38:05 +00:00
// The subslices are sorted by Local.SlashPath.
2019-05-03 16:30:46 +00:00
func applyOrdering ( ordering [ ] * regexp . Regexp , uploads [ ] * fileToUpload ) [ ] [ ] * fileToUpload {
2019-05-03 23:38:05 +00:00
// Sort the whole slice by Local.SlashPath first.
sort . Slice ( uploads , func ( i , j int ) bool { return uploads [ i ] . Local . SlashPath < uploads [ j ] . Local . SlashPath } )
2019-05-03 16:30:46 +00:00
retval := make ( [ ] [ ] * fileToUpload , len ( ordering ) + 1 )
for _ , u := range uploads {
matched := false
for i , re := range ordering {
2019-05-03 23:38:05 +00:00
if re . MatchString ( u . Local . SlashPath ) {
2019-05-03 16:30:46 +00:00
retval [ i ] = append ( retval [ i ] , u )
matched = true
break
}
}
if ! matched {
retval [ len ( ordering ) ] = append ( retval [ len ( ordering ) ] , u )
}
}
return retval
}