Collect HTML elements during the build to use in PurgeCSS etc.

The main use case for this is to use with resources.PostProcess and resources.PostCSS with purgecss.

You would normally set it up to extract keywords from your templates, because doing it from the full /public takes forever for bigger sites.

Doing the template thing misses dynamically created class names etc., and it's hard/impossible to set up when using themes.

You can enable this in your site config:

```toml
[build]
  writeStats = true
```

It will then write a `hugo_stats.json` file to the project root as part of the build.

If you're only using this for the production build, you should consider putting it below `config/production`.

You can then set it up with PostCSS like this:

```js
const purgecss = require('@fullhuman/postcss-purgecss')({
    content: [ './hugo_stats.json' ],
    defaultExtractor: (content) => {
        let els = JSON.parse(content).htmlElements;
        return els.tags.concat(els.classes, els.ids);
    }
});

module.exports = {
    plugins: [
        require('tailwindcss'),
        require('autoprefixer'),
        ...(process.env.HUGO_ENVIRONMENT === 'production' ? [ purgecss ] : [])
    ]
};
```

Fixes #6999
This commit is contained in:
Bjørn Erik Pedersen 2020-03-03 12:25:03 +01:00
parent 7791a804e2
commit 095bf64c99
No known key found for this signature in database
GPG key ID: 330E6E2BD4859D8F
10 changed files with 501 additions and 29 deletions

1
.gitignore vendored
View file

@ -20,6 +20,7 @@ dock.sh
GoBuilds GoBuilds
dist dist
hugolib/hugo_stats.json
resources/sunset.jpg resources/sunset.jpg
vendor vendor

View file

@ -29,11 +29,16 @@ import (
var DefaultBuild = Build{ var DefaultBuild = Build{
UseResourceCacheWhen: "fallback", UseResourceCacheWhen: "fallback",
WriteStats: false,
} }
// Build holds some build related configuration. // Build holds some build related configuration.
type Build struct { type Build struct {
UseResourceCacheWhen string // never, fallback, always. Default is fallback UseResourceCacheWhen string // never, fallback, always. Default is fallback
// When enabled, will collect and write a hugo_stats.json with some build
// related aggregated data (e.g. CSS class names).
WriteStats bool
} }
func (b Build) UseResourceCache(err error) bool { func (b Build) UseResourceCache(err error) bool {

2
go.mod
View file

@ -55,7 +55,7 @@ require (
go.opencensus.io v0.22.0 // indirect go.opencensus.io v0.22.0 // indirect
gocloud.dev v0.15.0 gocloud.dev v0.15.0
golang.org/x/image v0.0.0-20191214001246-9130b4cfad52 golang.org/x/image v0.0.0-20191214001246-9130b4cfad52
golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553 // indirect golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553
golang.org/x/oauth2 v0.0.0-20190523182746-aaccbc9213b0 // indirect golang.org/x/oauth2 v0.0.0-20190523182746-aaccbc9213b0 // indirect
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e
golang.org/x/sys v0.0.0-20200107144601-ef85f5a75ddf // indirect golang.org/x/sys v0.0.0-20200107144601-ef85f5a75ddf // indirect

View file

@ -408,7 +408,11 @@ func applyDeps(cfg deps.DepsCfg, sites ...*Site) error {
s.Deps = d s.Deps = d
// Set up the main publishing chain. // Set up the main publishing chain.
pub, err := publisher.NewDestinationPublisher(d.PathSpec.BaseFs.PublishFs, s.outputFormatsConfig, s.mediaTypesConfig, cfg.Cfg) pub, err := publisher.NewDestinationPublisher(
d.ResourceSpec,
s.outputFormatsConfig,
s.mediaTypesConfig,
)
if err != nil { if err != nil {
return err return err

View file

@ -16,11 +16,17 @@ package hugolib
import ( import (
"bytes" "bytes"
"context" "context"
"encoding/json"
"fmt" "fmt"
"os" "os"
"path/filepath"
"runtime/trace" "runtime/trace"
"strings" "strings"
"github.com/gohugoio/hugo/publisher"
"github.com/gohugoio/hugo/hugofs"
"github.com/gohugoio/hugo/common/para" "github.com/gohugoio/hugo/common/para"
"github.com/gohugoio/hugo/config" "github.com/gohugoio/hugo/config"
"github.com/gohugoio/hugo/resources/postpub" "github.com/gohugoio/hugo/resources/postpub"
@ -146,10 +152,10 @@ func (h *HugoSites) Build(config BuildCfg, events ...fsnotify.Event) error {
if err != nil { if err != nil {
h.SendError(err) h.SendError(err)
} }
}
if err := h.postProcess(); err != nil { if err = h.postProcess(); err != nil {
h.SendError(err) h.SendError(err)
}
} }
if h.Metrics != nil { if h.Metrics != nil {
@ -337,6 +343,12 @@ func (h *HugoSites) render(config *BuildCfg) error {
} }
func (h *HugoSites) postProcess() error { func (h *HugoSites) postProcess() error {
// Make sure to write any build stats to disk first so it's available
// to the post processors.
if err := h.writeBuildStats(); err != nil {
return err
}
var toPostProcess []resource.OriginProvider var toPostProcess []resource.OriginProvider
for _, s := range h.Sites { for _, s := range h.Sites {
for _, v := range s.ResourceSpec.PostProcessResources { for _, v := range s.ResourceSpec.PostProcessResources {
@ -422,3 +434,47 @@ func (h *HugoSites) postProcess() error {
return g.Wait() return g.Wait()
} }
// publishStats describes a JSON document with a single "cssClasses" string.
// NOTE(review): nothing in the visible code references this type;
// writeBuildStats marshals publisher.PublishStats instead. Confirm whether
// this declaration is still needed.
type publishStats struct {
CSSClasses string `json:"cssClasses"`
}
// writeBuildStats merges the HTML element stats gathered by every site's
// publisher and writes them as JSON to hugo_stats.json in the working
// directory. It is a no-op unless WriteStats is enabled in the build config.
func (h *HugoSites) writeBuildStats() error {
	if !h.ResourceSpec.BuildConfig.WriteStats {
		return nil
	}

	// Fold the per-site stats into one set, then sort it.
	var merged publisher.HTMLElements
	for _, site := range h.Sites {
		merged.Merge(site.publisher.PublishStats().HTMLElements)
	}
	merged.Sort()

	payload, err := json.MarshalIndent(publisher.PublishStats{HTMLElements: merged}, "", " ")
	if err != nil {
		return err
	}

	target := filepath.Join(h.WorkingDir, "hugo_stats.json")

	// The file always goes to the real OS file system.
	if err := afero.WriteFile(hugofs.Os, target, payload, 0666); err != nil {
		return err
	}

	// When the source fs is not the OS fs (e.g. a mem fs is in play), write a
	// copy to the destination fs as well.
	if h.Fs.Source == hugofs.Os {
		return nil
	}
	return afero.WriteFile(h.Fs.Destination, target, payload, 0666)
}

View file

@ -980,3 +980,47 @@ func TestRefIssues(t *testing.T) {
b.AssertFileContent("public/post/nested-a/content-a/index.html", `Content: http://example.com/post/nested-b/content-b/`) b.AssertFileContent("public/post/nested-a/content-a/index.html", `Content: http://example.com/post/nested-b/content-b/`)
} }
// TestClassCollector builds a small site with writeStats enabled and checks
// that hugo_stats.json lists the tags, classes and ids found in the rendered
// template, with duplicates (class "c") reported once and each list sorted.
func TestClassCollector(t *testing.T) {
b := newTestSitesBuilder(t)
b.WithConfigFile("toml", `
[build]
writeStats = true
`)
b.WithTemplates("index.html", `
<div id="el1" class="a b c">Foo</div>
Some text.
<div class="c d e" id="el2">Foo</div>
`)
b.WithContent("p1.md", "")
b.Build(BuildCfg{})
b.AssertFileContent("hugo_stats.json", `
{
"htmlElements": {
"tags": [
"div"
],
"classes": [
"a",
"b",
"c",
"d",
"e"
],
"ids": [
"el1",
"el2"
]
}
}
`)
}

View file

@ -0,0 +1,268 @@
// Copyright 2020 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package publisher
import (
"github.com/gohugoio/hugo/helpers"
"golang.org/x/net/html"
yaml "gopkg.in/yaml.v2"
"bytes"
"sort"
"strings"
"sync"
)
// newHTMLElementsCollector returns an empty collector whose element set is
// initialised and ready for writes.
func newHTMLElementsCollector() *htmlElementsCollector {
	c := new(htmlElementsCollector)
	c.elementSet = make(map[string]bool)
	return c
}
// newHTMLElementsCollectorWriter returns a writer that reports the HTML
// elements it sees to the given collector.
func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *cssClassCollectorWriter {
	w := new(cssClassCollectorWriter)
	w.collector = collector
	return w
}
// HTMLElements holds lists of tag names and of the attribute values found for
// classes and ids. It is serialised to JSON under the keys "tags", "classes"
// and "ids".
type HTMLElements struct {
// Tags lists element names, e.g. "div".
Tags []string `json:"tags"`
// Classes lists individual CSS class names.
Classes []string `json:"classes"`
// IDs lists id attribute values.
IDs []string `json:"ids"`
}
// Merge appends the tags, classes and ids of other to h and passes each
// combined list through helpers.UniqueStringsReuse (presumably in-place
// de-duplication; see that helper for the exact contract).
func (h *HTMLElements) Merge(other HTMLElements) {
	h.Tags = helpers.UniqueStringsReuse(append(h.Tags, other.Tags...))
	h.Classes = helpers.UniqueStringsReuse(append(h.Classes, other.Classes...))
	h.IDs = helpers.UniqueStringsReuse(append(h.IDs, other.IDs...))
}
// Sort orders the tags, classes and ids lists in place, each in increasing
// string order.
func (h *HTMLElements) Sort() {
	for _, list := range [][]string{h.Tags, h.Classes, h.IDs} {
		sort.Strings(list)
	}
}
// cssClassCollectorWriter is an io.Writer-style scanner that picks start tags
// out of the bytes written to it and hands them to a shared collector. Its
// scanning state is kept in the fields below, so a tag may span several
// Write calls.
type cssClassCollectorWriter struct {
// collector receives the parsed elements.
collector *htmlElementsCollector
// buff accumulates the bytes of the tag currently being collected.
buff bytes.Buffer
// isCollecting is true from a '<' until the tag is ended or dropped.
isCollecting bool
// dropValue is set when the last collected tag should be discarded.
dropValue bool
// inQuote is true while the scanner is inside a quoted attribute value.
inQuote bool
}
// Write scans p for HTML start tags and registers each distinct one with the
// collector. It always reports len(p) bytes written and a nil error.
//
// Scanning state lives on the writer, so a tag split across several Write
// calls is still collected. The raw tag text is used as a cache key so that a
// tag seen before is not parsed again.
//
// NOTE(review): any '/' outside quotes ends collection and drops the tag.
// That skips end tags as intended, but it also drops self-closing tags
// (`<img class="x" />`) and tags with unquoted attribute values containing
// '/'. Also, either quote character toggles the quote state, so an apostrophe
// inside a double-quoted value ends quote mode early. Both are left as they
// are here.
func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
	n = len(p)
	i := 0

	for i < len(p) {
		if !w.isCollecting {
			// Skip ahead to the next tag opener. i is left pointing at '<'
			// so that it becomes the first byte collected below.
			for ; i < len(p); i++ {
				b := p[i]
				if b == '<' {
					w.startCollecting()
					break
				}
			}
		}

		if w.isCollecting {
			for ; i < len(p); i++ {
				b := p[i]
				if !w.inQuote && b == '/' {
					// End element, we don't care about those.
					w.endCollecting(true)
					break
				}
				w.toggleIfQuote(b)
				if !w.inQuote && b == '>' {
					// The closing '>' itself is not stored in the buffer.
					w.endCollecting(false)
					break
				}
				w.buff.WriteByte(b)
			}

			if !w.isCollecting {
				if w.dropValue {
					w.buff.Reset()
				} else {
					// First check if we have processed this element before.
					w.collector.mu.RLock()

					// See https://github.com/dominikh/go-tools/issues/723
					//lint:ignore S1030 This construct avoids memory allocation for the string.
					seen := w.collector.elementSet[string(w.buff.Bytes())]
					w.collector.mu.RUnlock()
					if seen {
						w.buff.Reset()
						continue
					}

					s := w.buff.String()
					w.buff.Reset()

					// Parse outside the lock; this is the expensive part.
					el := parseHTMLElement(s)

					w.collector.mu.Lock()
					// Re-check under the write lock: another writer sharing
					// this collector may have registered the same tag between
					// the RUnlock above and this Lock. Without the re-check
					// the element would be appended twice.
					if !w.collector.elementSet[s] {
						w.collector.elementSet[s] = true
						if el.Tag != "" {
							w.collector.elements = append(w.collector.elements, el)
						}
					}
					w.collector.mu.Unlock()
				}
			}
		}
	}

	return n, nil
}
// endCollecting leaves collecting mode, clears the quote state and records
// whether the bytes gathered so far should be dropped.
func (w *cssClassCollectorWriter) endCollecting(drop bool) {
	w.dropValue = drop
	w.inQuote = false
	w.isCollecting = false
}
// startCollecting enters collecting mode and clears the drop flag left by a
// previous tag.
func (w *cssClassCollectorWriter) startCollecting() {
	w.dropValue = false
	w.isCollecting = true
}
// toggleIfQuote flips the in-quote state when b is a quote character and
// leaves it unchanged otherwise.
func (w *cssClassCollectorWriter) toggleIfQuote(b byte) {
	if !isQuote(b) {
		return
	}
	w.inQuote = !w.inQuote
}
// htmlElement is the result of parsing one start tag: the tag name plus the
// class names and id values found in its attributes.
type htmlElement struct {
// Tag is the element name; it is empty when no element was recognised.
Tag string
// Classes holds the individual class names found on the element.
Classes []string
// IDs holds the id attribute values found on the element.
IDs []string
}
// htmlElementsCollector accumulates the elements reported by one or more
// cssClassCollectorWriter values.
type htmlElementsCollector struct {
// Contains the raw HTML string. We will get the same element
// several times, and want to avoid costly reparsing when this
// is used for aggregated data only.
elementSet map[string]bool
// elements holds the parsed elements, appended as new raw strings are seen.
elements []htmlElement
// mu is held by the writer while it reads elementSet and while it updates
// elementSet and elements.
mu sync.RWMutex
}
// getHTMLElements flattens the collected elements into three lists (tags,
// classes and ids), each passed through helpers.UniqueStringsSorted.
//
// The elements slice is read under the collector's read lock, because writers
// append to it under the same mutex.
func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
	var (
		classes []string
		ids     []string
		tags    []string
	)

	c.mu.RLock()
	for _, el := range c.elements {
		classes = append(classes, el.Classes...)
		ids = append(ids, el.IDs...)
		tags = append(tags, el.Tag)
	}
	c.mu.RUnlock()

	return HTMLElements{
		Classes: helpers.UniqueStringsSorted(classes),
		IDs:     helpers.UniqueStringsSorted(ids),
		Tags:    helpers.UniqueStringsSorted(tags),
	}
}
// isQuote reports whether b is a double or single quote character.
func isQuote(b byte) bool {
	switch b {
	case '"', '\'':
		return true
	default:
		return false
	}
}
// htmlJsonFixer replaces every ", " with a newline. parseHTMLElement applies
// it to class-binding values so that comma-separated entries end up one per
// line before the value is parsed as YAML.
var htmlJsonFixer = strings.NewReplacer(", ", "\n")
// parseHTMLElement parses the text of a single start tag and returns its tag
// name, class names and ids. A missing closing '>' is added before parsing.
// If the HTML parser fails, the zero htmlElement is returned.
//
// Besides plain id and class attributes, any attribute whose name contains
// "class" and whose value starts with "{" is treated as a Vue/AlpineJS style
// class binding, and the object keys are collected as class names.
func parseHTMLElement(elStr string) (el htmlElement) {
	elStr = strings.TrimSpace(elStr)
	if !strings.HasSuffix(elStr, ">") {
		elStr += ">"
	}

	root, err := html.Parse(strings.NewReader(elStr))
	if err != nil {
		return
	}

	var visit func(*html.Node)
	visit = func(node *html.Node) {
		// Only accept element nodes whose name occurs in the input text.
		if node.Type == html.ElementNode && strings.Contains(elStr, node.Data) {
			el.Tag = node.Data

			for _, attr := range node.Attr {
				if strings.EqualFold(attr.Key, "id") {
					// Normally a single id, but collect whatever is there.
					el.IDs = append(el.IDs, attr.Val)
					continue
				}
				if strings.EqualFold(attr.Key, "class") {
					el.Classes = append(el.Classes, strings.Fields(attr.Val)...)
					continue
				}

				name := strings.ToLower(attr.Key)
				binding := strings.TrimSpace(attr.Val)
				if !strings.Contains(name, "class") || !strings.HasPrefix(binding, "{") {
					continue
				}

				// Looks like a class binding. The value is most likely a JS
				// object literal, but the YAML parser copes better with that
				// kind of loose input, so reshape it into one "key: value"
				// entry per line with no leading whitespace.
				binding = htmlJsonFixer.Replace(strings.Trim(binding, "{}"))
				parts := strings.Split(binding, "\n")
				for idx := range parts {
					parts[idx] = strings.TrimSpace(parts[idx])
				}
				binding = strings.Join(parts, "\n")

				parsed := make(map[string]interface{})
				if yamlErr := yaml.Unmarshal([]byte(binding), &parsed); yamlErr != nil {
					// Fall back to the raw words. The result feeds CSS
					// pruning, so keeping too much is safer than dropping a
					// possible class name.
					el.Classes = append(el.Classes, strings.Fields(binding)...)
					continue
				}
				for key := range parsed {
					el.Classes = append(el.Classes, strings.Fields(key)...)
				}
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			visit(child)
		}
	}

	visit(root)

	return
}

View file

@ -0,0 +1,81 @@
// Copyright 2020 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package publisher
import (
"fmt"
"strings"
"testing"
qt "github.com/frankban/quicktest"
)
// TestClassCollector feeds HTML snippets through a collector writer and
// checks the tags, classes and ids reported by the collector afterwards.
func TestClassCollector(t *testing.T) {
c := qt.New((t))
// f builds the expected HTMLElements from space-separated lists; an empty
// string leaves the corresponding slice nil.
f := func(tags, classes, ids string) HTMLElements {
var tagss, classess, idss []string
if tags != "" {
tagss = strings.Split(tags, " ")
}
if classes != "" {
classess = strings.Split(classes, " ")
}
if ids != "" {
idss = strings.Split(ids, " ")
}
return HTMLElements{
Tags: tagss,
Classes: classess,
IDs: idss,
}
}
// Each case is a name, the HTML input, and the expected result.
for _, test := range []struct {
name string
html string
expect HTMLElements
}{
{"basic", `<body class="b a"></body>`, f("body", "a b", "")},
{"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")},
{"single quote", `<body class='b a'></body>`, f("body", "a b", "")},
{"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")},
{"AlpineJS bind 1", `<body>
<div x-bind:class="{
'class1': data.open,
'class2 class3': data.foo == 'bar'
}">
</div>
</body>`, f("body div", "class1 class2 class3", "")},
{"Alpine bind 2", `<div x-bind:class="{ 'bg-black': filter.checked }"
class="inline-block mr-1 mb-2 rounded bg-gray-300 px-2 py-2">FOO</div>`,
f("div", "bg-black bg-gray-300 inline-block mb-2 mr-1 px-2 py-2 rounded", "")},
{"Alpine bind 3", `<div x-bind:class="{ 'text-gray-800': !checked, 'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")},
{"Alpine bind 4", `<div x-bind:class="{ 'text-gray-800': !checked,
'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")},
{"Vue bind", `<div v-bind:class="{ active: isActive }"></div>`, f("div", "active", "")},
} {
c.Run(test.name, func(c *qt.C) {
// A fresh collector per case keeps the cases independent.
w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
fmt.Fprint(w, test.html)
got := w.collector.getHTMLElements()
c.Assert(got, qt.DeepEquals, test.expect)
})
}
}

View file

@ -1,4 +1,4 @@
// Copyright 2019 The Hugo Authors. All rights reserved. // Copyright 2020 The Hugo Authors. All rights reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
@ -18,7 +18,8 @@ import (
"io" "io"
"sync/atomic" "sync/atomic"
"github.com/gohugoio/hugo/config" "github.com/gohugoio/hugo/resources"
"github.com/gohugoio/hugo/media" "github.com/gohugoio/hugo/media"
"github.com/gohugoio/hugo/minifiers" "github.com/gohugoio/hugo/minifiers"
@ -68,17 +69,21 @@ type Descriptor struct {
// DestinationPublisher is the default and currently only publisher in Hugo. This // DestinationPublisher is the default and currently only publisher in Hugo. This
// publisher prepares and publishes an item to the defined destination, e.g. /public. // publisher prepares and publishes an item to the defined destination, e.g. /public.
type DestinationPublisher struct { type DestinationPublisher struct {
fs afero.Fs fs afero.Fs
min minifiers.Client min minifiers.Client
htmlElementsCollector *htmlElementsCollector
} }
// NewDestinationPublisher creates a new DestinationPublisher. // NewDestinationPublisher creates a new DestinationPublisher.
func NewDestinationPublisher(fs afero.Fs, outputFormats output.Formats, mediaTypes media.Types, cfg config.Provider) (pub DestinationPublisher, err error) { func NewDestinationPublisher(rs *resources.Spec, outputFormats output.Formats, mediaTypes media.Types) (pub DestinationPublisher, err error) {
pub = DestinationPublisher{fs: fs} fs := rs.BaseFs.PublishFs
pub.min, err = minifiers.New(mediaTypes, outputFormats, cfg) cfg := rs.Cfg
if err != nil { var classCollector *htmlElementsCollector
return if rs.BuildConfig.WriteStats {
classCollector = newHTMLElementsCollector()
} }
pub = DestinationPublisher{fs: fs, htmlElementsCollector: classCollector}
pub.min, err = minifiers.New(mediaTypes, outputFormats, cfg)
return return
} }
@ -111,16 +116,38 @@ func (p DestinationPublisher) Publish(d Descriptor) error {
} }
defer f.Close() defer f.Close()
_, err = io.Copy(f, src) var w io.Writer = f
if p.htmlElementsCollector != nil && d.OutputFormat.IsHTML {
w = io.MultiWriter(w, newHTMLElementsCollectorWriter(p.htmlElementsCollector))
}
_, err = io.Copy(w, src)
if err == nil && d.StatCounter != nil { if err == nil && d.StatCounter != nil {
atomic.AddUint64(d.StatCounter, uint64(1)) atomic.AddUint64(d.StatCounter, uint64(1))
} }
return err return err
} }
// PublishStats returns the HTML element stats gathered so far. The result is
// the zero PublishStats when no collector is configured.
func (p DestinationPublisher) PublishStats() PublishStats {
	var stats PublishStats
	if collector := p.htmlElementsCollector; collector != nil {
		stats.HTMLElements = collector.getHTMLElements()
	}
	return stats
}
// PublishStats holds aggregated data collected while publishing. It is
// serialised to JSON with the HTML elements under the "htmlElements" key.
type PublishStats struct {
// HTMLElements lists the tags, classes and ids collected.
HTMLElements HTMLElements `json:"htmlElements"`
}
// Publisher publishes a result file. // Publisher publishes a result file.
type Publisher interface { type Publisher interface {
Publish(d Descriptor) error Publish(d Descriptor) error
PublishStats() PublishStats
} }
// XML transformer := transform.New(urlreplacers.NewAbsURLInXMLTransformer(path)) // XML transformer := transform.New(urlreplacers.NewAbsURLInXMLTransformer(path))

View file

@ -1,14 +0,0 @@
// Copyright 2018 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package publisher