From 3b4f17bbc9ff789faa581ac278ad109d1ac5b816 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?= Date: Sat, 19 Aug 2017 13:16:00 +0200 Subject: [PATCH] hugolib: Implement "related content" This closes #98, even if this commit does not do full content text search. We may revisit that problem in the future, but that deserves its own issue. Fixes #98 --- common/types/types.go | 44 ++++ common/types/types_test.go | 29 +++ hugolib/page.go | 42 +++ hugolib/pageCache.go | 10 +- hugolib/pageCache_test.go | 4 +- hugolib/pageGroup.go | 4 +- hugolib/pageSort_test.go | 2 +- hugolib/pages_related.go | 191 ++++++++++++++ hugolib/pages_related_test.go | 75 ++++++ hugolib/site.go | 21 ++ related/inverted_index.go | 450 +++++++++++++++++++++++++++++++++ related/inverted_index_test.go | 276 ++++++++++++++++++++ tpl/collections/collections.go | 6 + tpl/collections/init.go | 7 + 14 files changed, 1151 insertions(+), 10 deletions(-) create mode 100644 common/types/types.go create mode 100644 common/types/types_test.go create mode 100644 hugolib/pages_related.go create mode 100644 hugolib/pages_related_test.go create mode 100644 related/inverted_index.go create mode 100644 related/inverted_index_test.go diff --git a/common/types/types.go b/common/types/types.go new file mode 100644 index 000000000..291bf6cf3 --- /dev/null +++ b/common/types/types.go @@ -0,0 +1,44 @@ +// Copyright 2017-present The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package types contains types shared between packages in Hugo. +package types + +import ( + "fmt" + + "github.com/spf13/cast" +) + +// KeyValues holds an key and a slice of values. +type KeyValues struct { + Key interface{} + Values []interface{} +} + +// KeyString returns the key as a string, an empty string if conversion fails. +func (k KeyValues) KeyString() string { + return cast.ToString(k.Key) +} + +func (k KeyValues) String() string { + return fmt.Sprintf("%v: %v", k.Key, k.Values) +} + +func NewKeyValuesStrings(key string, values ...string) KeyValues { + iv := make([]interface{}, len(values)) + for i := 0; i < len(values); i++ { + iv[i] = values[i] + } + return KeyValues{Key: key, Values: iv} +} diff --git a/common/types/types_test.go b/common/types/types_test.go new file mode 100644 index 000000000..7cec8c0c0 --- /dev/null +++ b/common/types/types_test.go @@ -0,0 +1,29 @@ +// Copyright 2017-present The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package types + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestKeyValues(t *testing.T) { + assert := require.New(t) + + kv := NewKeyValuesStrings("key", "a1", "a2") + + assert.Equal("key", kv.KeyString()) + assert.Equal([]interface{}{"a1", "a2"}, kv.Values) +} diff --git a/hugolib/page.go b/hugolib/page.go index c29590802..a723cabb2 100644 --- a/hugolib/page.go +++ b/hugolib/page.go @@ -20,6 +20,8 @@ import ( "reflect" "unicode" + "github.com/gohugoio/hugo/related" + "github.com/bep/gitmap" "github.com/gohugoio/hugo/helpers" @@ -54,6 +56,9 @@ var ( // Assert that it implements the Eqer interface. _ compare.Eqer = (*Page)(nil) _ compare.Eqer = (*PageOutput)(nil) + + // Assert that it implements the interface needed for related searches. + _ related.Document = (*Page)(nil) ) const ( @@ -231,6 +236,28 @@ type Page struct { targetPathDescriptorPrototype *targetPathDescriptor } +// SearchKeywords implements the related.Document interface needed for fast page searches. +func (p *Page) SearchKeywords(cfg related.IndexConfig) ([]related.Keyword, error) { + + v, err := p.Param(cfg.Name) + if err != nil { + return nil, err + } + + return cfg.ToKeywords(v) +} + +// PubDate is when this page was or will be published. +// NOTE: This is currently used for search only and is not meant to be used +// directly in templates. We need to consolidate the dates in this struct. +// TODO(bep) see https://github.com/gohugoio/hugo/issues/3854 +func (p *Page) PubDate() time.Time { + if !p.PublishDate.IsZero() { + return p.PublishDate + } + return p.Date +} + func (p *Page) RSSLink() template.URL { f, found := p.outputFormats.GetByName(output.RSSFormat.Name) if !found { @@ -329,6 +356,21 @@ func (ps Pages) findPagePosByFilePath(inPath string) int { return -1 } +func (ps Pages) removeFirstIfFound(p *Page) Pages { + ii := -1 + for i, pp := range ps { + if pp == p { + ii = i + break + } + } + + if ii != -1 { + ps = append(ps[:ii], ps[ii+1:]...) + } + return ps +} + func (ps Pages) findFirstPagePosByFilePathPrefix(prefix string) int { if prefix == "" { return -1 diff --git a/hugolib/pageCache.go b/hugolib/pageCache.go index e0a3a160b..df381c679 100644 --- a/hugolib/pageCache.go +++ b/hugolib/pageCache.go @@ -36,7 +36,7 @@ func (c *pageCache) get(key string, p Pages, apply func(p Pages)) (Pages, bool) c.RLock() if cached, ok := c.m[key]; ok { for _, ps := range cached { - if probablyEqualPages(p, ps[0]) { + if fastEqualPages(p, ps[0]) { c.RUnlock() return ps[1], true } @@ -51,7 +51,7 @@ func (c *pageCache) get(key string, p Pages, apply func(p Pages)) (Pages, bool) // double-check if cached, ok := c.m[key]; ok { for _, ps := range cached { - if probablyEqualPages(p, ps[0]) { + if fastEqualPages(p, ps[0]) { return ps[1], true } } @@ -73,10 +73,10 @@ func (c *pageCache) get(key string, p Pages, apply func(p Pages)) (Pages, bool) } -// "probably" as in: we do not compare every element for big slices, but that is -// good enough for our use case. +// "fast" as in: we do not compare every element for big slices, but that is +// good enough for our use cases. // TODO(bep) there is a similar method in pagination.go. DRY. 
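
The two helpers added above are small but load-bearing: Page.PubDate gives the index a single date to filter on (preferring PublishDate over Date), and Pages.removeFirstIfFound keeps the page being searched from out of its own result set. A minimal standalone sketch of the same two idioms, with a hypothetical doc type standing in for *Page (illustration only, not part of the patch):

    package main

    import (
    	"fmt"
    	"time"
    )

    // doc is a stand-in for *Page (hypothetical, for illustration).
    type doc struct {
    	date        time.Time
    	publishDate time.Time
    }

    // pubDate mirrors Page.PubDate: prefer PublishDate, fall back to Date.
    func (d *doc) pubDate() time.Time {
    	if !d.publishDate.IsZero() {
    		return d.publishDate
    	}
    	return d.date
    }

    // removeFirst mirrors Pages.removeFirstIfFound: drop the first pointer-equal
    // match so a page never lists itself as related content.
    func removeFirst(docs []*doc, target *doc) []*doc {
    	for i, d := range docs {
    		if d == target {
    			return append(docs[:i], docs[i+1:]...)
    		}
    	}
    	return docs
    }

    func main() {
    	a := &doc{date: time.Now()}
    	b := &doc{publishDate: time.Now()}
    	fmt.Println(len(removeFirst([]*doc{a, b}, a))) // 1
    	fmt.Println(b.pubDate() == b.publishDate)      // true
    }
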
-func probablyEqualPages(p1, p2 Pages) bool { +func fastEqualPages(p1, p2 Pages) bool { if p1 == nil && p2 == nil { return true } diff --git a/hugolib/pageCache_test.go b/hugolib/pageCache_test.go index 62837394f..aa2adf6e8 100644 --- a/hugolib/pageCache_test.go +++ b/hugolib/pageCache_test.go @@ -56,8 +56,8 @@ func TestPageCache(t *testing.T) { l1.Unlock() p2, c2 := c1.get("k1", p, nil) assert.True(t, c2) - assert.True(t, probablyEqualPages(p, p2)) - assert.True(t, probablyEqualPages(p, pages)) + assert.True(t, fastEqualPages(p, p2)) + assert.True(t, fastEqualPages(p, pages)) assert.NotNil(t, p) l2.Lock() diff --git a/hugolib/pageGroup.go b/hugolib/pageGroup.go index 343ecf52e..3ccd35a06 100644 --- a/hugolib/pageGroup.go +++ b/hugolib/pageGroup.go @@ -24,8 +24,8 @@ import ( // PageGroup represents a group of pages, grouped by the key. // The key is typically a year or similar. type PageGroup struct { - Key interface{} - Pages Pages + Key interface{} + Pages } type mapKeyValues []reflect.Value diff --git a/hugolib/pageSort_test.go b/hugolib/pageSort_test.go index a17f53dc6..6379dccbe 100644 --- a/hugolib/pageSort_test.go +++ b/hugolib/pageSort_test.go @@ -115,7 +115,7 @@ func TestPageSortReverse(t *testing.T) { assert.Equal(t, 9, p2[0].fuzzyWordCount) assert.Equal(t, 0, p2[9].fuzzyWordCount) // cached - assert.True(t, probablyEqualPages(p2, p1.Reverse())) + assert.True(t, fastEqualPages(p2, p1.Reverse())) } func TestPageSortByParam(t *testing.T) { diff --git a/hugolib/pages_related.go b/hugolib/pages_related.go new file mode 100644 index 000000000..858ad0d11 --- /dev/null +++ b/hugolib/pages_related.go @@ -0,0 +1,191 @@ +// Copyright 2017-present The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hugolib + +import ( + "sync" + + "github.com/gohugoio/hugo/common/types" + "github.com/gohugoio/hugo/related" + "github.com/spf13/cast" +) + +var ( + // Assert that Pages and PageGroup implements the PageGenealogist interface. + _ PageGenealogist = (Pages)(nil) + _ PageGenealogist = PageGroup{} +) + +// A PageGenealogist finds related pages in a page collection. This interface is implemented +// by Pages and PageGroup, which makes it available as `{{ .RegularPages.Related . }}` etc. +type PageGenealogist interface { + + // Template example: + // {{ $related := .RegularPages.Related . }} + Related(doc related.Document) (Pages, error) + + // Template example: + // {{ $related := .RegularPages.RelatedIndices . "tags" "date" }} + RelatedIndices(doc related.Document, indices ...interface{}) (Pages, error) + + // Template example: + // {{ $related := .RegularPages.RelatedTo ( keyVals "tags" "hugo", "rocks") ( keyVals "date" .Date ) }} + RelatedTo(args ...types.KeyValues) (Pages, error) +} + +// Related searches all the configured indices with the search keywords from the +// supplied document. 
+func (p Pages) Related(doc related.Document) (Pages, error) { + page, err := unwrapPage(doc) + if err != nil { + return nil, err + } + + result, err := p.searchDoc(page) + if err != nil { + return nil, err + } + + return result.removeFirstIfFound(page), nil +} + +// RelatedIndices searches the given indices with the search keywords from the +// supplied document. +func (p Pages) RelatedIndices(doc related.Document, indices ...interface{}) (Pages, error) { + page, err := unwrapPage(doc) + if err != nil { + return nil, err + } + + indicesStr, err := cast.ToStringSliceE(indices) + if err != nil { + return nil, err + } + + result, err := p.searchDoc(page, indicesStr...) + if err != nil { + return nil, err + } + + return result.removeFirstIfFound(page), nil + +} + +// RelatedTo searches the given indices with the corresponding values. +func (p Pages) RelatedTo(args ...types.KeyValues) (Pages, error) { + if len(p) == 0 { + return nil, nil + } + + return p.search(args...) + +} + +func (p Pages) search(args ...types.KeyValues) (Pages, error) { + return p.withInvertedIndex(func(idx *related.InvertedIndex) ([]related.Document, error) { + return idx.SearchKeyValues(args...) + }) + +} + +func (p Pages) searchDoc(doc related.Document, indices ...string) (Pages, error) { + return p.withInvertedIndex(func(idx *related.InvertedIndex) ([]related.Document, error) { + return idx.SearchDoc(doc, indices...) + }) +} + +func (p Pages) withInvertedIndex(search func(idx *related.InvertedIndex) ([]related.Document, error)) (Pages, error) { + if len(p) == 0 { + return nil, nil + } + + cache := p[0].s.relatedDocsHandler + + searchIndex, err := cache.getOrCreateIndex(p) + if err != nil { + return nil, err + } + + result, err := search(searchIndex) + if err != nil { + return nil, err + } + + if len(result) > 0 { + mp := make(Pages, len(result)) + for i, match := range result { + mp[i] = match.(*Page) + } + return mp, nil + } + + return nil, nil +} + +type cachedPostingList struct { + p Pages + + postingList *related.InvertedIndex +} + +type relatedDocsHandler struct { + // This is configured in site or langugage config. + cfg related.Config + + postingLists []*cachedPostingList + mu sync.RWMutex +} + +func newSearchIndexHandler(cfg related.Config) *relatedDocsHandler { + return &relatedDocsHandler{cfg: cfg} +} + +// This assumes that a lock has been aquired. +func (s *relatedDocsHandler) getIndex(p Pages) *related.InvertedIndex { + for _, ci := range s.postingLists { + if fastEqualPages(p, ci.p) { + return ci.postingList + } + } + return nil +} + +func (s *relatedDocsHandler) getOrCreateIndex(p Pages) (*related.InvertedIndex, error) { + s.mu.RLock() + cachedIndex := s.getIndex(p) + if cachedIndex != nil { + s.mu.RUnlock() + return cachedIndex, nil + } + s.mu.RUnlock() + + s.mu.Lock() + defer s.mu.Unlock() + + if cachedIndex := s.getIndex(p); cachedIndex != nil { + return cachedIndex, nil + } + + searchIndex := related.NewInvertedIndex(s.cfg) + + for _, page := range p { + if err := searchIndex.Add(page); err != nil { + return nil, err + } + } + + s.postingLists = append(s.postingLists, &cachedPostingList{p: p, postingList: searchIndex}) + + return searchIndex, nil +} diff --git a/hugolib/pages_related_test.go b/hugolib/pages_related_test.go new file mode 100644 index 000000000..cf5da0983 --- /dev/null +++ b/hugolib/pages_related_test.go @@ -0,0 +1,75 @@ +// Copyright 2017-present The Hugo Authors. All rights reserved. 
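
The relatedDocsHandler above caches one inverted index per distinct page list, using fastEqualPages as the identity check and the read-lock / re-check-under-write-lock pattern so concurrent template calls build each index at most once. A standalone sketch of that caching pattern, keyed by a plain string for brevity (illustration only; the real handler keys on page-slice identity, not strings):

    package main

    import (
    	"fmt"
    	"sync"
    )

    // cache is a stand-in for relatedDocsHandler: look up under a read lock,
    // then re-check under the write lock before building.
    type cache struct {
    	mu      sync.RWMutex
    	entries map[string]string
    }

    func (c *cache) getOrCreate(key string, build func() string) string {
    	c.mu.RLock()
    	v, ok := c.entries[key]
    	c.mu.RUnlock()
    	if ok {
    		return v
    	}

    	c.mu.Lock()
    	defer c.mu.Unlock()
    	// Double-check: another goroutine may have built it while we waited.
    	if v, ok := c.entries[key]; ok {
    		return v
    	}
    	v = build()
    	c.entries[key] = v
    	return v
    }

    func main() {
    	c := &cache{entries: map[string]string{}}
    	fmt.Println(c.getOrCreate("regularPages", func() string { return "posting list" }))
    }
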
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hugolib + +import ( + "fmt" + "path/filepath" + "testing" + + "github.com/gohugoio/hugo/common/types" + "github.com/gohugoio/hugo/deps" + + "github.com/stretchr/testify/require" +) + +func TestRelated(t *testing.T) { + assert := require.New(t) + + t.Parallel() + + var ( + cfg, fs = newTestCfg() + //th = testHelper{cfg, fs, t} + ) + + pageTmpl := `--- +title: Page %d +keywords: [%s] +date: %s +--- + +Content +` + + writeSource(t, fs, filepath.Join("content", "page1.md"), fmt.Sprintf(pageTmpl, 1, "hugo, says", "2017-01-03")) + writeSource(t, fs, filepath.Join("content", "page2.md"), fmt.Sprintf(pageTmpl, 2, "hugo, rocks", "2017-01-02")) + writeSource(t, fs, filepath.Join("content", "page3.md"), fmt.Sprintf(pageTmpl, 3, "bep, says", "2017-01-01")) + + s := buildSingleSite(t, deps.DepsCfg{Fs: fs, Cfg: cfg}, BuildCfg{SkipRender: true}) + assert.Len(s.RegularPages, 3) + + result, err := s.RegularPages.RelatedTo(types.NewKeyValuesStrings("keywords", "hugo", "rocks")) + + assert.NoError(err) + assert.Len(result, 2) + assert.Equal("Page 2", result[0].Title) + assert.Equal("Page 1", result[1].Title) + + result, err = s.RegularPages.Related(s.RegularPages[0]) + assert.Len(result, 2) + assert.Equal("Page 2", result[0].Title) + assert.Equal("Page 3", result[1].Title) + + result, err = s.RegularPages.RelatedIndices(s.RegularPages[0], "keywords") + assert.Len(result, 2) + assert.Equal("Page 2", result[0].Title) + assert.Equal("Page 3", result[1].Title) + + result, err = s.RegularPages.RelatedTo(types.NewKeyValuesStrings("keywords", "bep", "rocks")) + assert.Len(result, 2) + assert.Equal("Page 2", result[0].Title) + assert.Equal("Page 3", result[1].Title) + +} diff --git a/hugolib/site.go b/hugolib/site.go index 13ca7f144..b8898264a 100644 --- a/hugolib/site.go +++ b/hugolib/site.go @@ -42,6 +42,7 @@ import ( "github.com/gohugoio/hugo/helpers" "github.com/gohugoio/hugo/output" "github.com/gohugoio/hugo/parser" + "github.com/gohugoio/hugo/related" "github.com/gohugoio/hugo/source" "github.com/gohugoio/hugo/tpl" "github.com/gohugoio/hugo/transform" @@ -135,6 +136,8 @@ type Site struct { // The func used to title case titles. 
titleFunc func(s string) string + relatedDocsHandler *relatedDocsHandler + siteStats *siteStats } @@ -176,6 +179,7 @@ func (s *Site) reset() *Site { layoutHandler: output.NewLayoutHandler(s.PathSpec.ThemeSet()), disabledKinds: s.disabledKinds, titleFunc: s.titleFunc, + relatedDocsHandler: newSearchIndexHandler(s.relatedDocsHandler.cfg), outputFormats: s.outputFormats, outputFormatsConfig: s.outputFormatsConfig, mediaTypesConfig: s.mediaTypesConfig, @@ -231,6 +235,21 @@ func newSite(cfg deps.DepsCfg) (*Site, error) { return nil, err } + var relatedContentConfig related.Config + + if cfg.Language.IsSet("related") { + relatedContentConfig, err = related.DecodeConfig(cfg.Language.Get("related")) + if err != nil { + return nil, err + } + } else { + relatedContentConfig = related.DefaultConfig + taxonomies := cfg.Language.GetStringMapString("taxonomies") + if _, found := taxonomies["tag"]; found { + relatedContentConfig.Add(related.IndexConfig{Name: "tags", Weight: 80}) + } + } + titleFunc := helpers.GetTitleFunc(cfg.Language.GetString("titleCaseStyle")) s := &Site{ @@ -239,6 +258,7 @@ func newSite(cfg deps.DepsCfg) (*Site, error) { Language: cfg.Language, disabledKinds: disabledKinds, titleFunc: titleFunc, + relatedDocsHandler: newSearchIndexHandler(relatedContentConfig), outputFormats: outputFormats, outputFormatsConfig: siteOutputFormatsConfig, mediaTypesConfig: siteMediaTypesConfig, @@ -1607,6 +1627,7 @@ func (s *Site) assembleTaxonomies() { // Prepare site for a new full build. func (s *Site) resetBuildState() { + s.relatedDocsHandler = newSearchIndexHandler(s.relatedDocsHandler.cfg) s.PageCollections = newPageCollectionsFromPages(s.rawAllPages) // TODO(bep) get rid of this double s.Info.PageCollections = s.PageCollections diff --git a/related/inverted_index.go b/related/inverted_index.go new file mode 100644 index 000000000..f0d598d33 --- /dev/null +++ b/related/inverted_index.go @@ -0,0 +1,450 @@ +// Copyright 2017-present The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package related holds code to help finding related content. +package related + +import ( + "errors" + "fmt" + "math" + "sort" + "strings" + "time" + + "github.com/gohugoio/hugo/common/types" + "github.com/mitchellh/mapstructure" +) + +var ( + _ Keyword = (*StringKeyword)(nil) + zeroDate = time.Time{} + + // DefaultConfig is the default related config. + DefaultConfig = Config{ + Threshold: 80, + Indices: IndexConfigs{ + IndexConfig{Name: "keywords", Weight: 100}, + IndexConfig{Name: "date", Weight: 10}, + }, + } +) + +/* +Config is the top level configuration element used to configure how to retrieve +related content in Hugo. + +An example site config.toml: + + [related] + threshold = 1 + [[related.indices]] + name = "keywords" + weight = 200 + [[related.indices]] + name = "tags" + weight = 100 + [[related.indices]] + name = "date" + weight = 1 + pattern = "2006" +*/ +type Config struct { + // Only include matches >= threshold, a normalized rank between 0 and 100. 
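
The site wiring above either decodes the language's `related` section or falls back to related.DefaultConfig, adding a "tags" index (weight 80) when that taxonomy exists. A sketch of what DecodeConfig (defined further down in this file) accepts, assuming this patch is applied so the related package is importable; the map roughly mirrors the config.toml example in the package comment below:

    package main

    import (
    	"fmt"

    	"github.com/gohugoio/hugo/related"
    )

    func main() {
    	// Roughly the shape the site config hands to DecodeConfig:
    	// a threshold plus a list of index definitions.
    	in := map[string]interface{}{
    		"threshold": 80,
    		"toLower":   true,
    		"indices": []map[string]interface{}{
    			{"name": "keywords", "weight": 150},
    			{"name": "tags", "weight": 100},
    			{"name": "date", "weight": 10, "pattern": "2006"},
    		},
    	}

    	cfg, err := related.DecodeConfig(in)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Printf("%d indices, threshold %d\n", len(cfg.Indices), cfg.Threshold)
    }
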
+ Threshold int + + // To get stable "See also" sections we, by default, exclude newer related pages. + IncludeNewer bool + + // Will lower case all string values and queries to the indices. + // May get better results, but at a slight performance cost. + ToLower bool + + Indices IndexConfigs +} + +func (c *Config) Add(index IndexConfig) { + if c.ToLower { + index.ToLower = true + } + c.Indices = append(c.Indices, index) +} + +// IndexConfigs holds a set of index configurations. +type IndexConfigs []IndexConfig + +// IndexConfig configures an index. +type IndexConfig struct { + // The index name. This directly maps to a field or Param name. + Name string + + // Contextual pattern used to convert the Param value into a string. + // Currently only used for dates. Can be used to, say, bump posts in the same + // time frame when searching for related documents. + // For dates it follows Go's time.Format patterns, i.e. + // "2006" for YYYY and "200601" for YYYYMM. + Pattern string + + // This field's weight when doing multi-index searches. Higher is "better". + Weight int + + // Will lower case all string values in and queries tothis index. + // May get better accurate results, but at a slight performance cost. + ToLower bool +} + +// Document is the interface an indexable document in Hugo must fulfill. +type Document interface { + // SearchKeywords returns a list of keywords for the given index config. + SearchKeywords(cfg IndexConfig) ([]Keyword, error) + + // When this document was or will be published. + PubDate() time.Time +} + +// InvertedIndex holds an inverted index, also sometimes named posting list, which +// lists, for every possible search term, the documents that contain that term. +type InvertedIndex struct { + cfg Config + index map[string]map[Keyword][]Document + + minWeight int + maxWeight int +} + +func (idx *InvertedIndex) getIndexCfg(name string) (IndexConfig, bool) { + for _, conf := range idx.cfg.Indices { + if conf.Name == name { + return conf, true + } + } + + return IndexConfig{}, false +} + +// NewInvertedIndex creates a new InvertedIndex. +// Documents to index must be added in Add. +func NewInvertedIndex(cfg Config) *InvertedIndex { + idx := &InvertedIndex{index: make(map[string]map[Keyword][]Document), cfg: cfg} + for _, conf := range cfg.Indices { + idx.index[conf.Name] = make(map[Keyword][]Document) + if conf.Weight < idx.minWeight { + // By default, the weight scale starts at 0, but we allow + // negative weights. + idx.minWeight = conf.Weight + } + if conf.Weight > idx.maxWeight { + idx.maxWeight = conf.Weight + } + } + return idx +} + +// Add documents to the inverted index. +// The value must support == and !=. +func (idx *InvertedIndex) Add(docs ...Document) error { + var err error + for _, config := range idx.cfg.Indices { + if config.Weight == 0 { + // Disabled + continue + } + setm := idx.index[config.Name] + + for _, doc := range docs { + var words []Keyword + words, err = doc.SearchKeywords(config) + if err != nil { + continue + } + + for _, keyword := range words { + setm[keyword] = append(setm[keyword], doc) + } + } + } + + return err + +} + +// queryElement holds the index name and keywords that can be used to compose a +// search for related content. 
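
Anything that satisfies the two-method Document interface above can be indexed: report keywords per configured index, and report a publish date. A minimal, hypothetical implementation plus index construction (illustration only, assuming the patch is applied):

    package main

    import (
    	"fmt"
    	"time"

    	"github.com/gohugoio/hugo/related"
    )

    // snippet is a hypothetical Document implementation, not part of the patch.
    type snippet struct {
    	tags []string
    	date time.Time
    }

    func (s snippet) SearchKeywords(cfg related.IndexConfig) ([]related.Keyword, error) {
    	if cfg.Name == "tags" {
    		return related.StringsToKeywords(s.tags...), nil
    	}
    	return nil, nil
    }

    func (s snippet) PubDate() time.Time { return s.date }

    func main() {
    	idx := related.NewInvertedIndex(related.Config{
    		Threshold: 50,
    		Indices:   related.IndexConfigs{{Name: "tags", Weight: 100}},
    	})

    	docs := []related.Document{
    		snippet{tags: []string{"hugo", "rocks"}, date: time.Now()},
    		snippet{tags: []string{"hugo"}, date: time.Now()},
    	}
    	if err := idx.Add(docs...); err != nil {
    		panic(err)
    	}
    	fmt.Println("indexed", len(docs), "documents")
    }
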
+type queryElement struct { + Index string + Keywords []Keyword +} + +func newQueryElement(index string, keywords ...Keyword) queryElement { + return queryElement{Index: index, Keywords: keywords} +} + +type ranks []*rank + +type rank struct { + Doc Document + Weight int + Matches int +} + +func (r *rank) addWeight(w int) { + r.Weight += w + r.Matches++ +} + +func newRank(doc Document, weight int) *rank { + return &rank{Doc: doc, Weight: weight, Matches: 1} +} + +func (r ranks) Len() int { return len(r) } +func (r ranks) Swap(i, j int) { r[i], r[j] = r[j], r[i] } +func (r ranks) Less(i, j int) bool { + if r[i].Weight == r[j].Weight { + return r[i].Doc.PubDate().After(r[j].Doc.PubDate()) + } + return r[i].Weight > r[j].Weight +} + +// SearchDoc finds the documents matching any of the keywords in the given indices +// against the given document. +// The resulting document set will be sorted according to number of matches +// and the index weights, and any matches with a rank below the configured +// threshold (normalize to 0..100) will be removed. +// If an index name is provided, only that index will be queried. +func (idx *InvertedIndex) SearchDoc(doc Document, indices ...string) ([]Document, error) { + var q []queryElement + + var configs IndexConfigs + + if len(indices) == 0 { + configs = idx.cfg.Indices + } else { + configs = make(IndexConfigs, len(indices)) + for i, indexName := range indices { + cfg, found := idx.getIndexCfg(indexName) + if !found { + return nil, fmt.Errorf("index %q not found", indexName) + } + configs[i] = cfg + } + } + + for _, cfg := range configs { + keywords, err := doc.SearchKeywords(cfg) + if err != nil { + return nil, err + } + + q = append(q, newQueryElement(cfg.Name, keywords...)) + + } + + return idx.searchDate(doc.PubDate(), q...) +} + +func (cfg IndexConfig) ToKeywords(v interface{}) ([]Keyword, error) { + var ( + keywords []Keyword + toLower = cfg.ToLower + ) + switch vv := v.(type) { + case string: + if toLower { + vv = strings.ToLower(vv) + } + keywords = append(keywords, StringKeyword(vv)) + case []string: + if toLower { + for i := 0; i < len(vv); i++ { + vv[i] = strings.ToLower(vv[i]) + } + } + keywords = append(keywords, StringsToKeywords(vv...)...) + case time.Time: + layout := "2006" + if cfg.Pattern != "" { + layout = cfg.Pattern + } + keywords = append(keywords, StringKeyword(vv.Format(layout))) + case nil: + return keywords, nil + default: + return keywords, fmt.Errorf("indexing currently not supported for for index %q and type %T", cfg.Name, vv) + } + + return keywords, nil +} + +// SearchKeyValues finds the documents matching any of the keywords in the given indices. +// The resulting document set will be sorted according to number of matches +// and the index weights, and any matches with a rank below the configured +// threshold (normalize to 0..100) will be removed. +func (idx *InvertedIndex) SearchKeyValues(args ...types.KeyValues) ([]Document, error) { + q := make([]queryElement, len(args)) + + for i, arg := range args { + var keywords []Keyword + key := arg.KeyString() + if key == "" { + return nil, fmt.Errorf("index %q not valid", arg.Key) + } + conf, found := idx.getIndexCfg(key) + if !found { + return nil, fmt.Errorf("index %q not found", key) + } + + for _, val := range arg.Values { + k, err := conf.ToKeywords(val) + if err != nil { + return nil, err + } + keywords = append(keywords, k...) + } + + q[i] = newQueryElement(conf.Name, keywords...) + + } + + return idx.search(q...) 
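
IndexConfig.ToKeywords above is the single conversion point from front-matter values to keywords: strings and string slices pass through (optionally lower-cased), while dates are formatted with the index's Pattern so posts fall into the same time bucket. A short sketch, assuming the patch is applied:

    package main

    import (
    	"fmt"
    	"time"

    	"github.com/gohugoio/hugo/related"
    )

    func main() {
    	tags := related.IndexConfig{Name: "tags", ToLower: true}
    	kw, _ := tags.ToKeywords([]string{"Hugo", "Rocks"})
    	fmt.Println(kw) // [hugo rocks]

    	date := related.IndexConfig{Name: "date", Pattern: "200601"}
    	kw, _ = date.ToKeywords(time.Date(2017, 8, 19, 0, 0, 0, 0, time.UTC))
    	fmt.Println(kw) // [201708]
    }
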
+} + +func (idx *InvertedIndex) search(query ...queryElement) ([]Document, error) { + return idx.searchDate(zeroDate, query...) +} + +func (idx *InvertedIndex) searchDate(upperDate time.Time, query ...queryElement) ([]Document, error) { + matchm := make(map[Document]*rank, 200) + applyDateFilter := !idx.cfg.IncludeNewer && !upperDate.IsZero() + + for _, el := range query { + setm, found := idx.index[el.Index] + if !found { + return []Document{}, fmt.Errorf("index for %q not found", el.Index) + } + + config, found := idx.getIndexCfg(el.Index) + if !found { + return []Document{}, fmt.Errorf("index config for %q not found", el.Index) + } + + for _, kw := range el.Keywords { + if docs, found := setm[kw]; found { + for _, doc := range docs { + if applyDateFilter { + // Exclude newer than the limit given + if doc.PubDate().After(upperDate) { + continue + } + } + r, found := matchm[doc] + if !found { + matchm[doc] = newRank(doc, config.Weight) + } else { + r.addWeight(config.Weight) + } + } + } + } + } + + if len(matchm) == 0 { + return []Document{}, nil + } + + matches := make(ranks, 0, 100) + + for _, v := range matchm { + avgWeight := v.Weight / v.Matches + weight := norm(avgWeight, idx.minWeight, idx.maxWeight) + threshold := idx.cfg.Threshold / v.Matches + + if weight >= threshold { + matches = append(matches, v) + } + } + + sort.Stable(matches) + + result := make([]Document, len(matches)) + + for i, m := range matches { + result[i] = m.Doc + } + + return result, nil +} + +// normalizes num to a number between 0 and 100. +func norm(num, min, max int) int { + if min > max { + panic("min > max") + } + return int(math.Floor((float64(num-min) / float64(max-min) * 100) + 0.5)) +} + +// DecodeConfig decodes a slice of map into Config. +func DecodeConfig(in interface{}) (Config, error) { + if in == nil { + return Config{}, errors.New("no related config provided") + } + + m, ok := in.(map[string]interface{}) + if !ok { + return Config{}, fmt.Errorf("expected map[string]interface {} got %T", in) + } + + if len(m) == 0 { + return Config{}, errors.New("empty related config provided") + } + + var c Config + + if err := mapstructure.WeakDecode(m, &c); err != nil { + return c, err + } + + if c.Threshold < 0 || c.Threshold > 100 { + return Config{}, errors.New("related threshold must be between 0 and 100") + } + + if c.ToLower { + for i, _ := range c.Indices { + c.Indices[i].ToLower = true + } + } + + return c, nil +} + +// StringKeyword is a string search keyword. +type StringKeyword string + +func (s StringKeyword) String() string { + return string(s) +} + +// Keyword is the interface a keyword in the search index must implement. +type Keyword interface { + String() string +} + +// StringsToKeywords converts the given slice of strings to a slice of Keyword. +func StringsToKeywords(s ...string) []Keyword { + kw := make([]Keyword, len(s)) + + for i := 0; i < len(s); i++ { + kw[i] = StringKeyword(s[i]) + } + + return kw +} diff --git a/related/inverted_index_test.go b/related/inverted_index_test.go new file mode 100644 index 000000000..781a969fb --- /dev/null +++ b/related/inverted_index_test.go @@ -0,0 +1,276 @@ +// Copyright 2017-present The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
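
The scoring in searchDate and norm above is easier to see with numbers. Under DefaultConfig (keywords weight 100, date weight 10, threshold 80), a candidate matching two keywords plus the date bucket is kept, while a date-only match is dropped. A standalone sketch that mirrors (does not import) that arithmetic:

    package main

    import (
    	"fmt"
    	"math"
    )

    // norm mirrors the helper above: scale a weight to 0..100.
    func norm(num, min, max int) int {
    	return int(math.Floor((float64(num-min) / float64(max-min) * 100) + 0.5))
    }

    func main() {
    	// Candidate matching two keywords (100 each) and the date bucket (10).
    	weight, matches := 100+100+10, 3
    	avg := weight / matches       // 70
    	rank := norm(avg, 0, 100)     // 70
    	threshold := 80 / matches     // 26; note: the threshold is divided by the match count
    	fmt.Println(rank >= threshold) // true: kept

    	// The same candidate matching only the date bucket.
    	fmt.Println(norm(10, 0, 100) >= 80/1) // false: dropped
    }
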
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package related + +import ( + "fmt" + "math/rand" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +type testDoc struct { + keywords map[string][]Keyword + date time.Time +} + +func (k *testDoc) String() string { + s := "\n" + for k, v := range k.keywords { + s += k + ":\t\t" + for _, vv := range v { + s += " " + vv.String() + } + s += "\n" + } + return s +} + +func newTestDoc(name string, keywords ...string) *testDoc { + km := make(map[string][]Keyword) + + time.Sleep(1 * time.Millisecond) + kw := &testDoc{keywords: km, date: time.Now()} + + kw.addKeywords(name, keywords...) + return kw +} + +func (t *testDoc) addKeywords(name string, keywords ...string) *testDoc { + keywordm := createTestKeywords(name, keywords...) + + for k, v := range keywordm { + keywords := make([]Keyword, len(v)) + for i := 0; i < len(v); i++ { + keywords[i] = StringKeyword(v[i]) + } + t.keywords[k] = keywords + } + return t +} + +func createTestKeywords(name string, keywords ...string) map[string][]string { + return map[string][]string{ + name: keywords, + } +} + +func (k *testDoc) SearchKeywords(cfg IndexConfig) ([]Keyword, error) { + return k.keywords[cfg.Name], nil +} + +func (k *testDoc) PubDate() time.Time { + return k.date +} + +func TestSearch(t *testing.T) { + + config := Config{ + Threshold: 90, + IncludeNewer: false, + Indices: IndexConfigs{ + IndexConfig{Name: "tags", Weight: 50}, + IndexConfig{Name: "keywords", Weight: 65}, + }, + } + + idx := NewInvertedIndex(config) + //idx.debug = true + + docs := []Document{ + newTestDoc("tags", "a", "b", "c", "d"), + newTestDoc("tags", "b", "d", "g"), + newTestDoc("tags", "b", "h").addKeywords("keywords", "a"), + newTestDoc("tags", "g", "h").addKeywords("keywords", "a", "b"), + } + + idx.Add(docs...) 
+ + t.Run("count", func(t *testing.T) { + assert := require.New(t) + assert.Len(idx.index, 2) + set1, found := idx.index["tags"] + assert.True(found) + // 6 tags + assert.Len(set1, 6) + + set2, found := idx.index["keywords"] + assert.True(found) + assert.Len(set2, 2) + + }) + + t.Run("search-tags", func(t *testing.T) { + assert := require.New(t) + m, err := idx.search(newQueryElement("tags", StringsToKeywords("a", "b", "d", "z")...)) + assert.NoError(err) + assert.Len(m, 2) + assert.Equal(docs[0], m[0]) + assert.Equal(docs[1], m[1]) + }) + + t.Run("search-tags-and-keywords", func(t *testing.T) { + assert := require.New(t) + m, err := idx.search( + newQueryElement("tags", StringsToKeywords("a", "b", "z")...), + newQueryElement("keywords", StringsToKeywords("a", "b")...)) + assert.NoError(err) + assert.Len(m, 3) + assert.Equal(docs[3], m[0]) + assert.Equal(docs[2], m[1]) + assert.Equal(docs[0], m[2]) + }) + + t.Run("searchdoc-all", func(t *testing.T) { + assert := require.New(t) + doc := newTestDoc("tags", "a").addKeywords("keywords", "a") + m, err := idx.SearchDoc(doc) + assert.NoError(err) + assert.Len(m, 2) + assert.Equal(docs[3], m[0]) + assert.Equal(docs[2], m[1]) + }) + + t.Run("searchdoc-tags", func(t *testing.T) { + assert := require.New(t) + doc := newTestDoc("tags", "a", "b", "d", "z").addKeywords("keywords", "a", "b") + m, err := idx.SearchDoc(doc, "tags") + assert.NoError(err) + assert.Len(m, 2) + assert.Equal(docs[0], m[0]) + assert.Equal(docs[1], m[1]) + }) + + t.Run("searchdoc-keywords-date", func(t *testing.T) { + assert := require.New(t) + doc := newTestDoc("tags", "a", "b", "d", "z").addKeywords("keywords", "a", "b") + // This will get a date newer than the others. + newDoc := newTestDoc("keywords", "a", "b") + idx.Add(newDoc) + + m, err := idx.SearchDoc(doc, "keywords") + assert.NoError(err) + assert.Len(m, 2) + assert.Equal(docs[3], m[0]) + }) + +} + +func BenchmarkRelatedNewIndex(b *testing.B) { + + pages := make([]*testDoc, 100) + numkeywords := 30 + allKeywords := make([]string, numkeywords) + for i := 0; i < numkeywords; i++ { + allKeywords[i] = fmt.Sprintf("keyword%d", i+1) + } + + for i := 0; i < len(pages); i++ { + start := rand.Intn(len(allKeywords)) + end := start + 3 + if end >= len(allKeywords) { + end = start + 1 + } + + kw := newTestDoc("tags", allKeywords[start:end]...) + if i%5 == 0 { + start := rand.Intn(len(allKeywords)) + end := start + 3 + if end >= len(allKeywords) { + end = start + 1 + } + kw.addKeywords("keywords", allKeywords[start:end]...) + } + + pages[i] = kw + } + + cfg := Config{ + Threshold: 50, + Indices: IndexConfigs{ + IndexConfig{Name: "tags", Weight: 100}, + IndexConfig{Name: "keywords", Weight: 200}, + }, + } + + b.Run("singles", func(b *testing.B) { + for i := 0; i < b.N; i++ { + idx := NewInvertedIndex(cfg) + for _, doc := range pages { + idx.Add(doc) + } + } + }) + + b.Run("all", func(b *testing.B) { + for i := 0; i < b.N; i++ { + idx := NewInvertedIndex(cfg) + docs := make([]Document, len(pages)) + for i := 0; i < len(pages); i++ { + docs[i] = pages[i] + } + idx.Add(docs...) + } + }) + +} + +func BenchmarkRelatedMatchesIn(b *testing.B) { + + q1 := newQueryElement("tags", StringsToKeywords("keyword2", "keyword5", "keyword32", "asdf")...) + q2 := newQueryElement("keywords", StringsToKeywords("keyword3", "keyword4")...) 
+ + docs := make([]*testDoc, 1000) + numkeywords := 20 + allKeywords := make([]string, numkeywords) + for i := 0; i < numkeywords; i++ { + allKeywords[i] = fmt.Sprintf("keyword%d", i+1) + } + + cfg := Config{ + Threshold: 20, + Indices: IndexConfigs{ + IndexConfig{Name: "tags", Weight: 100}, + IndexConfig{Name: "keywords", Weight: 200}, + }, + } + + idx := NewInvertedIndex(cfg) + + for i := 0; i < len(docs); i++ { + start := rand.Intn(len(allKeywords)) + end := start + 3 + if end >= len(allKeywords) { + end = start + 1 + } + + index := "tags" + if i%5 == 0 { + index = "keywords" + } + + idx.Add(newTestDoc(index, allKeywords[start:end]...)) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + if i%10 == 0 { + idx.search(q2) + } else { + idx.search(q1) + } + } +} diff --git a/tpl/collections/collections.go b/tpl/collections/collections.go index bc80acbbe..8f09097a7 100644 --- a/tpl/collections/collections.go +++ b/tpl/collections/collections.go @@ -23,6 +23,7 @@ import ( "strings" "time" + "github.com/gohugoio/hugo/common/types" "github.com/gohugoio/hugo/deps" "github.com/gohugoio/hugo/helpers" "github.com/spf13/cast" @@ -641,3 +642,8 @@ func (ns *Namespace) Uniq(l interface{}) (interface{}, error) { } return ret.Interface(), nil } + +// KeyVals creates a key and values wrapper. +func (ns *Namespace) KeyVals(key interface{}, vals ...interface{}) (types.KeyValues, error) { + return types.KeyValues{Key: key, Values: vals}, nil +} diff --git a/tpl/collections/init.go b/tpl/collections/init.go index 4a7c2d875..91b0dea01 100644 --- a/tpl/collections/init.go +++ b/tpl/collections/init.go @@ -63,6 +63,13 @@ func init() { [][2]string{}, ) + ns.AddMethodMapping(ctx.KeyVals, + []string{"keyVals"}, + [][2]string{ + {`{{ keyVals "key" "a" "b" }}`, `key: [a b]`}, + }, + ) + ns.AddMethodMapping(ctx.In, []string{"in"}, [][2]string{
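
Taken together: the new keyVals template function is just a constructor for types.KeyValues, which is exactly what PageGenealogist.RelatedTo consumes, e.g. {{ $related := .RegularPages.RelatedTo (keyVals "keywords" "hugo" "rocks") }}. A small sketch of the value it builds, assuming the patch is applied:

    package main

    import (
    	"fmt"

    	"github.com/gohugoio/hugo/common/types"
    )

    func main() {
    	// What `keyVals "keywords" "hugo" "rocks"` produces in a template,
    	// and what Pages.RelatedTo expects as arguments.
    	kv := types.KeyValues{Key: "keywords", Values: []interface{}{"hugo", "rocks"}}
    	fmt.Println(kv)             // keywords: [hugo rocks]
    	fmt.Println(kv.KeyString()) // keywords
    }
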