mirror of
https://github.com/gohugoio/hugo.git
synced 2024-11-21 20:46:30 -05:00
parent
d5601e8391
commit
e442a63bb7
4 changed files with 106 additions and 2 deletions
|
@ -160,6 +160,10 @@ applyFilter
|
||||||
weight
|
weight
|
||||||
: An integer weight that indicates _how important_ this parameter is relative to the other parameters. It can be 0, which has the effect of turning this index off, or even negative. Test with different values to see what fits your content best.
|
: An integer weight that indicates _how important_ this parameter is relative to the other parameters. It can be 0, which has the effect of turning this index off, or even negative. Test with different values to see what fits your content best.
|
||||||
|
|
||||||
|
|
||||||
|
cardinalityThreshold (default 0)
|
||||||
|
: {{< new-in "0.111.0" >}}. A percentage (0-100) used to remove common keywords from the index. As an example, setting this to 50 will remove all keywords that are used in more than 50% of the documents in the index.
|
||||||
|
|
||||||
pattern
|
pattern
|
||||||
: This is currently only relevant for dates. When listing related content, we may want to list content that is also close in time. Setting "2006" (default value for date indexes) as the pattern for a date index will add weight to pages published in the same year. For busier blogs, "200601" (year and month) may be a better default.
|
: This is currently only relevant for dates. When listing related content, we may want to list content that is also close in time. Setting "2006" (default value for date indexes) as the pattern for a date index will add weight to pages published in the same year. For busier blogs, "200601" (year and month) may be a better default.
|
||||||
|
|
||||||
|
|
|
@ -135,9 +135,21 @@ type IndexConfig struct {
|
||||||
// This field's weight when doing multi-index searches. Higher is "better".
|
// This field's weight when doing multi-index searches. Higher is "better".
|
||||||
Weight int
|
Weight int
|
||||||
|
|
||||||
|
// A percentage (0-100) used to remove common keywords from the index.
|
||||||
|
// As an example, setting this to 50 will remove all keywords that are
|
||||||
|
// used in more than 50% of the documents in the index.
|
||||||
|
CardinalityThreshold int
|
||||||
|
|
||||||
// Will lowercase all string values in, and queries to, this index.
|
// Will lowercase all string values in, and queries to, this index.
|
||||||
// May give more accurate results, but at a slight performance cost.
|
// May give more accurate results, but at a slight performance cost.
|
||||||
ToLower bool
|
ToLower bool
|
||||||
|
|
||||||
|
// Counts the number of documents in the index.
|
||||||
|
numDocs int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (cfg *IndexConfig) incrNumDocs() {
|
||||||
|
cfg.numDocs++
|
||||||
}
|
}
|
||||||
|
|
||||||
// Document is the interface an indexable document in Hugo must fulfill.
|
// Document is the interface an indexable document in Hugo must fulfill.
|
||||||
|
@ -169,6 +181,9 @@ type InvertedIndex struct {
|
||||||
|
|
||||||
minWeight int
|
minWeight int
|
||||||
maxWeight int
|
maxWeight int
|
||||||
|
|
||||||
|
// No modifications after this is set.
|
||||||
|
finalized bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func (idx *InvertedIndex) getIndexCfg(name string) (IndexConfig, bool) {
|
func (idx *InvertedIndex) getIndexCfg(name string) (IndexConfig, bool) {
|
||||||
|
@ -202,8 +217,11 @@ func NewInvertedIndex(cfg Config) *InvertedIndex {
|
||||||
// Add documents to the inverted index.
|
// Add documents to the inverted index.
|
||||||
// The value must support == and !=.
|
// The value must support == and !=.
|
||||||
func (idx *InvertedIndex) Add(ctx context.Context, docs ...Document) error {
|
func (idx *InvertedIndex) Add(ctx context.Context, docs ...Document) error {
|
||||||
|
if idx.finalized {
|
||||||
|
panic("index is finalized")
|
||||||
|
}
|
||||||
var err error
|
var err error
|
||||||
for _, config := range idx.cfg.Indices {
|
for i, config := range idx.cfg.Indices {
|
||||||
if config.Weight == 0 {
|
if config.Weight == 0 {
|
||||||
// Disabled
|
// Disabled
|
||||||
continue
|
continue
|
||||||
|
@ -211,6 +229,7 @@ func (idx *InvertedIndex) Add(ctx context.Context, docs ...Document) error {
|
||||||
setm := idx.index[config.Name]
|
setm := idx.index[config.Name]
|
||||||
|
|
||||||
for _, doc := range docs {
|
for _, doc := range docs {
|
||||||
|
var added bool
|
||||||
var words []Keyword
|
var words []Keyword
|
||||||
words, err = doc.RelatedKeywords(config)
|
words, err = doc.RelatedKeywords(config)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -218,22 +237,60 @@ func (idx *InvertedIndex) Add(ctx context.Context, docs ...Document) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, keyword := range words {
|
for _, keyword := range words {
|
||||||
|
added = true
|
||||||
setm[keyword] = append(setm[keyword], doc)
|
setm[keyword] = append(setm[keyword], doc)
|
||||||
}
|
}
|
||||||
|
|
||||||
if config.Type == TypeFragments {
|
if config.Type == TypeFragments {
|
||||||
if fp, ok := doc.(FragmentProvider); ok {
|
if fp, ok := doc.(FragmentProvider); ok {
|
||||||
for _, fragment := range fp.Fragments(ctx).Identifiers {
|
for _, fragment := range fp.Fragments(ctx).Identifiers {
|
||||||
|
added = true
|
||||||
setm[FragmentKeyword(fragment)] = append(setm[FragmentKeyword(fragment)], doc)
|
setm[FragmentKeyword(fragment)] = append(setm[FragmentKeyword(fragment)], doc)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if added {
|
||||||
|
c := &idx.cfg.Indices[i]
|
||||||
|
(*c).incrNumDocs()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (idx *InvertedIndex) Finalize(ctx context.Context) error {
|
||||||
|
if idx.finalized {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, config := range idx.cfg.Indices {
|
||||||
|
if config.CardinalityThreshold == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
setm := idx.index[config.Name]
|
||||||
|
numDocs := config.numDocs
|
||||||
|
if numDocs == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove high cardinality terms.
|
||||||
|
for k, v := range setm {
|
||||||
|
percentageWithKeyword := int(math.Ceil(float64(len(v)) / float64(numDocs) * 100))
|
||||||
|
if percentageWithKeyword > config.CardinalityThreshold {
|
||||||
|
delete(setm, k)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
idx.finalized = true
|
||||||
|
|
||||||
|
return nil
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
// queryElement holds the index name and keywords that can be used to compose a
|
// queryElement holds the index name and keywords that can be used to compose a
|
||||||
// search for related content.
|
// search for related content.
|
||||||
type queryElement struct {
|
type queryElement struct {
|
||||||
|
@ -548,12 +605,16 @@ func DecodeConfig(m maps.Params) (Config, error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for i := range c.Indices {
|
for i := range c.Indices {
|
||||||
if c.Indices[i].Type == "" {
|
icfg := c.Indices[i]
|
||||||
|
if icfg.Type == "" {
|
||||||
c.Indices[i].Type = TypeBasic
|
c.Indices[i].Type = TypeBasic
|
||||||
}
|
}
|
||||||
if !validTypes[c.Indices[i].Type] {
|
if !validTypes[c.Indices[i].Type] {
|
||||||
return c, fmt.Errorf("invalid index type %q. Must be one of %v", c.Indices[i].Type, xmaps.Keys(validTypes))
|
return c, fmt.Errorf("invalid index type %q. Must be one of %v", c.Indices[i].Type, xmaps.Keys(validTypes))
|
||||||
}
|
}
|
||||||
|
if icfg.CardinalityThreshold < 0 || icfg.CardinalityThreshold > 100 {
|
||||||
|
return Config{}, errors.New("cardinalityThreshold threshold must be between 0 and 100")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return c, nil
|
return c, nil
|
||||||
|
|
|
@ -86,6 +86,41 @@ func (d *testDoc) PublishDate() time.Time {
|
||||||
return d.date
|
return d.date
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestCardinalityThreshold(t *testing.T) {
|
||||||
|
c := qt.New(t)
|
||||||
|
config := Config{
|
||||||
|
Threshold: 90,
|
||||||
|
IncludeNewer: false,
|
||||||
|
Indices: IndexConfigs{
|
||||||
|
IndexConfig{Name: "tags", Weight: 50, CardinalityThreshold: 79},
|
||||||
|
IndexConfig{Name: "keywords", Weight: 65, CardinalityThreshold: 90},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
idx := NewInvertedIndex(config)
|
||||||
|
hasKeyword := func(index, keyword string) bool {
|
||||||
|
_, found := idx.index[index][StringKeyword(keyword)]
|
||||||
|
return found
|
||||||
|
}
|
||||||
|
|
||||||
|
docs := []Document{
|
||||||
|
newTestDoc("tags", "a", "b", "c", "d"),
|
||||||
|
newTestDoc("tags", "b", "d", "g"),
|
||||||
|
newTestDoc("tags", "b", "d", "g"),
|
||||||
|
newTestDoc("tags", "b", "h").addKeywords("keywords", "a"),
|
||||||
|
newTestDoc("tags", "g", "h").addKeywords("keywords", "a", "b", "z"),
|
||||||
|
}
|
||||||
|
|
||||||
|
idx.Add(context.Background(), docs...)
|
||||||
|
c.Assert(idx.Finalize(context.Background()), qt.IsNil)
|
||||||
|
// Only tags=b should be removed.
|
||||||
|
c.Assert(hasKeyword("tags", "a"), qt.Equals, true)
|
||||||
|
c.Assert(hasKeyword("tags", "b"), qt.Equals, false)
|
||||||
|
c.Assert(hasKeyword("tags", "d"), qt.Equals, true)
|
||||||
|
c.Assert(hasKeyword("keywords", "b"), qt.Equals, true)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
func TestSearch(t *testing.T) {
|
func TestSearch(t *testing.T) {
|
||||||
config := Config{
|
config := Config{
|
||||||
Threshold: 90,
|
Threshold: 90,
|
||||||
|
|
|
@ -236,5 +236,9 @@ func (s *RelatedDocsHandler) getOrCreateIndex(ctx context.Context, p Pages) (*re
|
||||||
|
|
||||||
s.postingLists = append(s.postingLists, &cachedPostingList{p: p, postingList: searchIndex})
|
s.postingLists = append(s.postingLists, &cachedPostingList{p: p, postingList: searchIndex})
|
||||||
|
|
||||||
|
if err := searchIndex.Finalize(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
return searchIndex, nil
|
return searchIndex, nil
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue