// Copyright 2024 The Hugo Authors. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package page import ( "context" "html/template" "regexp" "strings" "unicode" "unicode/utf8" "github.com/gohugoio/hugo/common/types" "github.com/gohugoio/hugo/markup/tableofcontents" "github.com/gohugoio/hugo/media" "github.com/gohugoio/hugo/tpl" ) type Content interface { Content(context.Context) (template.HTML, error) ContentWithoutSummary(context.Context) (template.HTML, error) Summary(context.Context) (Summary, error) Plain(context.Context) string PlainWords(context.Context) []string WordCount(context.Context) int FuzzyWordCount(context.Context) int ReadingTime(context.Context) int Len(context.Context) int } type Markup interface { Render(context.Context) (Content, error) RenderString(ctx context.Context, args ...any) (template.HTML, error) RenderShortcodes(context.Context) (template.HTML, error) Fragments(context.Context) *tableofcontents.Fragments } var _ types.PrintableValueProvider = Summary{} const ( SummaryTypeAuto = "auto" SummaryTypeManual = "manual" SummaryTypeFrontMatter = "frontmatter" ) type Summary struct { Text template.HTML Type string // "auto", "manual" or "frontmatter" Truncated bool } func (s Summary) IsZero() bool { return s.Text == "" } func (s Summary) PrintableValue() any { return s.Text } var _ types.PrintableValueProvider = (*Summary)(nil) type HtmlSummary struct { source string SummaryLowHigh types.LowHigh[string] SummaryEndTag types.LowHigh[string] WrapperStart types.LowHigh[string] WrapperEnd types.LowHigh[string] Divider types.LowHigh[string] } func (s HtmlSummary) wrap(ss string) string { if s.WrapperStart.IsZero() { return ss } return s.source[s.WrapperStart.Low:s.WrapperStart.High] + ss + s.source[s.WrapperEnd.Low:s.WrapperEnd.High] } func (s HtmlSummary) wrapLeft(ss string) string { if s.WrapperStart.IsZero() { return ss } return s.source[s.WrapperStart.Low:s.WrapperStart.High] + ss } func (s HtmlSummary) Value(l types.LowHigh[string]) string { return s.source[l.Low:l.High] } func (s HtmlSummary) trimSpace(ss string) string { return strings.TrimSpace(ss) } func (s HtmlSummary) Content() string { if s.Divider.IsZero() { return s.source } ss := s.source[:s.Divider.Low] ss += s.source[s.Divider.High:] return s.trimSpace(ss) } func (s HtmlSummary) Summary() string { if s.Divider.IsZero() { return s.trimSpace(s.wrap(s.Value(s.SummaryLowHigh))) } ss := s.source[s.SummaryLowHigh.Low:s.Divider.Low] if s.SummaryLowHigh.High > s.Divider.High { ss += s.source[s.Divider.High:s.SummaryLowHigh.High] } if !s.SummaryEndTag.IsZero() { ss += s.Value(s.SummaryEndTag) } return s.trimSpace(s.wrap(ss)) } func (s HtmlSummary) ContentWithoutSummary() string { if s.Divider.IsZero() { if s.SummaryLowHigh.Low == s.WrapperStart.High && s.SummaryLowHigh.High == s.WrapperEnd.Low { return "" } return s.trimSpace(s.wrapLeft(s.source[s.SummaryLowHigh.High:])) } if s.SummaryEndTag.IsZero() { return s.trimSpace(s.wrapLeft(s.source[s.Divider.High:])) } return s.trimSpace(s.wrapLeft(s.source[s.SummaryEndTag.High:])) } func (s HtmlSummary) Truncated() bool { return s.SummaryLowHigh.High < len(s.source) } func (s *HtmlSummary) resolveParagraphTagAndSetWrapper(mt media.Type) tagReStartEnd { ptag := startEndP switch mt.SubType { case media.DefaultContentTypes.AsciiDoc.SubType: ptag = startEndDiv case media.DefaultContentTypes.ReStructuredText.SubType: const markerStart = "
" const markerEnd = "
" i1 := strings.Index(s.source, markerStart) i2 := strings.LastIndex(s.source, markerEnd) if i1 > -1 && i2 > -1 { s.WrapperStart = types.LowHigh[string]{Low: 0, High: i1 + len(markerStart)} s.WrapperEnd = types.LowHigh[string]{Low: i2, High: len(s.source)} } } return ptag } // Avoid counting words that are most likely HTML tokens. var ( isProbablyHTMLTag = regexp.MustCompile(`^<\/?[A-Za-z]+>?$`) isProablyHTMLAttribute = regexp.MustCompile(`^[A-Za-z]+=["']`) ) func isProbablyHTMLToken(s string) bool { return s == ">" || isProbablyHTMLTag.MatchString(s) || isProablyHTMLAttribute.MatchString(s) } // ExtractSummaryFromHTML extracts a summary from the given HTML content. func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK bool) (result HtmlSummary) { result.source = input ptag := result.resolveParagraphTagAndSetWrapper(mt) if numWords <= 0 { return result } var count int countWord := func(word string) int { word = strings.TrimSpace(word) if len(word) == 0 { return 0 } if isProbablyHTMLToken(word) { return 0 } if isCJK { word = tpl.StripHTML(word) runeCount := utf8.RuneCountInString(word) if len(word) == runeCount { return 1 } else { return runeCount } } return 1 } high := len(input) if result.WrapperEnd.Low > 0 { high = result.WrapperEnd.Low } for j := result.WrapperStart.High; j < high; { s := input[j:] closingIndex := strings.Index(s, "") if closingIndex == -1 { break } s = s[:closingIndex] // Count the words in the current paragraph. var wi int for i, r := range s { if unicode.IsSpace(r) || (i+utf8.RuneLen(r) == len(s)) { word := s[wi:i] count += countWord(word) wi = i if count >= numWords { break } } } if count >= numWords { result.SummaryLowHigh = types.LowHigh[string]{ Low: result.WrapperStart.High, High: j + closingIndex + len(ptag.tagName) + 3, } return } j += closingIndex + len(ptag.tagName) + 2 } result.SummaryLowHigh = types.LowHigh[string]{ Low: result.WrapperStart.High, High: high, } return } // ExtractSummaryFromHTMLWithDivider extracts a summary from the given HTML content with // a manual summary divider. func ExtractSummaryFromHTMLWithDivider(mt media.Type, input, divider string) (result HtmlSummary) { result.source = input result.Divider.Low = strings.Index(input, divider) result.Divider.High = result.Divider.Low + len(divider) if result.Divider.Low == -1 { // No summary. return } ptag := result.resolveParagraphTagAndSetWrapper(mt) if !mt.IsHTML() { result.Divider, result.SummaryEndTag = expandSummaryDivider(result.source, ptag, result.Divider) } result.SummaryLowHigh = types.LowHigh[string]{ Low: result.WrapperStart.High, High: result.Divider.Low, } return } var ( pOrDiv = regexp.MustCompile(`]?>|]?>$`) startEndDiv = tagReStartEnd{ startEndOfString: regexp.MustCompile(`]*?>$`), endEndOfString: regexp.MustCompile(`$`), tagName: "div", } startEndP = tagReStartEnd{ startEndOfString: regexp.MustCompile(`]*?>$`), endEndOfString: regexp.MustCompile(`

$`), tagName: "p", } ) type tagReStartEnd struct { startEndOfString *regexp.Regexp endEndOfString *regexp.Regexp tagName string } func expandSummaryDivider(s string, re tagReStartEnd, divider types.LowHigh[string]) (types.LowHigh[string], types.LowHigh[string]) { var endMarkup types.LowHigh[string] if divider.IsZero() { return divider, endMarkup } lo, hi := divider.Low, divider.High var preserveEndMarkup bool // Find the start of the paragraph. for i := lo - 1; i >= 0; i-- { if s[i] == '>' { if match := re.startEndOfString.FindString(s[:i+1]); match != "" { lo = i - len(match) + 1 break } if match := pOrDiv.FindString(s[:i+1]); match != "" { i -= len(match) - 1 continue } } r, _ := utf8.DecodeRuneInString(s[i:]) if !unicode.IsSpace(r) { preserveEndMarkup = true break } } divider.Low = lo // Now walk forward to the end of the paragraph. for ; hi < len(s); hi++ { if s[hi] != '>' { continue } if match := re.endEndOfString.FindString(s[:hi+1]); match != "" { hi++ break } } if preserveEndMarkup { endMarkup.Low = divider.High endMarkup.High = hi } else { divider.High = hi } // Consume trailing newline if any. if divider.High < len(s) && s[divider.High] == '\n' { divider.High++ } return divider, endMarkup }