hugo/resources/page/page_markup.go
Bjørn Erik Pedersen 3d6baedaec Don't count HTML markup in auto summaries
This commit also fixes a bug where a `</picture>` end tag was wrongly used to detect the end of a paragraph. This should be very rare, though.

Closes #12837
2024-09-10 11:03:47 +02:00

362 lines
8.5 KiB
Go

// Copyright 2024 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package page
import (
"context"
"html/template"
"regexp"
"strings"
"unicode"
"unicode/utf8"
"github.com/gohugoio/hugo/common/types"
"github.com/gohugoio/hugo/markup/tableofcontents"
"github.com/gohugoio/hugo/media"
"github.com/gohugoio/hugo/tpl"
)
// Content groups the context-aware accessors declared below: two HTML forms
// (Content, ContentWithoutSummary), the Summary, two plain forms (Plain,
// PlainWords) and four integer metrics (WordCount, FuzzyWordCount,
// ReadingTime, Len).
//
// NOTE(review): the exact semantics of each method are defined by the
// implementations, which are not part of this file section.
type Content interface {
Content(context.Context) (template.HTML, error)
ContentWithoutSummary(context.Context) (template.HTML, error)
Summary(context.Context) (Summary, error)
Plain(context.Context) string
PlainWords(context.Context) []string
WordCount(context.Context) int
FuzzyWordCount(context.Context) int
ReadingTime(context.Context) int
Len(context.Context) int
}
// Markup groups the rendering entry points declared below. Render yields a
// Content value; RenderString and RenderShortcodes yield HTML directly;
// Fragments yields a *tableofcontents.Fragments.
//
// NOTE(review): the accepted variadic args of RenderString are not visible in
// this file section — see the implementation for the supported forms.
type Markup interface {
Render(context.Context) (Content, error)
RenderString(ctx context.Context, args ...any) (template.HTML, error)
RenderShortcodes(context.Context) (template.HTML, error)
Fragments(context.Context) *tableofcontents.Fragments
}
// Compile-time assertion that a Summary value satisfies
// types.PrintableValueProvider.
var _ types.PrintableValueProvider = Summary{}
// The values documented for Summary.Type.
const (
SummaryTypeAuto = "auto"
SummaryTypeManual = "manual"
SummaryTypeFrontMatter = "frontmatter"
)
// Summary carries a summary as HTML together with a label describing how it
// was produced.
type Summary struct {
	// Text is the summary markup.
	Text template.HTML

	// Type is one of "auto", "manual" or "frontmatter".
	Type string

	// Truncated is set by whoever builds the Summary.
	// NOTE(review): presumably true when content continues past the
	// summary — confirm against the producer.
	Truncated bool
}

// IsZero reports whether the summary has no text. Type and Truncated are not
// considered.
func (s Summary) IsZero() bool {
	return len(s.Text) == 0
}

// PrintableValue returns the summary text (as template.HTML) so the summary
// can be printed directly.
func (s Summary) PrintableValue() any {
	var v any = s.Text
	return v
}
// Compile-time assertion that *Summary satisfies
// types.PrintableValueProvider as well.
var _ types.PrintableValueProvider = (*Summary)(nil)
// HtmlSummary describes where a summary sits inside a piece of HTML. It keeps
// the full source and a set of byte ranges into it; the accessor methods
// slice the source on demand.
type HtmlSummary struct {
// source is the complete HTML input the ranges below refer to.
source string
// SummaryLowHigh is the byte range of the summary in source.
SummaryLowHigh types.LowHigh[string]
// SummaryEndTag, when set, is the range of closing markup that is appended
// to the summary when a manual divider sits inside a paragraph that also
// holds other content.
SummaryEndTag types.LowHigh[string]
// WrapperStart and WrapperEnd are the ranges of markup that wraps the whole
// document (set for reStructuredText); both are zero when there is no
// wrapper.
WrapperStart types.LowHigh[string]
WrapperEnd types.LowHigh[string]
// Divider is the range of the manual summary divider, possibly expanded to
// cover its surrounding paragraph markup.
Divider types.LowHigh[string]
}
// wrap encloses ss in the wrapper start and end markup sliced from the
// source. Without a recorded wrapper start, ss is returned untouched.
func (s HtmlSummary) wrap(ss string) string {
	if s.WrapperStart.IsZero() {
		return ss
	}
	prefix := s.source[s.WrapperStart.Low:s.WrapperStart.High]
	suffix := s.source[s.WrapperEnd.Low:s.WrapperEnd.High]
	return prefix + ss + suffix
}
// wrapLeft prepends the wrapper start markup to ss. The wrapper end is not
// added. Without a recorded wrapper start, ss is returned untouched.
func (s HtmlSummary) wrapLeft(ss string) string {
	if !s.WrapperStart.IsZero() {
		return s.source[s.WrapperStart.Low:s.WrapperStart.High] + ss
	}
	return ss
}
// Value returns the part of the source covered by the byte range l.
func (s HtmlSummary) Value(l types.LowHigh[string]) string {
	lo, hi := l.Low, l.High
	return s.source[lo:hi]
}
// trimSpace strips leading and trailing white space from ss. It does not use
// the receiver.
func (s HtmlSummary) trimSpace(ss string) string {
	trimmed := strings.TrimSpace(ss)
	return trimmed
}
// Content returns the source with the divider range cut out and the result
// trimmed. When there is no divider the source is returned unchanged (and
// untrimmed).
func (s HtmlSummary) Content() string {
	if s.Divider.IsZero() {
		return s.source
	}
	before, after := s.source[:s.Divider.Low], s.source[s.Divider.High:]
	return s.trimSpace(before + after)
}
// Summary returns the summary markup, wrapped in the document wrapper (if
// any) and trimmed.
//
// Without a divider this is simply the SummaryLowHigh range. With a divider,
// the summary is the text up to the divider, plus whatever part of the
// summary range lies beyond the divider, plus the preserved end tag when one
// was recorded.
func (s HtmlSummary) Summary() string {
	if s.Divider.IsZero() {
		return s.trimSpace(s.wrap(s.Value(s.SummaryLowHigh)))
	}

	var b strings.Builder
	b.WriteString(s.source[s.SummaryLowHigh.Low:s.Divider.Low])
	if s.Divider.High < s.SummaryLowHigh.High {
		b.WriteString(s.source[s.Divider.High:s.SummaryLowHigh.High])
	}
	if !s.SummaryEndTag.IsZero() {
		b.WriteString(s.Value(s.SummaryEndTag))
	}

	return s.trimSpace(s.wrap(b.String()))
}
// ContentWithoutSummary returns what follows the summary in the source,
// prefixed with the wrapper start markup (if any) and trimmed.
//
// The starting offset is chosen as follows:
//   - divider present, no preserved end tag: right after the divider;
//   - divider present, preserved end tag: right after that end tag;
//   - no divider and the summary fills the space between the wrapper start
//     and end exactly: there is no remainder and "" is returned;
//   - no divider otherwise: right after the summary range.
func (s HtmlSummary) ContentWithoutSummary() string {
	var from int

	switch {
	case !s.Divider.IsZero() && s.SummaryEndTag.IsZero():
		from = s.Divider.High
	case !s.Divider.IsZero():
		from = s.SummaryEndTag.High
	case s.SummaryLowHigh.Low == s.WrapperStart.High && s.SummaryLowHigh.High == s.WrapperEnd.Low:
		return ""
	default:
		from = s.SummaryLowHigh.High
	}

	return s.trimSpace(s.wrapLeft(s.source[from:]))
}
// Truncated reports whether the summary range ends before the end of the
// source.
//
// NOTE(review): when a wrapper end is recorded, a summary that runs up to
// WrapperEnd.Low still ends before len(source) and therefore reports true —
// confirm that this is intended.
func (s HtmlSummary) Truncated() bool {
	return len(s.source) > s.SummaryLowHigh.High
}
// resolveParagraphTagAndSetWrapper picks the paragraph tag description for
// the media type mt and, as a side effect, records the document wrapper for
// formats that have one.
//
// AsciiDoc uses div as its paragraph tag; everything else uses p. For
// reStructuredText, when both a `<div class="document">` opening and a
// closing `</div>` are found in the source, WrapperStart is set to everything
// up to and including the opening marker and WrapperEnd to everything from
// the last `</div>` to the end of the source.
func (s *HtmlSummary) resolveParagraphTagAndSetWrapper(mt media.Type) tagReStartEnd {
	switch mt.SubType {
	case media.DefaultContentTypes.AsciiDoc.SubType:
		return startEndDiv
	case media.DefaultContentTypes.ReStructuredText.SubType:
		const (
			markerStart = "<div class=\"document\">"
			markerEnd   = "</div>"
		)
		start := strings.Index(s.source, markerStart)
		end := strings.LastIndex(s.source, markerEnd)
		if start >= 0 && end >= 0 {
			s.WrapperStart = types.LowHigh[string]{High: start + len(markerStart)}
			s.WrapperEnd = types.LowHigh[string]{Low: end, High: len(s.source)}
		}
	}
	return startEndP
}
// Patterns used to keep tokens that are most likely HTML out of the word
// count.
var (
	// isProbablyHTMLTag matches a whole token that looks like an opening or
	// closing tag made of ASCII letters; the trailing '>' is optional.
	isProbablyHTMLTag = regexp.MustCompile(`^<\/?[A-Za-z]+>?$`)

	// isProablyHTMLAttribute matches a token that starts with ASCII letters
	// followed by '=' and a single or double quote.
	isProablyHTMLAttribute = regexp.MustCompile(`^[A-Za-z]+=["']`)
)

// isProbablyHTMLToken reports whether s looks like a piece of HTML markup
// rather than a word: a lone ">", a tag, or the start of an attribute.
func isProbablyHTMLToken(s string) bool {
	switch {
	case s == ">":
		return true
	case isProbablyHTMLTag.MatchString(s):
		return true
	default:
		return isProablyHTMLAttribute.MatchString(s)
	}
}
// ExtractSummaryFromHTML extracts a summary from the given HTML content.
//
// Paragraphs, as delimited by the closing tag resolved for mt, are consumed
// from the start of input (after any document wrapper) until at least
// numWords words have been counted. The summary therefore always ends right
// after a paragraph's closing tag. If the closing tag is not found, or the
// content holds fewer than numWords words, the summary spans everything up to
// the wrapper end (or the end of input).
//
// Tokens that look like HTML tags or attributes are not counted. When isCJK
// is true, a token containing multi-byte runes counts one word per rune
// (after stripping HTML).
//
// If numWords <= 0 the result carries only the source and wrapper; no summary
// range is set.
func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK bool) (result HtmlSummary) {
	result.source = input
	ptag := result.resolveParagraphTagAndSetWrapper(mt)

	if numWords <= 0 {
		return result
	}

	countWord := func(word string) int {
		word = strings.TrimSpace(word)
		if word == "" || isProbablyHTMLToken(word) {
			return 0
		}
		if !isCJK {
			return 1
		}
		word = tpl.StripHTML(word)
		if n := utf8.RuneCountInString(word); n != len(word) {
			// Multi-byte content: every rune counts as a word.
			return n
		}
		return 1
	}

	high := len(input)
	if result.WrapperEnd.Low > 0 {
		high = result.WrapperEnd.Low
	}

	closingTag := "</" + ptag.tagName + ">"
	var count int

	for j := result.WrapperStart.High; j < high; {
		closingIndex := strings.Index(input[j:], closingTag)
		if closingIndex == -1 {
			break
		}
		s := input[j : j+closingIndex]

		// Count the words in the current paragraph. Each token starts at
		// the white space that precedes it; countWord trims it.
		wi := 0
		for i, r := range s {
			if !unicode.IsSpace(r) {
				continue
			}
			count += countWord(s[wi:i])
			wi = i
			if count >= numWords {
				break
			}
		}
		if count < numWords {
			// Flush the trailing token in full. Slicing it as s[wi:i] at the
			// last rune would drop that rune, so a one-rune final word (or
			// the last rune of a CJK paragraph) would go uncounted.
			count += countWord(s[wi:])
		}

		// Step past the whole closing tag so that its '>' does not become
		// part of the next paragraph's first token.
		j += closingIndex + len(closingTag)

		if count >= numWords {
			result.SummaryLowHigh = types.LowHigh[string]{
				Low:  result.WrapperStart.High,
				High: j,
			}
			return result
		}
	}

	result.SummaryLowHigh = types.LowHigh[string]{
		Low:  result.WrapperStart.High,
		High: high,
	}

	return result
}
// ExtractSummaryFromHTMLWithDivider extracts a summary from the given HTML
// content, using the first occurrence of divider as the manual summary
// boundary.
//
// If divider does not occur in input, the result is returned with
// Divider.Low == -1 and no summary range. Otherwise the summary runs from the
// end of the wrapper start (if any) to the start of the divider. For
// non-HTML media types the divider range is first expanded to take in its
// surrounding paragraph markup; see expandSummaryDivider.
func ExtractSummaryFromHTMLWithDivider(mt media.Type, input, divider string) (result HtmlSummary) {
	result.source = input

	at := strings.Index(input, divider)
	result.Divider = types.LowHigh[string]{Low: at, High: at + len(divider)}
	if at < 0 {
		// No summary.
		return result
	}

	ptag := result.resolveParagraphTagAndSetWrapper(mt)
	if !mt.IsHTML() {
		result.Divider, result.SummaryEndTag = expandSummaryDivider(result.source, ptag, result.Divider)
	}

	result.SummaryLowHigh = types.LowHigh[string]{
		Low:  result.WrapperStart.High,
		High: result.Divider.Low,
	}

	return result
}
var (
	// pOrDiv matches an opening <p> or <div> tag that ends the string.
	//
	// The alternation is grouped so that the `$` anchor applies to both
	// branches. Without the group only the <div> branch is anchored and a
	// <p> anywhere in the string matches, while expandSummaryDivider steps
	// back by the match length on the assumption that the match ends at
	// its current position.
	//
	// NOTE(review): `[^>]?` admits at most one character after the tag
	// name, so tags carrying attributes are not matched — confirm whether
	// `[^>]*` was intended.
	pOrDiv = regexp.MustCompile(`(?:<p[^>]?>|<div[^>]?>)$`)

	// startEndDiv describes <div> as the paragraph tag.
	startEndDiv = tagReStartEnd{
		startEndOfString: regexp.MustCompile(`<div[^>]*?>$`),
		endEndOfString:   regexp.MustCompile(`</div>$`),
		tagName:          "div",
	}

	// startEndP describes <p> as the paragraph tag.
	startEndP = tagReStartEnd{
		startEndOfString: regexp.MustCompile(`<p[^>]*?>$`),
		endEndOfString:   regexp.MustCompile(`</p>$`),
		tagName:          "p",
	}
)

// tagReStartEnd bundles what is needed to recognise one paragraph tag.
type tagReStartEnd struct {
	// startEndOfString matches the opening tag at the end of a string.
	startEndOfString *regexp.Regexp
	// endEndOfString matches the closing tag at the end of a string.
	endEndOfString *regexp.Regexp
	// tagName is the bare tag name, e.g. "p".
	tagName string
}
// expandSummaryDivider widens the divider range in s so that it also covers
// the paragraph markup around the divider, when the divider is the only
// content of its paragraph.
//
// re describes the paragraph tag for the current markup. The first return
// value is the (possibly expanded) divider range. The second is non-zero only
// when other content precedes the divider inside the same paragraph: it then
// spans from the end of the divider through the paragraph's closing tag (or
// to the end of s if none is found), and the divider range itself is only
// extended by a trailing newline, if any.
//
// A zero divider is returned unchanged.
func expandSummaryDivider(s string, re tagReStartEnd, divider types.LowHigh[string]) (types.LowHigh[string], types.LowHigh[string]) {
var endMarkup types.LowHigh[string]
if divider.IsZero() {
return divider, endMarkup
}
lo, hi := divider.Low, divider.High
// Set when non-space content is found between the paragraph start and the
// divider, i.e. the paragraph's closing markup must be kept.
var preserveEndMarkup bool
// Find the start of the paragraph.
for i := lo - 1; i >= 0; i-- {
if s[i] == '>' {
// The paragraph's opening tag ends here: the divider starts there.
if match := re.startEndOfString.FindString(s[:i+1]); match != "" {
lo = i - len(match) + 1
break
}
// Some other <p>/<div> tag: step back by its length and keep walking.
if match := pOrDiv.FindString(s[:i+1]); match != "" {
i -= len(match) - 1
continue
}
}
// Anything that is not white space means the divider shares its
// paragraph with other content.
r, _ := utf8.DecodeRuneInString(s[i:])
if !unicode.IsSpace(r) {
preserveEndMarkup = true
break
}
}
divider.Low = lo
// Now walk forward to the end of the paragraph.
for ; hi < len(s); hi++ {
if s[hi] != '>' {
continue
}
// Stop just past the paragraph's closing tag.
if match := re.endEndOfString.FindString(s[:hi+1]); match != "" {
hi++
break
}
}
if preserveEndMarkup {
// Keep the markup after the divider as a separate range.
endMarkup.Low = divider.High
endMarkup.High = hi
} else {
// The divider stands alone: swallow its closing markup as well.
divider.High = hi
}
// Consume trailing newline if any.
if divider.High < len(s) && s[divider.High] == '\n' {
divider.High++
}
return divider, endMarkup
}