hugo/resources/page/page_markup_test.go

// Copyright 2024 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package page

import (
	"strings"
	"testing"

	qt "github.com/frankban/quicktest"
	"github.com/gohugoio/hugo/common/types"
	"github.com/gohugoio/hugo/media"
)

// TestExtractSummaryFromHTML runs ExtractSummaryFromHTML over rendered HTML
// for several media types and checks both the extracted summary and the
// content that remains once the summary has been removed.
func TestExtractSummaryFromHTML(t *testing.T) {
	c := qt.New(t)

	cases := []struct {
		mediaType   media.Type
		html        string
		cjk         bool
		wordLimit   int
		wantSummary string
		wantRest    string
	}{
		{
			mediaType:   media.Builtin.ReStructuredTextType,
			html:        "<div class=\"document\">\n\n\n<p>Simple Page</p>\n</div>",
			wordLimit:   70,
			wantSummary: "<div class=\"document\">\n\n\n<p>Simple Page</p>\n</div>",
			wantRest:    "",
		},
		{
			mediaType:   media.Builtin.ReStructuredTextType,
			html:        "<div class=\"document\"><p>First paragraph</p><p>Second paragraph</p></div>",
			wordLimit:   2,
			wantSummary: `<div class="document"><p>First paragraph</p></div>`,
			wantRest:    "<div class=\"document\"><p>Second paragraph</p></div>",
		},
		{
			mediaType:   media.Builtin.MarkdownType,
			html:        "<p>First paragraph</p>",
			wordLimit:   10,
			wantSummary: "<p>First paragraph</p>",
			wantRest:    "",
		},
		{
			mediaType:   media.Builtin.MarkdownType,
			html:        "<p>First paragraph</p><p>Second paragraph</p>",
			wordLimit:   2,
			wantSummary: "<p>First paragraph</p>",
			wantRest:    "<p>Second paragraph</p>",
		},
		{
			mediaType:   media.Builtin.MarkdownType,
			html:        "<p>First paragraph</p><p>Second paragraph</p><p>Third paragraph</p>",
			wordLimit:   3,
			wantSummary: "<p>First paragraph</p><p>Second paragraph</p>",
			wantRest:    "<p>Third paragraph</p>",
		},
		{
			mediaType:   media.Builtin.AsciiDocType,
			html:        "<div><p>First paragraph</p></div><div><p>Second paragraph</p></div>",
			wordLimit:   2,
			wantSummary: "<div><p>First paragraph</p></div>",
			wantRest:    "<div><p>Second paragraph</p></div>",
		},
		{
			mediaType:   media.Builtin.MarkdownType,
			html:        "<p>这是中文，全中文</p><p>a这是中文，全中文</p>",
			cjk:         true,
			wordLimit:   5,
			wantSummary: "<p>这是中文，全中文</p>",
			wantRest:    "<p>a这是中文，全中文</p>",
		},
	}

	for idx := range cases {
		tc := cases[idx]
		got := ExtractSummaryFromHTML(tc.mediaType, tc.html, tc.wordLimit, tc.cjk)

		// The case index is part of the failure comment so a failing row can
		// be located in the table above.
		c.Assert(got.Summary(), qt.Equals, tc.wantSummary, qt.Commentf("Summary %d", idx))
		c.Assert(got.ContentWithoutSummary(), qt.Equals, tc.wantRest, qt.Commentf("ContentWithoutSummary %d", idx))
	}
}

// TestExtractSummaryFromHTMLLotsOfHTMLInSummary feeds ExtractSummaryFromHTML a
// document that opens with a markup-heavy block (five <picture> elements)
// followed by three short text paragraphs, and asserts that a 10-word summary
// ends with the second text paragraph.
//
// See https://discourse.gohugo.io/t/automatic-summarys-summarylength-seems-broken-in-the-case-of-plainify/51466/4
// Also issue 12837
func TestExtractSummaryFromHTMLLotsOfHTMLInSummary(t *testing.T) {
	c := qt.New(t)

	// Only the tail of the summary is pinned; the leading picture markup is
	// not asserted on.
	const wantSuffix = "<p>\nThis is a story about a cat.\n</p>\n<p>\nThe cat was white and fluffy.\n</p>"

	html := `
<p>
<div>
  <picture>
    <img src="imgs/1.jpg" alt="1"/>
  </picture>
  <picture>
    <img src="imgs/2.jpg" alt="2"/>
  </picture>
  <picture>
    <img src="imgs/3.jpg" alt="3"/>
  </picture>
  <picture>
    <img src="imgs/4.jpg" alt="4"/>
  </picture>
  <picture>
    <img src="imgs/5.jpg" alt="5"/>
  </picture>
</div>
</p>
<p>
This is a story about a cat.
</p>
<p>
The cat was white and fluffy.
</p>
<p>
And it liked milk.
</p>
`

	got := ExtractSummaryFromHTML(media.Builtin.MarkdownType, html, 10, false).Summary()
	c.Assert(strings.HasSuffix(got, wantSuffix), qt.IsTrue)
}

// TestExtractSummaryFromHTMLWithDivider checks ExtractSummaryFromHTMLWithDivider
// against rendered HTML containing the divider token "FOOO". For every case it
// verifies the summary, the content without the summary, and the full content
// (which is expected to no longer contain the divider).
func TestExtractSummaryFromHTMLWithDivider(t *testing.T) {
	c := qt.New(t)

	const divider = "FOOO"

	cases := []struct {
		mediaType   media.Type
		html        string
		wantSummary string
		wantRest    string
		wantContent string
	}{
		// Markdown.
		{
			mediaType:   media.Builtin.MarkdownType,
			html:        "<p>First paragraph</p><p>FOOO</p><p>Second paragraph</p>",
			wantSummary: "<p>First paragraph</p>",
			wantRest:    "<p>Second paragraph</p>",
			wantContent: "<p>First paragraph</p><p>Second paragraph</p>",
		},
		{
			mediaType:   media.Builtin.MarkdownType,
			html:        "<p>First paragraph</p>\n<p>FOOO</p>\n<p>Second paragraph</p>",
			wantSummary: "<p>First paragraph</p>",
			wantRest:    "<p>Second paragraph</p>",
			wantContent: "<p>First paragraph</p>\n<p>Second paragraph</p>",
		},
		{
			mediaType:   media.Builtin.MarkdownType,
			html:        "<p>FOOO</p>\n<p>First paragraph</p>",
			wantSummary: "",
			wantRest:    "<p>First paragraph</p>",
			wantContent: "<p>First paragraph</p>",
		},
		{
			mediaType:   media.Builtin.MarkdownType,
			html:        "<p>First paragraph</p><p>Second paragraphFOOO</p><p>Third paragraph</p>",
			wantSummary: "<p>First paragraph</p><p>Second paragraph</p>",
			wantRest:    "<p>Third paragraph</p>",
			wantContent: "<p>First paragraph</p><p>Second paragraph</p><p>Third paragraph</p>",
		},
		{
			mediaType:   media.Builtin.MarkdownType,
			html:        "<p>这是中文，全中文FOOO</p><p>a这是中文，全中文</p>",
			wantSummary: "<p>这是中文，全中文</p>",
			wantRest:    "<p>a这是中文，全中文</p>",
			wantContent: "<p>这是中文，全中文</p><p>a这是中文，全中文</p>",
		},
		{
			mediaType:   media.Builtin.MarkdownType,
			html:        `<p>a <strong>b</strong>` + "\v" + ` c</p>` + "\n<p>FOOO</p>",
			wantSummary: "<p>a <strong>b</strong>\v c</p>",
			wantRest:    "",
			wantContent: "<p>a <strong>b</strong>\v c</p>",
		},

		// Plain HTML.
		{
			mediaType:   media.Builtin.HTMLType,
			html:        "<p>First paragraph</p>FOOO<p>Second paragraph</p>",
			wantSummary: "<p>First paragraph</p>",
			wantRest:    "<p>Second paragraph</p>",
			wantContent: "<p>First paragraph</p><p>Second paragraph</p>",
		},

		// reStructuredText.
		{
			mediaType:   media.Builtin.ReStructuredTextType,
			html:        "<div class=\"document\">\n\n\n<p>This is summary.</p>\n<p>FOOO</p>\n<p>This is content.</p>\n</div>",
			wantSummary: "<div class=\"document\">\n\n\n<p>This is summary.</p>\n</div>",
			wantRest:    "<div class=\"document\"><p>This is content.</p>\n</div>",
			wantContent: "<div class=\"document\">\n\n\n<p>This is summary.</p>\n<p>This is content.</p>\n</div>",
		},
		{
			mediaType:   media.Builtin.ReStructuredTextType,
			html:        "<div class=\"document\"><p>First paragraphFOOO</p><p>Second paragraph</p></div>",
			wantSummary: "<div class=\"document\"><p>First paragraph</p></div>",
			wantRest:    "<div class=\"document\"><p>Second paragraph</p></div>",
			wantContent: `<div class="document"><p>First paragraph</p><p>Second paragraph</p></div>`,
		},

		// AsciiDoc.
		{
			mediaType:   media.Builtin.AsciiDocType,
			html:        "<div class=\"paragraph\"><p>Summary Next Line</p></div><div class=\"paragraph\"><p>FOOO</p></div><div class=\"paragraph\"><p>Some more text</p></div>",
			wantSummary: "<div class=\"paragraph\"><p>Summary Next Line</p></div>",
			wantRest:    "<div class=\"paragraph\"><p>Some more text</p></div>",
			wantContent: "<div class=\"paragraph\"><p>Summary Next Line</p></div><div class=\"paragraph\"><p>Some more text</p></div>",
		},
		{
			mediaType:   media.Builtin.AsciiDocType,
			html:        "<div class=\"paragraph\">\n<p>Summary Next Line</p>\n</div>\n<div class=\"paragraph\">\n<p>FOOO</p>\n</div>\n<div class=\"paragraph\">\n<p>Some more text</p>\n</div>\n",
			wantSummary: "<div class=\"paragraph\">\n<p>Summary Next Line</p>\n</div>",
			wantRest:    "<div class=\"paragraph\">\n<p>Some more text</p>\n</div>",
			wantContent: "<div class=\"paragraph\">\n<p>Summary Next Line</p>\n</div>\n<div class=\"paragraph\">\n<p>Some more text</p>\n</div>",
		},
		{
			mediaType:   media.Builtin.AsciiDocType,
			html:        "<div><p>FOOO</p></div><div><p>First paragraph</p></div>",
			wantSummary: "",
			wantRest:    "<div><p>First paragraph</p></div>",
			wantContent: "<div><p>First paragraph</p></div>",
		},
		{
			mediaType:   media.Builtin.AsciiDocType,
			html:        "<div><p>First paragraphFOOO</p></div><div><p>Second paragraph</p></div>",
			wantSummary: "<div><p>First paragraph</p></div>",
			wantRest:    "<div><p>Second paragraph</p></div>",
			wantContent: "<div><p>First paragraph</p></div><div><p>Second paragraph</p></div>",
		},
	}

	for idx := range cases {
		tc := cases[idx]
		got := ExtractSummaryFromHTMLWithDivider(tc.mediaType, tc.html, divider)

		c.Assert(got.Summary(), qt.Equals, tc.wantSummary, qt.Commentf("Summary %d", idx))
		c.Assert(got.ContentWithoutSummary(), qt.Equals, tc.wantRest, qt.Commentf("ContentWithoutSummary %d", idx))
		c.Assert(got.Content(), qt.Equals, tc.wantContent, qt.Commentf("Content %d", idx))
	}
}

// TestExpandDivider checks expandSummaryDivider. Each case locates the divider
// in the input, passes that byte range together with the wrapping tag matcher
// to expandSummaryDivider, and compares the two returned ranges (sliced out of
// the input) with the expected expanded divider and the expected end markup.
func TestExpandDivider(t *testing.T) {
	c := qt.New(t)

	for i, test := range []struct {
		input           string
		divider         string
		ptag            tagReStartEnd
		expect          string
		expectEndMarkup string
	}{
		{"<p>First paragraph</p>\n<p>FOOO</p>\n<p>Second paragraph</p>", "FOOO", startEndP, "<p>FOOO</p>\n", ""},
		{"<div class=\"paragraph\">\n<p>FOOO</p>\n</div>", "FOOO", startEndDiv, "<div class=\"paragraph\">\n<p>FOOO</p>\n</div>", ""},
		{"<div><p>FOOO</p></div><div><p>Second paragraph</p></div>", "FOOO", startEndDiv, "<div><p>FOOO</p></div>", ""},
		{"<div><p>First paragraphFOOO</p></div><div><p>Second paragraph</p></div>", "FOOO", startEndDiv, "FOOO", "</p></div>"},
		{"   <p> abc FOOO  </p>  ", "FOOO", startEndP, "FOOO", "  </p>"},
		{"   <p>  FOOO  </p>  ", "FOOO", startEndP, "<p>  FOOO  </p>", ""},
		{"   <p>\n  \nFOOO  </p>  ", "FOOO", startEndP, "<p>\n  \nFOOO  </p>", ""},
		{"   <div>  FOOO  </div>  ", "FOOO", startEndDiv, "<div>  FOOO  </div>", ""},
	} {
		// Look the divider up once and fail with a clear message if a fixture
		// does not contain it, instead of slicing with a negative offset.
		low := strings.Index(test.input, test.divider)
		c.Assert(low, qt.Not(qt.Equals), -1, qt.Commentf("[%d] divider %q not found in %q", i, test.divider, test.input))

		l := types.LowHigh[string]{Low: low, High: low + len(test.divider)}

		// Named so that the *testing.T parameter t is not shadowed.
		expanded, endMarkup := expandSummaryDivider(test.input, test.ptag, l)
		c.Assert(test.input[expanded.Low:expanded.High], qt.Equals, test.expect, qt.Commentf("[%d] Test.expect %q", i, test.input))
		c.Assert(test.input[endMarkup.Low:endMarkup.High], qt.Equals, test.expectEndMarkup, qt.Commentf("[%d] Test.expectEndMarkup %q", i, test.input))
	}
}

// TestIsProbablyHTMLToken checks isProbablyHTMLToken against a small table of
// tokens: complete and partial tags, quoted attribute assignments, and a tag
// immediately followed by non-ASCII text.
func TestIsProbablyHTMLToken(t *testing.T) {
	c := qt.New(t)

	cases := []struct {
		token string
		want  bool
	}{
		{token: "<p>", want: true},
		{token: "<p", want: true},
		{token: "width=\"32\"", want: true},
		{token: "width='32'", want: true},
		{token: "<p>Æøå", want: false},
	}

	for idx, tc := range cases {
		got := isProbablyHTMLToken(tc.token)
		c.Assert(got, qt.Equals, tc.want, qt.Commentf("[%d] Test.expect %q", idx, tc.token))
	}
}

// BenchmarkSummaryFromHTML times ExtractSummaryFromHTML on a two-paragraph
// Markdown-rendered document with a two-word limit. Every iteration also
// validates the three result accessors so a wrong result aborts the run.
func BenchmarkSummaryFromHTML(b *testing.B) {
	const (
		wantSummary = "<p>First paragraph</p>"
		wantRest    = "<p>Second paragraph</p>"
	)

	b.StopTimer()
	html := "<p>First paragraph</p><p>Second paragraph</p>"
	b.StartTimer()

	for iter := 0; iter < b.N; iter++ {
		res := ExtractSummaryFromHTML(media.Builtin.MarkdownType, html, 2, false)

		// The full content is expected to equal the unmodified input.
		content := res.Content()
		if content != html {
			b.Fatalf("unexpected content: %q", content)
		}

		rest := res.ContentWithoutSummary()
		if rest != wantRest {
			b.Fatalf("unexpected content without summary: %q", rest)
		}

		head := res.Summary()
		if head != wantSummary {
			b.Fatalf("unexpected summary: %q", head)
		}
	}
}

// BenchmarkSummaryFromHTMLWithDivider times ExtractSummaryFromHTMLWithDivider
// on a Markdown-rendered document whose middle paragraph is the divider
// "FOOO". Every iteration also validates the three result accessors so a wrong
// result aborts the run.
func BenchmarkSummaryFromHTMLWithDivider(b *testing.B) {
	const (
		wantContent = "<p>First paragraph</p><p>Second paragraph</p>"
		wantSummary = "<p>First paragraph</p>"
		wantRest    = "<p>Second paragraph</p>"
	)

	b.StopTimer()
	html := "<p>First paragraph</p><p>FOOO</p><p>Second paragraph</p>"
	b.StartTimer()

	for iter := 0; iter < b.N; iter++ {
		res := ExtractSummaryFromHTMLWithDivider(media.Builtin.MarkdownType, html, "FOOO")

		// The full content is expected to have the divider paragraph removed.
		content := res.Content()
		if content != wantContent {
			b.Fatalf("unexpected content: %q", content)
		}

		rest := res.ContentWithoutSummary()
		if rest != wantRest {
			b.Fatalf("unexpected content without summary: %q", rest)
		}

		head := res.Summary()
		if head != wantSummary {
			b.Fatalf("unexpected summary: %q", head)
		}
	}
}