From 34033e349aea4b7853e4aa8b77800816f659d62d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?= Date: Fri, 5 Apr 2024 10:46:27 +0200 Subject: [PATCH] Strip hash sign (#) from file paths/URLs The general way Hugo does this now is: * Sanitize the file paths so they work as URLs * When we create the final RelPermalink/Permalink, we use Go's `url.Parse` to escape it so it works for the browser. So, leaving anything in the first step that does not work with the second step, just doesn't work. It's a little bit odd that `url.Parse` silently truncates this URL without any error, but that's for another day. I have also added better test coverage for this. Fixes #12342 Fixes #4926 See #8232 --- common/paths/path.go | 2 +- common/paths/path_test.go | 22 +++++++++++++++++++--- helpers/path_test.go | 2 +- helpers/url.go | 16 ++-------------- hugolib/page_permalink_test.go | 22 ++++++++++++++++++++++ 5 files changed, 45 insertions(+), 19 deletions(-) diff --git a/common/paths/path.go b/common/paths/path.go index 906270cae..66a4b28c7 100644 --- a/common/paths/path.go +++ b/common/paths/path.go @@ -318,7 +318,7 @@ func isAllowedPathCharacter(s string, i int, r rune) bool { } // Check for the most likely first (faster). isAllowed := unicode.IsLetter(r) || unicode.IsDigit(r) - isAllowed = isAllowed || r == '.' || r == '/' || r == '\\' || r == '_' || r == '#' || r == '+' || r == '~' || r == '-' || r == '@' + isAllowed = isAllowed || r == '.' 
|| r == '/' || r == '\\' || r == '_' || r == '+' || r == '~' || r == '-' || r == '@' isAllowed = isAllowed || unicode.IsMark(r) isAllowed = isAllowed || (r == '%' && i+2 < len(s) && ishex(s[i+1]) && ishex(s[i+2])) return isAllowed diff --git a/common/paths/path_test.go b/common/paths/path_test.go index 3605bfc43..bf6892659 100644 --- a/common/paths/path_test.go +++ b/common/paths/path_test.go @@ -14,7 +14,10 @@ package paths import ( + "fmt" + "net/url" "path/filepath" + "strings" "testing" qt "github.com/frankban/quicktest" @@ -204,17 +207,30 @@ func TestSanitize(t *testing.T) { {"трям/трям", "трям/трям"}, {"은행", "은행"}, {"Банковский кассир", "Банковский-кассир"}, - // Issue #1488 - {"संस्कृत", "संस्कृत"}, - {"a%C3%B1ame", "a%C3%B1ame"}, // Issue #1292 + + {"संस्कृत", "संस्कृत"}, // Issue #1488 {"this+is+a+test", "this+is+a+test"}, // Issue #1290 {"~foo", "~foo"}, // Issue #2177 + // Issue #2342 + {"foo#bar", "foobar"}, + {"foo@bar", "foo@bar"}, } for _, test := range tests { c.Assert(Sanitize(test.input), qt.Equals, test.expected) + + // Make sure they survive the URL roundtrip, which makes sure that this works with URLs (e.g. in Permalink) + protocol := "https://" + urlString := fmt.Sprintf("%s%s", protocol, test.expected) + unescaped, err := url.PathUnescape(strings.TrimPrefix(URLEscape(urlString), protocol)) + c.Assert(err, qt.IsNil) + c.Assert(unescaped, qt.Equals, test.expected) + } + + // Some special cases. 
+ c.Assert(Sanitize("a%C3%B1ame"), qt.Equals, "a%C3%B1ame") // Issue #1292 } func BenchmarkSanitize(b *testing.B) { diff --git a/helpers/path_test.go b/helpers/path_test.go index 6f3699589..3257d5abd 100644 --- a/helpers/path_test.go +++ b/helpers/path_test.go @@ -35,7 +35,7 @@ func TestMakePath(t *testing.T) { expected string removeAccents bool }{ - {"dot.slash/backslash\\underscore_pound#plus+hyphen-", "dot.slash/backslash\\underscore_pound#plus+hyphen-", true}, + {"dot.slash/backslash\\underscore_pound#plus+hyphen-", "dot.slash/backslash\\underscore_poundplus+hyphen-", true}, {"abcXYZ0123456789", "abcXYZ0123456789", true}, {"%20 %2", "%20-2", true}, {"foo- bar", "foo-bar", true}, diff --git a/helpers/url.go b/helpers/url.go index d5a613029..bdf73d983 100644 --- a/helpers/url.go +++ b/helpers/url.go @@ -28,25 +28,13 @@ import ( // uri: Vim (text editor) // urlize: vim-text-editor func (p *PathSpec) URLize(uri string) string { - return p.URLEscape(p.MakePathSanitized(uri)) + return paths.URLEscape(p.MakePathSanitized(uri)) } // URLizeFilename creates an URL from a filename by escaping unicode letters // and turn any filepath separator into forward slashes. func (p *PathSpec) URLizeFilename(filename string) string { - return p.URLEscape(filepath.ToSlash(filename)) -} - -// URLEscape escapes unicode letters. -func (p *PathSpec) URLEscape(uri string) string { - // escape unicode letters - parsedURI, err := url.Parse(uri) - if err != nil { - // if net/url can not parse URL it means Sanitize works incorrectly - panic(err) - } - x := parsedURI.String() - return x + return paths.URLEscape(filepath.ToSlash(filename)) } // AbsURL creates an absolute URL from the relative path given and the BaseURL set in config. diff --git a/hugolib/page_permalink_test.go b/hugolib/page_permalink_test.go index bc89638d3..97fb1990e 100644 --- a/hugolib/page_permalink_test.go +++ b/hugolib/page_permalink_test.go @@ -157,3 +157,25 @@ Some content. 
b.AssertFileContent("public/myblog/p2/index.html", "Single: A page|Hello|en|RelPermalink: /myblog/p2/|Permalink: https://example.com/myblog/p2/|") b.AssertFileContent("public/myblog/p3/index.html", "Single: A page|Hello|en|RelPermalink: /myblog/p3/|Permalink: https://example.com/myblog/p3/|") } + +func TestPermalinkHashInSlugIssue12342(t *testing.T) { + files := ` +-- hugo.toml -- +disableKind = ["taxonomy", "term", "section"] +baseURL = "https://example.com/" +[permalinks] +posts = "/posts/:year/:month/:slug/" +-- content/posts/p1.md -- +--- +title: 'Newsletter #4' +date: 2024-04-04T12:27:52-07:00 +--- +Foo +-- layouts/_default/single.html -- +{{ .Title }}|{{ .RelPermalink }}|{{ .Permalink }}|$ +` + + b := Test(t, files) + + b.AssertFileContent("public/posts/2024/04/newsletter-4/index.html", "Newsletter #4|/posts/2024/04/newsletter-4/|https://example.com/posts/2024/04/newsletter-4/|$") +}