1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00

fix(readability): do not remove elements within code blocks

An element such as `<span class="hljs-comment"># exit 1</span>` matches the `unlikelyCandidatesRegexp` because its class attribute contains the string `comment`, so it was being removed even though it sits inside a code block.
This commit is contained in:
Frédéric Guillot 2025-06-19 21:03:53 -07:00 committed by GitHub
parent 491d51c95f
commit 6d58052504
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 27 additions and 0 deletions

View file

@ -162,6 +162,11 @@ func removeUnlikelyCandidates(document *goquery.Document) {
return
}
// Don't remove elements within code blocks (pre or code tags)
if s.Closest("pre, code").Length() > 0 {
return
}
if class, ok := s.Attr("class"); ok {
if shouldRemove(class) {
s.Remove()

View file

@ -164,6 +164,28 @@ func TestRemoveBlacklist(t *testing.T) {
}
}
// TestNestedSpanInCodeBlock checks that ExtractContent keeps <span> elements
// nested inside a <pre><code> block, including one whose class name contains
// "comment" (hljs-comment).
func TestNestedSpanInCodeBlock(t *testing.T) {
	// Raw fixture: the leading newline and column-0 layout are part of the input.
	const input = `
<html>
<head>
<title>Test</title>
</head>
<body>
<article><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></article>
</body>
</html>`

	// The article is expected to come back wrapped in two <div> elements with
	// every span inside the code block left untouched.
	const expected = `<div><div><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></div></div>`

	_, got, err := ExtractContent(strings.NewReader(input))
	if err != nil {
		t.Fatal(err)
	}

	if got == expected {
		return
	}
	t.Errorf(`Invalid content, got %s instead of %s`, got, expected)
}
func BenchmarkExtractContent(b *testing.B) {
var testCases = map[string][]byte{
"miniflux_github.html": {},