mirror of
https://github.com/miniflux/v2.git
synced 2025-06-27 16:36:00 +00:00
fix(readability): do not remove elements within code blocks
For example, `<span class="hljs-comment"># exit 1</span>` is matched by `unlikelyCandidatesRegexp` because its class name contains the string `comment`, so the span was being removed even though it sits inside a code block.
This commit is contained in:
parent
491d51c95f
commit
6d58052504
2 changed files with 27 additions and 0 deletions
|
@ -162,6 +162,11 @@ func removeUnlikelyCandidates(document *goquery.Document) {
|
|||
return
|
||||
}
|
||||
|
||||
// Don't remove elements within code blocks (pre or code tags)
|
||||
if s.Closest("pre, code").Length() > 0 {
|
||||
return
|
||||
}
|
||||
|
||||
if class, ok := s.Attr("class"); ok {
|
||||
if shouldRemove(class) {
|
||||
s.Remove()
|
||||
|
|
|
@ -164,6 +164,28 @@ func TestRemoveBlacklist(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// TestNestedSpanInCodeBlock is a regression test for spans nested inside
// <pre><code> blocks. It runs ExtractContent on a small HTML document whose
// code block contains syntax-highlighting spans (classes "hljs-built_in" and
// "hljs-comment") and checks that both spans are still present, unchanged, in
// the extracted content.
func TestNestedSpanInCodeBlock(t *testing.T) {
|
||||
html := `
|
||||
<html>
|
||||
<head>
|
||||
<title>Test</title>
|
||||
</head>
|
||||
<body>
|
||||
<article><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></article>
|
||||
</body>
|
||||
</html>`
|
||||
// Expected output: the <article> content wrapped in two <div> elements, with
// the <pre><code> block and its nested spans kept verbatim.
want := `<div><div><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></div></div>`
|
||||
|
||||
// The first return value of ExtractContent is not needed by this test and is
// discarded.
_, result, err := ExtractContent(strings.NewReader(html))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Exact string comparison: any dropped or altered span fails the test.
if result != want {
|
||||
t.Errorf(`Invalid content, got %s instead of %s`, result, want)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkExtractContent(b *testing.B) {
|
||||
var testCases = map[string][]byte{
|
||||
"miniflux_github.html": {},
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue