diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go index f3d2e976..bad0424c 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -162,6 +162,11 @@ func removeUnlikelyCandidates(document *goquery.Document) { return } + // Don't remove elements within code blocks (pre or code tags) + if s.Closest("pre, code").Length() > 0 { + return + } + if class, ok := s.Attr("class"); ok { if shouldRemove(class) { s.Remove() diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go index e6deb889..ddb11afe 100644 --- a/internal/reader/readability/readability_test.go +++ b/internal/reader/readability/readability_test.go @@ -164,6 +164,28 @@ func TestRemoveBlacklist(t *testing.T) { } } +func TestNestedSpanInCodeBlock(t *testing.T) { + html := ` + + + Test + + +

Some content

Code block with nested span # exit 1
+ + ` + want := `

Some content

Code block with nested span # exit 1
` + + _, result, err := ExtractContent(strings.NewReader(html)) + if err != nil { + t.Fatal(err) + } + + if result != want { + t.Errorf(`Invalid content, got %s instead of %s`, result, want) + } +} + func BenchmarkExtractContent(b *testing.B) { var testCases = map[string][]byte{ "miniflux_github.html": {},