diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go
index 708af27e..952e69d2 100644
--- a/internal/reader/readability/readability_test.go
+++ b/internal/reader/readability/readability_test.go
@@ -362,3 +362,283 @@ func TestGetClassWeightRegexPatterns(t *testing.T) {
})
}
}
+
+// TestRemoveUnlikelyCandidates parses each fixture, runs
+// removeUnlikelyCandidates on the resulting document, and compares the
+// serialized document against the expected string. Leading and trailing
+// whitespace is trimmed on both sides before comparing.
+//
+// NOTE(review): the html/expected fixtures in this copy contain no tags, and
+// a few are empty strings. They look like they lost their markup somewhere
+// along the way; confirm against the upstream file before relying on them.
+func TestRemoveUnlikelyCandidates(t *testing.T) {
+	cases := []struct {
+		name     string
+		html     string
+		expected string
+	}{
+		{
+			name: "removes elements with popupbody class",
+			html: `
popup content
good content
`,
+			expected: `good content
`,
+		},
+		{
+			name: "removes elements with -ad in class",
+			html: `ad content
good content
`,
+			expected: `good content
`,
+		},
+		{
+			name: "removes elements with g-plus in class",
+			html: `social content
good content
`,
+			expected: `good content
`,
+		},
+		{
+			name: "removes elements with unlikely candidates in class",
+			html: `banner
good content
`,
+			expected: `good content
`,
+		},
+		{
+			name: "preserves elements with unlikely candidates but also good candidates in class",
+			html: `mixed content
good content
`,
+			expected: `mixed content
good content
`,
+		},
+		{
+			name: "removes elements with unlikely candidates in id",
+			html: `banner
good content
`,
+			expected: `good content
`,
+		},
+		{
+			name: "preserves elements with unlikely candidates but also good candidates in id",
+			html: `mixed content
good content
`,
+			expected: `mixed content
good content
`,
+		},
+		{
+			name: "preserves html and body tags",
+			html: `content
`,
+			expected: ``,
+		},
+		{
+			name: "preserves elements within code blocks",
+			html: `code content
remove this
`,
+			expected: `code content
`,
+		},
+		{
+			name: "preserves elements within pre tags",
+			html: ``,
+			expected: ``,
+		},
+		{
+			name: "case insensitive matching",
+			html: `uppercase banner
mixed case banner
good content
`,
+			expected: `good content
`,
+		},
+		{
+			name: "multiple unlikely patterns in single class",
+			html: `good content
`,
+			expected: `good content
`,
+		},
+		{
+			name: "elements without class or id are preserved",
+			html: `no attributes
paragraph
`,
+			expected: `no attributes
paragraph
`,
+		},
+		{
+			name: "removes nested unlikely elements",
+			html: `nested banner
good content
`,
+			expected: ``,
+		},
+		{
+			name: "comprehensive unlikely candidates test",
+			html: `breadcrumbs
cover-wrap
legends
modal
related
replies
skyscraper
social
supplemental
yom-remote
good content
`,
+			expected: `good content
`,
+		},
+		{
+			name: "preserves good candidates that contain unlikely words",
+			html: `should be preserved
should be preserved
should be preserved
should be preserved
should be preserved
should be preserved
should be removed
`,
+			expected: `should be preserved
should be preserved
should be preserved
should be preserved
should be preserved
should be preserved
`,
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			doc, parseErr := goquery.NewDocumentFromReader(strings.NewReader(tt.html))
+			if parseErr != nil {
+				t.Fatalf("Failed to parse HTML: %v", parseErr)
+			}
+
+			removeUnlikelyCandidates(doc)
+
+			rendered, renderErr := doc.Html()
+			if renderErr != nil {
+				t.Fatalf("Failed to get HTML: %v", renderErr)
+			}
+
+			// Surrounding whitespace is not significant for the comparison.
+			want := strings.TrimSpace(tt.expected)
+			if got := strings.TrimSpace(rendered); got != want {
+				t.Errorf("\nExpected:\n%s\n\nGot:\n%s", want, got)
+			}
+		})
+	}
+}
+
+// TestRemoveUnlikelyCandidatesShouldRemoveFunction checks the removal decision
+// for a single class or id value. Each case builds a document containing one
+// <div> that carries the attribute under test, runs removeUnlikelyCandidates,
+// and reports the element as removed when the number of <div> elements drops.
+func TestRemoveUnlikelyCandidatesShouldRemoveFunction(t *testing.T) {
+	// Test the internal shouldRemove function behavior indirectly, by calling
+	// removeUnlikelyCandidates on a minimal document.
+	testCases := []struct {
+		name     string
+		attr     string
+		attrType string // "class" or "id"
+		expected bool   // true if should be removed
+	}{
+		// Special hardcoded cases
+		{"popupbody in class", "popupbody", "class", true},
+		{"contains popupbody in class", "main-popupbody-content", "class", true},
+		{"ad suffix in class", "super-ad", "class", true},
+		{"ad in middle of class", "pre-ad-post", "class", true},
+		{"g-plus in class", "g-plus-share", "class", true},
+		{"contains g-plus in class", "social-g-plus-button", "class", true},
+
+		// Unlikely candidates regexp
+		{"banner class", "banner", "class", true},
+		{"breadcrumbs class", "breadcrumbs", "class", true},
+		{"comment class", "comment", "class", true},
+		{"sidebar class", "sidebar", "class", true},
+		{"footer class", "footer", "class", true},
+
+		// Unlikely candidates with good candidates (should not be removed)
+		{"banner with article", "banner article", "class", false},
+		{"comment with main", "comment main", "class", false},
+		{"sidebar with body", "sidebar body", "class", false},
+		{"footer with column", "footer column", "class", false},
+		{"menu with shadow", "menu shadow", "class", false},
+
+		// Case insensitive
+		{"uppercase banner", "BANNER", "class", true},
+		{"mixed case comment", "Comment", "class", true},
+		{"uppercase with good", "BANNER ARTICLE", "class", false},
+
+		// ID attributes
+		{"banner id", "banner", "id", true},
+		{"comment id", "comment", "id", true},
+		{"banner with article id", "banner article", "id", false},
+
+		// Good candidates only
+		{"article class", "article", "class", false},
+		{"main class", "main", "class", false},
+		{"content class", "content", "class", false},
+		{"body class", "body", "class", false},
+
+		// No matches
+		{"random class", "random-class", "class", false},
+		{"normal content", "normal-content", "class", false},
+		{"empty string", "", "class", false},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			// The attribute name and value must be part of the markup;
+			// otherwise every case would run against the same document and
+			// the table would test nothing.
+			var html string
+			switch tc.attrType {
+			case "class", "id":
+				html = `<html><body><div ` + tc.attrType + `="` + tc.attr + `">content</div></body></html>`
+			default:
+				t.Fatalf("unsupported attrType %q", tc.attrType)
+			}
+
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+			if err != nil {
+				t.Fatalf("Failed to parse HTML: %v", err)
+			}
+
+			// Count elements before removal
+			beforeCount := doc.Find("div").Length()
+			if beforeCount != 1 {
+				// Without exactly one div the before/after comparison below
+				// cannot tell whether the element under test was removed.
+				t.Fatalf("fixture should contain exactly one div, found %d", beforeCount)
+			}
+
+			removeUnlikelyCandidates(doc)
+
+			// Count elements after removal
+			afterCount := doc.Find("div").Length()
+
+			wasRemoved := beforeCount > afterCount
+
+			if wasRemoved != tc.expected {
+				t.Errorf("Expected element to be removed: %v, but was removed: %v", tc.expected, wasRemoved)
+			}
+		})
+	}
+}
+
+// TestRemoveUnlikelyCandidatesPreservation verifies that running
+// removeUnlikelyCandidates does not change the number of html, body, pre and
+// code elements in a document. For cases flagged with checkNested it also
+// verifies that at least one span or div is still found inside a pre or code
+// element afterwards.
+//
+// NOTE(review): the html fixtures in this copy contain no tags (some are empty
+// or a bare newline). They look like they lost their markup; confirm against
+// the upstream file, since the checkNested cases cannot pass without it.
+func TestRemoveUnlikelyCandidatesPreservation(t *testing.T) {
+	testCases := []struct {
+		name        string
+		html        string
+		description string
+		// checkNested enables the extra assertion on elements nested in
+		// pre/code. It replaces matching on the case name, which would stop
+		// running the assertion if a case were renamed.
+		checkNested bool
+	}{
+		{
+			name: "preserves html tag",
+			html: `content
`,
+			description: "HTML tag should never be removed regardless of class",
+		},
+		{
+			name: "preserves body tag",
+			html: `content
`,
+			description: "Body tag should never be removed regardless of class",
+		},
+		{
+			name: "preserves elements in pre tags",
+			html: `code
`,
+			description: "Elements within pre tags should be preserved",
+			checkNested: true,
+		},
+		{
+			name: "preserves elements in code tags",
+			html: `
`,
+			description: "Elements within code tags should be preserved",
+			checkNested: true,
+		},
+		{
+			name: "preserves nested elements in code blocks",
+			html: `
`,
+			description: "Deeply nested elements in code blocks should be preserved",
+			checkNested: true,
+		},
+		{
+			name: "preserves elements in mixed code scenarios",
+			html: ``,
+			description: "Multiple code block scenarios should work correctly",
+		},
+	}
+
+	// Elements whose count must be unchanged, paired with the label used in
+	// the failure message.
+	preserved := []struct{ selector, label string }{
+		{"html", "HTML"},
+		{"body", "Body"},
+		{"pre", "Pre"},
+		{"code", "Code"},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
+			if err != nil {
+				t.Fatalf("Failed to parse HTML: %v", err)
+			}
+
+			// Count specific elements before removal
+			before := make([]int, len(preserved))
+			for i, p := range preserved {
+				before[i] = doc.Find(p.selector).Length()
+			}
+
+			removeUnlikelyCandidates(doc)
+
+			// These elements should always be preserved
+			for i, p := range preserved {
+				if after := doc.Find(p.selector).Length(); before[i] != after {
+					t.Errorf("%s elements were removed: before=%d, after=%d", p.label, before[i], after)
+				}
+			}
+
+			// Verify that elements within code blocks are preserved
+			if tc.checkNested {
+				nested := doc.Find("pre span, code span, pre div, code div").Length()
+				if nested == 0 {
+					t.Error("Elements within code blocks were incorrectly removed")
+				}
+			}
+
+			// Surface the intent of the case next to any failure above.
+			if t.Failed() {
+				t.Logf("Case intent: %s", tc.description)
+			}
+		})
+	}
+}