diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go index 708af27e..952e69d2 100644 --- a/internal/reader/readability/readability_test.go +++ b/internal/reader/readability/readability_test.go @@ -362,3 +362,283 @@ func TestGetClassWeightRegexPatterns(t *testing.T) { }) } } + +func TestRemoveUnlikelyCandidates(t *testing.T) { + testCases := []struct { + name string + html string + expected string + }{ + { + name: "removes elements with popupbody class", + html: `
popup content
good content
`, + expected: `
good content
`, + }, + { + name: "removes elements with -ad in class", + html: `
ad content
good content
`, + expected: `
good content
`, + }, + { + name: "removes elements with g-plus in class", + html: `
social content
good content
`, + expected: `
good content
`, + }, + { + name: "removes elements with unlikely candidates in class", + html: `
good content
`, + expected: `
good content
`, + }, + { + name: "preserves elements with unlikely candidates but also good candidates in class", + html: `
good content
`, + expected: `
good content
`, + }, + { + name: "removes elements with unlikely candidates in id", + html: `
good content
`, + expected: `
good content
`, + }, + { + name: "preserves elements with unlikely candidates but also good candidates in id", + html: `
mixed content
good content
`, + expected: `
mixed content
good content
`, + }, + { + name: "preserves html and body tags", + html: ``, + expected: ``, + }, + { + name: "preserves elements within code blocks", + html: `
`, + expected: `
`, + }, + { + name: "preserves elements within pre tags", + html: `
`, + expected: `
`, + }, + { + name: "case insensitive matching", + html: `
good content
`, + expected: `
good content
`, + }, + { + name: "multiple unlikely patterns in single class", + html: `
good content
`, + expected: `
good content
`, + }, + { + name: "elements without class or id are preserved", + html: `
no attributes

paragraph

`, + expected: `
no attributes

paragraph

`, + }, + { + name: "removes nested unlikely elements", + html: `

good content

`, + expected: `

good content

`, + }, + { + name: "comprehensive unlikely candidates test", + html: `
combx
comment
community
cover-wrap
disqus
extra
foot
header
legends
remark
replies
rss
shoutbox
skyscraper
social
supplemental
ad-break
agegate
pager
yom-remote
good content
`, + expected: `
good content
`, + }, + { + name: "preserves good candidates that contain unlikely words", + html: `
should be preserved
should be preserved
should be removed
`, + expected: `
should be preserved
should be preserved
`, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html)) + if err != nil { + t.Fatalf("Failed to parse HTML: %v", err) + } + + removeUnlikelyCandidates(doc) + + result, err := doc.Html() + if err != nil { + t.Fatalf("Failed to get HTML: %v", err) + } + + // Normalize whitespace for comparison + result = strings.TrimSpace(result) + expected := strings.TrimSpace(tc.expected) + + if result != expected { + t.Errorf("\nExpected:\n%s\n\nGot:\n%s", expected, result) + } + }) + } +} + +func TestRemoveUnlikelyCandidatesShouldRemoveFunction(t *testing.T) { + // Test the internal shouldRemove function behavior through the public interface + testCases := []struct { + name string + attr string + attrType string // "class" or "id" + expected bool // true if should be removed + }{ + // Special hardcoded cases + {"popupbody in class", "popupbody", "class", true}, + {"contains popupbody in class", "main-popupbody-content", "class", true}, + {"ad suffix in class", "super-ad", "class", true}, + {"ad in middle of class", "pre-ad-post", "class", true}, + {"g-plus in class", "g-plus-share", "class", true}, + {"contains g-plus in class", "social-g-plus-button", "class", true}, + + // Unlikely candidates regexp + {"banner class", "banner", "class", true}, + {"breadcrumbs class", "breadcrumbs", "class", true}, + {"comment class", "comment", "class", true}, + {"sidebar class", "sidebar", "class", true}, + {"footer class", "footer", "class", true}, + + // Unlikely candidates with good candidates (should not be removed) + {"banner with article", "banner article", "class", false}, + {"comment with main", "comment main", "class", false}, + {"sidebar with body", "sidebar body", "class", false}, + {"footer with column", "footer column", "class", false}, + {"menu with shadow", "menu shadow", "class", false}, + + // Case insensitive + {"uppercase banner", "BANNER", "class", true}, + {"mixed case comment", "Comment", "class", true}, + {"uppercase with good", "BANNER ARTICLE", "class", false}, + + // ID attributes + {"banner id", "banner", "id", true}, + {"comment id", "comment", "id", true}, + {"banner with article id", "banner article", "id", false}, + + // Good candidates only + {"article class", "article", "class", false}, + {"main class", "main", "class", false}, + {"content class", "content", "class", false}, + {"body class", "body", "class", false}, + + // No matches + {"random class", "random-class", "class", false}, + {"normal content", "normal-content", "class", false}, + {"empty string", "", "class", false}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + var html string + if tc.attrType == "class" { + html = `
content
` + } else { + html = `
content
` + } + + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + t.Fatalf("Failed to parse HTML: %v", err) + } + + // Count elements before removal + beforeCount := doc.Find("div").Length() + + removeUnlikelyCandidates(doc) + + // Count elements after removal + afterCount := doc.Find("div").Length() + + wasRemoved := beforeCount > afterCount + + if wasRemoved != tc.expected { + t.Errorf("Expected element to be removed: %v, but was removed: %v", tc.expected, wasRemoved) + } + }) + } +} + +func TestRemoveUnlikelyCandidatesPreservation(t *testing.T) { + testCases := []struct { + name string + html string + description string + }{ + { + name: "preserves html tag", + html: `
content
`, + description: "HTML tag should never be removed regardless of class", + }, + { + name: "preserves body tag", + html: `
content
`, + description: "Body tag should never be removed regardless of class", + }, + { + name: "preserves elements in pre tags", + html: `
`, + description: "Elements within pre tags should be preserved", + }, + { + name: "preserves elements in code tags", + html: `code`, + description: "Elements within code tags should be preserved", + }, + { + name: "preserves nested elements in code blocks", + html: `
`, + description: "Deeply nested elements in code blocks should be preserved", + }, + { + name: "preserves elements in mixed code scenarios", + html: `
code
`, + description: "Multiple code block scenarios should work correctly", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html)) + if err != nil { + t.Fatalf("Failed to parse HTML: %v", err) + } + + // Count specific elements before removal + beforeHtml := doc.Find("html").Length() + beforeBody := doc.Find("body").Length() + beforePre := doc.Find("pre").Length() + beforeCode := doc.Find("code").Length() + + removeUnlikelyCandidates(doc) + + // Count specific elements after removal + afterHtml := doc.Find("html").Length() + afterBody := doc.Find("body").Length() + afterPre := doc.Find("pre").Length() + afterCode := doc.Find("code").Length() + + // These elements should always be preserved + if beforeHtml != afterHtml { + t.Errorf("HTML elements were removed: before=%d, after=%d", beforeHtml, afterHtml) + } + if beforeBody != afterBody { + t.Errorf("Body elements were removed: before=%d, after=%d", beforeBody, afterBody) + } + if beforePre != afterPre { + t.Errorf("Pre elements were removed: before=%d, after=%d", beforePre, afterPre) + } + if beforeCode != afterCode { + t.Errorf("Code elements were removed: before=%d, after=%d", beforeCode, afterCode) + } + + // Verify that elements within code blocks are preserved + if tc.name == "preserves elements in pre tags" || tc.name == "preserves elements in code tags" || tc.name == "preserves nested elements in code blocks" { + spanInCode := doc.Find("pre span, code span, pre div, code div").Length() + if spanInCode == 0 { + t.Error("Elements within code blocks were incorrectly removed") + } + } + }) + } +}