diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go
index ddb11afe..708af27e 100644
--- a/internal/reader/readability/readability_test.go
+++ b/internal/reader/readability/readability_test.go
@@ -8,6 +8,8 @@ import (
 	"os"
 	"strings"
 	"testing"
+
+	"github.com/PuerkitoBio/goquery"
 )
 
 func TestBaseURL(t *testing.T) {
@@ -204,3 +206,179 @@ func BenchmarkExtractContent(b *testing.B) {
 		}
 	}
 }
+
+// TestGetClassWeight checks the weight getClassWeight returns for a <div>
+// depending on its class and id attributes. The table expects 25 or -25 for
+// each attribute that matches and 0 for one that is missing, empty, or
+// unmatched, so class and id together range from -50 to 50.
+//
+// NOTE(review): the markup in each fixture was rebuilt from the case name and
+// expected weight, using words from the lists in
+// TestGetClassWeightRegexPatterns; confirm it matches the intended fixtures.
+func TestGetClassWeight(t *testing.T) {
+	testCases := []struct {
+		name     string
+		html     string
+		expected float32
+	}{
+		{
+			name:     "no class or id",
+			html:     `<div>content</div>`,
+			expected: 0,
+		},
+		{
+			name:     "positive class only",
+			html:     `<div class="article">content</div>`,
+			expected: 25,
+		},
+		{
+			name:     "negative class only",
+			html:     `<div class="comment">content</div>`,
+			expected: -25,
+		},
+		{
+			name:     "positive id only",
+			html:     `<div id="main">content</div>`,
+			expected: 25,
+		},
+		{
+			name:     "negative id only",
+			html:     `<div id="sidebar">content</div>`,
+			expected: -25,
+		},
+		{
+			name:     "positive class and positive id",
+			html:     `<div class="article" id="main">content</div>`,
+			expected: 50,
+		},
+		{
+			name:     "negative class and negative id",
+			html:     `<div class="comment" id="sidebar">content</div>`,
+			expected: -50,
+		},
+		{
+			name:     "positive class and negative id",
+			html:     `<div class="article" id="sidebar">content</div>`,
+			expected: 0,
+		},
+		{
+			name:     "negative class and positive id",
+			html:     `<div class="comment" id="main">content</div>`,
+			expected: 0,
+		},
+		{
+			name:     "multiple positive classes",
+			html:     `<div class="article content">content</div>`,
+			expected: 25,
+		},
+		{
+			name:     "multiple negative classes",
+			html:     `<div class="comment sidebar">content</div>`,
+			expected: -25,
+		},
+		{
+			name:     "mixed positive and negative classes",
+			html:     `<div class="article comment">content</div>`,
+			expected: -25, // negative takes precedence since it's checked first
+		},
+		{
+			name:     "case insensitive class",
+			html:     `<div class="ARTICLE">content</div>`,
+			expected: 25,
+		},
+		{
+			name:     "case insensitive id",
+			html:     `<div id="MAIN">content</div>`,
+			expected: 25,
+		},
+		{
+			name:     "non-matching class and id",
+			html:     `<div class="random" id="unknown">content</div>`,
+			expected: 0,
+		},
+		{
+			name:     "empty class and id",
+			html:     `<div class="" id="">content</div>`,
+			expected: 0,
+		},
+		{
+			name:     "class with special characters",
+			html:     `<div class="com-section">content</div>`,
+			expected: -25, // matches com- in negative regex
+		},
+		{
+			name:     "id with special characters",
+			html:     `<div id="h-entry-123">content</div>`,
+			expected: 25, // matches h-entry in positive regex
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
+			if err != nil {
+				t.Fatalf("Failed to parse HTML: %v", err)
+			}
+
+			// Fail fast when the fixture contains no <div> to score.
+			selection := doc.Find("div").First()
+			if selection.Length() == 0 {
+				t.Fatal("No div element found in HTML")
+			}
+
+			result := getClassWeight(selection)
+			if result != tc.expected {
+				t.Errorf("Expected weight %f, got %f", tc.expected, result)
+			}
+		})
+	}
+}
+
+// TestGetClassWeightRegexPatterns uses every listed keyword, one at a time, as
+// the class of a <div> and expects getClassWeight to return 25 for the
+// positive words and -25 for the negative ones.
+func TestGetClassWeightRegexPatterns(t *testing.T) {
+	// Test specific regex patterns used in getClassWeight
+	positiveWords := []string{"article", "body", "content", "entry", "hentry", "h-entry", "main", "page", "pagination", "post", "text", "blog", "story"}
+	negativeWords := []string{"hid", "banner", "combx", "comment", "com-", "contact", "foot", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "shopping", "tags", "tool", "widget", "byline", "author", "dateline", "writtenby"}
+
+	for _, word := range positiveWords {
+		t.Run("positive_"+word, func(t *testing.T) {
+			html := `<div class="` + word + `">content</div>`
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+			if err != nil {
+				t.Fatalf("Failed to parse HTML: %v", err)
+			}
+
+			selection := doc.Find("div").First()
+			if selection.Length() == 0 {
+				t.Fatal("No div element found in HTML")
+			}
+
+			result := getClassWeight(selection)
+			if result != 25 {
+				t.Errorf("Expected positive weight 25 for word '%s', got %f", word, result)
+			}
+		})
+	}
+
+	for _, word := range negativeWords {
+		t.Run("negative_"+word, func(t *testing.T) {
+			html := `<div class="` + word + `">content</div>`
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+			if err != nil {
+				t.Fatalf("Failed to parse HTML: %v", err)
+			}
+
+			selection := doc.Find("div").First()
+			if selection.Length() == 0 {
+				t.Fatal("No div element found in HTML")
+			}
+
+			result := getClassWeight(selection)
+			if result != -25 {
+				t.Errorf("Expected negative weight -25 for word '%s', got %f", word, result)
+			}
+		})
+	}
+}