diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go
index ddb11afe..708af27e 100644
--- a/internal/reader/readability/readability_test.go
+++ b/internal/reader/readability/readability_test.go
@@ -8,6 +8,8 @@ import (
"os"
"strings"
"testing"
+
+ "github.com/PuerkitoBio/goquery"
)
func TestBaseURL(t *testing.T) {
@@ -204,3 +206,159 @@ func BenchmarkExtractContent(b *testing.B) {
}
}
}
+
+// TestGetClassWeight checks the weight getClassWeight returns for a single
+// <div> carrying different combinations of class and id attributes.
+//
+// Per the expectations below, a matching class contributes +25 or -25, a
+// matching id contributes +25 or -25 independently, and matching is
+// case-insensitive. When one attribute contains both a positive and a
+// negative keyword, the negative one is expected to win.
+func TestGetClassWeight(t *testing.T) {
+	testCases := []struct {
+		name     string
+		html     string
+		expected float32
+	}{
+		{
+			name:     "no class or id",
+			html:     `<div>content</div>`,
+			expected: 0,
+		},
+		{
+			name:     "positive class only",
+			html:     `<div class="article">content</div>`,
+			expected: 25,
+		},
+		{
+			name:     "negative class only",
+			html:     `<div class="comment">content</div>`,
+			expected: -25,
+		},
+		{
+			name:     "positive id only",
+			html:     `<div id="main">content</div>`,
+			expected: 25,
+		},
+		{
+			name:     "negative id only",
+			html:     `<div id="sidebar">content</div>`,
+			expected: -25,
+		},
+		{
+			name:     "positive class and positive id",
+			html:     `<div class="article" id="main">content</div>`,
+			expected: 50,
+		},
+		{
+			name:     "negative class and negative id",
+			html:     `<div class="comment" id="sidebar">content</div>`,
+			expected: -50,
+		},
+		{
+			name:     "positive class and negative id",
+			html:     `<div class="article" id="comment">content</div>`,
+			expected: 0,
+		},
+		{
+			name:     "negative class and positive id",
+			html:     `<div class="comment" id="main">content</div>`,
+			expected: 0,
+		},
+		{
+			name:     "multiple positive classes",
+			html:     `<div class="article main content">content</div>`,
+			expected: 25,
+		},
+		{
+			name:     "multiple negative classes",
+			html:     `<div class="comment sidebar">content</div>`,
+			expected: -25,
+		},
+		{
+			name:     "mixed positive and negative classes",
+			html:     `<div class="article comment">content</div>`,
+			expected: -25, // negative takes precedence since it's checked first
+		},
+		{
+			name:     "case insensitive class",
+			html:     `<div class="ARTICLE">content</div>`,
+			expected: 25,
+		},
+		{
+			name:     "case insensitive id",
+			html:     `<div id="MAIN">content</div>`,
+			expected: 25,
+		},
+		{
+			name:     "non-matching class and id",
+			html:     `<div class="random" id="unknown">content</div>`,
+			expected: 0,
+		},
+		{
+			name:     "empty class and id",
+			html:     `<div class="" id="">content</div>`,
+			expected: 0,
+		},
+		{
+			name:     "class with special characters",
+			html:     `<div class="com-section">content</div>`,
+			expected: -25, // matches com- in negative regex
+		},
+		{
+			name:     "id with special characters",
+			html:     `<div id="h-entry-123">content</div>`,
+			expected: 25, // matches h-entry in positive regex
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
+			if err != nil {
+				t.Fatalf("Failed to parse HTML: %v", err)
+			}
+
+			// Score only the first <div>; a fixture without one is a test bug.
+			selection := doc.Find("div").First()
+			if selection.Length() == 0 {
+				t.Fatal("No div element found in HTML")
+			}
+
+			result := getClassWeight(selection)
+			if result != tc.expected {
+				t.Errorf("Expected weight %f, got %f", tc.expected, result)
+			}
+		})
+	}
+}
+
+// TestGetClassWeightRegexPatterns runs one subtest per keyword, placing the
+// keyword alone in a <div>'s class attribute and checking the weight
+// getClassWeight returns: 25 for each positive word, -25 for each negative
+// word.
+func TestGetClassWeightRegexPatterns(t *testing.T) {
+	// Test specific regex patterns used in getClassWeight
+	positiveWords := []string{"article", "body", "content", "entry", "hentry", "h-entry", "main", "page", "pagination", "post", "text", "blog", "story"}
+	negativeWords := []string{"hid", "banner", "combx", "comment", "com-", "contact", "foot", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "shopping", "tags", "tool", "widget", "byline", "author", "dateline", "writtenby"}
+
+	for _, word := range positiveWords {
+		t.Run("positive_"+word, func(t *testing.T) {
+			// The keyword under test must appear in the fixture's class attribute.
+			html := `<div class="` + word + `">content</div>`
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+			if err != nil {
+				t.Fatalf("Failed to parse HTML: %v", err)
+			}
+
+			selection := doc.Find("div").First()
+			if selection.Length() == 0 {
+				t.Fatal("No div element found in HTML")
+			}
+
+			result := getClassWeight(selection)
+			if result != 25 {
+				t.Errorf("Expected positive weight 25 for word '%s', got %f", word, result)
+			}
+		})
+	}
+
+	for _, word := range negativeWords {
+		t.Run("negative_"+word, func(t *testing.T) {
+			// The keyword under test must appear in the fixture's class attribute.
+			html := `<div class="` + word + `">content</div>`
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+			if err != nil {
+				t.Fatalf("Failed to parse HTML: %v", err)
+			}
+
+			selection := doc.Find("div").First()
+			if selection.Length() == 0 {
+				t.Fatal("No div element found in HTML")
+			}
+
+			result := getClassWeight(selection)
+			if result != -25 {
+				t.Errorf("Expected negative weight -25 for word '%s', got %f", word, result)
+			}
+		})
+	}
+}