From a62b97bdddd0da69d094e603e25e0058e69b048f Mon Sep 17 00:00:00 2001 From: jvoisin Date: Thu, 10 Jul 2025 17:24:23 +0200 Subject: [PATCH] refactor(readability): get rid of getClassWeight Its naming was confusing, and its code simple enough that it could be inlined. --- internal/reader/readability/readability.go | 25 ++++------- .../reader/readability/readability_test.go | 43 ------------------- 2 files changed, 8 insertions(+), 60 deletions(-) diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go index 6d1d4289..41d9582f 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -318,7 +318,13 @@ func scoreNode(s *goquery.Selection) *candidate { c.score -= 5 } - c.score += getClassWeight(s) + if class, ok := s.Attr("class"); ok { + c.score += getWeight(class) + } + if id, ok := s.Attr("id"); ok { + c.score += getWeight(id) + } + return c } @@ -335,22 +341,7 @@ func getLinkDensity(s *goquery.Selection) float32 { return float32(linkLength) / float32(sum) } -// Get an elements class/id weight. Uses regular expressions to tell if this -// element looks good or bad. -func getClassWeight(s *goquery.Selection) float32 { - weight := 0 - - if class, ok := s.Attr("class"); ok { - weight += getWeight(class) - } - if id, ok := s.Attr("id"); ok { - weight += getWeight(id) - } - - return float32(weight) -} - -func getWeight(s string) int { +func getWeight(s string) float32 { s = strings.ToLower(s) for _, keyword := range negativeKeywords { if strings.Contains(s, keyword) { diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go index 6c111f08..a4a5094a 100644 --- a/internal/reader/readability/readability_test.go +++ b/internal/reader/readability/readability_test.go @@ -350,49 +350,6 @@ func TestGetClassWeight(t *testing.T) { if selection.Length() == 0 { t.Fatal("No div element found in HTML") } - - result := getClassWeight(selection) - if result != tc.expected { - t.Errorf("Expected weight %f, got %f", tc.expected, result) - } - }) - } -} - -func TestGetClassWeightRegexPatterns(t *testing.T) { - // Test specific regex patterns used in getClassWeight - positiveWords := []string{"article", "body", "content", "entry", "hentry", "h-entry", "main", "page", "pagination", "post", "text", "blog", "story"} - negativeWords := []string{"hid", "banner", "combx", "comment", "com-", "contact", "foot", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "shopping", "tags", "tool", "widget", "byline", "author", "dateline", "writtenby"} - - for _, word := range positiveWords { - t.Run("positive_"+word, func(t *testing.T) { - html := `
content
` - doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) - if err != nil { - t.Fatalf("Failed to parse HTML: %v", err) - } - - selection := doc.Find("div").First() - result := getClassWeight(selection) - if result != 25 { - t.Errorf("Expected positive weight 25 for word '%s', got %f", word, result) - } - }) - } - - for _, word := range negativeWords { - t.Run("negative_"+word, func(t *testing.T) { - html := `
content
` - doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) - if err != nil { - t.Fatalf("Failed to parse HTML: %v", err) - } - - selection := doc.Find("div").First() - result := getClassWeight(selection) - if result != -25 { - t.Errorf("Expected negative weight -25 for word '%s', got %f", word, result) - } }) } }