1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-06 17:41:00 +00:00

refactor(readability): get rid of getClassWeight

Its naming was confusing, and its code simple enough that it could be inlined.
This commit is contained in:
jvoisin 2025-07-10 17:24:23 +02:00 committed by Frédéric Guillot
parent 1de9cf4241
commit a62b97bddd
2 changed files with 8 additions and 60 deletions

View file

@ -318,7 +318,13 @@ func scoreNode(s *goquery.Selection) *candidate {
c.score -= 5
}
c.score += getClassWeight(s)
if class, ok := s.Attr("class"); ok {
c.score += getWeight(class)
}
if id, ok := s.Attr("id"); ok {
c.score += getWeight(id)
}
return c
}
@ -335,22 +341,7 @@ func getLinkDensity(s *goquery.Selection) float32 {
return float32(linkLength) / float32(sum)
}
// getClassWeight returns the combined weight of an element's "class" and
// "id" attributes. Each attribute that is present on the selection is passed
// to getWeight and the results are summed; an attribute that is absent
// contributes nothing. The integer sum is converted to float32 on return.
func getClassWeight(s *goquery.Selection) float32 {
	total := 0
	// "class" is evaluated before "id", matching the order callers have
	// always observed.
	for _, name := range [...]string{"class", "id"} {
		value, found := s.Attr(name)
		if !found {
			continue
		}
		total += getWeight(value)
	}
	return float32(total)
}
func getWeight(s string) int {
func getWeight(s string) float32 {
s = strings.ToLower(s)
for _, keyword := range negativeKeywords {
if strings.Contains(s, keyword) {

View file

@ -350,49 +350,6 @@ func TestGetClassWeight(t *testing.T) {
if selection.Length() == 0 {
t.Fatal("No div element found in HTML")
}
result := getClassWeight(selection)
if result != tc.expected {
t.Errorf("Expected weight %f, got %f", tc.expected, result)
}
})
}
}
// TestGetClassWeightRegexPatterns checks getClassWeight against single-word
// class attributes: every word in the positive list must score 25 and every
// word in the negative list must score -25.
func TestGetClassWeightRegexPatterns(t *testing.T) {
	positiveWords := []string{"article", "body", "content", "entry", "hentry", "h-entry", "main", "page", "pagination", "post", "text", "blog", "story"}
	negativeWords := []string{"hid", "banner", "combx", "comment", "com-", "contact", "foot", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "shopping", "tags", "tool", "widget", "byline", "author", "dateline", "writtenby"}

	// weightForClass parses a document holding one div whose class attribute
	// is word and returns the weight getClassWeight assigns to that div.
	weightForClass := func(t *testing.T, word string) float32 {
		t.Helper()
		html := `<div class="` + word + `">content</div>`
		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
		if err != nil {
			t.Fatalf("Failed to parse HTML: %v", err)
		}
		return getClassWeight(doc.Find("div").First())
	}

	for _, word := range positiveWords {
		t.Run("positive_"+word, func(t *testing.T) {
			if got := weightForClass(t, word); got != 25 {
				t.Errorf("Expected positive weight 25 for word '%s', got %f", word, got)
			}
		})
	}

	for _, word := range negativeWords {
		t.Run("negative_"+word, func(t *testing.T) {
			if got := weightForClass(t, word); got != -25 {
				t.Errorf("Expected negative weight -25 for word '%s', got %f", word, got)
			}
		})
	}
}