mirror of
https://github.com/miniflux/v2.git
synced 2025-08-06 17:41:00 +00:00
refactor(readability): get rid of getClassWeight
Its naming was confusing, and its code simple enough that it could be inlined.
This commit is contained in:
parent
1de9cf4241
commit
a62b97bddd
2 changed files with 8 additions and 60 deletions
|
@ -318,7 +318,13 @@ func scoreNode(s *goquery.Selection) *candidate {
|
|||
c.score -= 5
|
||||
}
|
||||
|
||||
c.score += getClassWeight(s)
|
||||
if class, ok := s.Attr("class"); ok {
|
||||
c.score += getWeight(class)
|
||||
}
|
||||
if id, ok := s.Attr("id"); ok {
|
||||
c.score += getWeight(id)
|
||||
}
|
||||
|
||||
return c
|
||||
}
|
||||
|
||||
|
@ -335,22 +341,7 @@ func getLinkDensity(s *goquery.Selection) float32 {
|
|||
return float32(linkLength) / float32(sum)
|
||||
}
|
||||
|
||||
// Get an element's class/id weight. Uses regular expressions to tell if this
|
||||
// element looks good or bad.
|
||||
func getClassWeight(s *goquery.Selection) float32 {
|
||||
weight := 0
|
||||
|
||||
if class, ok := s.Attr("class"); ok {
|
||||
weight += getWeight(class)
|
||||
}
|
||||
if id, ok := s.Attr("id"); ok {
|
||||
weight += getWeight(id)
|
||||
}
|
||||
|
||||
return float32(weight)
|
||||
}
|
||||
|
||||
func getWeight(s string) int {
|
||||
func getWeight(s string) float32 {
|
||||
s = strings.ToLower(s)
|
||||
for _, keyword := range negativeKeywords {
|
||||
if strings.Contains(s, keyword) {
|
||||
|
|
|
@ -350,49 +350,6 @@ func TestGetClassWeight(t *testing.T) {
|
|||
if selection.Length() == 0 {
|
||||
t.Fatal("No div element found in HTML")
|
||||
}
|
||||
|
||||
result := getClassWeight(selection)
|
||||
if result != tc.expected {
|
||||
t.Errorf("Expected weight %f, got %f", tc.expected, result)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetClassWeightRegexPatterns(t *testing.T) {
|
||||
// Test specific regex patterns used in getClassWeight
|
||||
positiveWords := []string{"article", "body", "content", "entry", "hentry", "h-entry", "main", "page", "pagination", "post", "text", "blog", "story"}
|
||||
negativeWords := []string{"hid", "banner", "combx", "comment", "com-", "contact", "foot", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "shopping", "tags", "tool", "widget", "byline", "author", "dateline", "writtenby"}
|
||||
|
||||
for _, word := range positiveWords {
|
||||
t.Run("positive_"+word, func(t *testing.T) {
|
||||
html := `<div class="` + word + `">content</div>`
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to parse HTML: %v", err)
|
||||
}
|
||||
|
||||
selection := doc.Find("div").First()
|
||||
result := getClassWeight(selection)
|
||||
if result != 25 {
|
||||
t.Errorf("Expected positive weight 25 for word '%s', got %f", word, result)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
for _, word := range negativeWords {
|
||||
t.Run("negative_"+word, func(t *testing.T) {
|
||||
html := `<div class="` + word + `">content</div>`
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to parse HTML: %v", err)
|
||||
}
|
||||
|
||||
selection := doc.Find("div").First()
|
||||
result := getClassWeight(selection)
|
||||
if result != -25 {
|
||||
t.Errorf("Expected negative weight -25 for word '%s', got %f", word, result)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue