1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-06 17:41:00 +00:00

test(readability): increase test coverage

This commit is contained in:
Frédéric Guillot 2025-06-30 21:18:39 -07:00
parent 99c5bcdb01
commit 6eeccae7cd
2 changed files with 1209 additions and 46 deletions

View file

@@ -27,8 +27,8 @@ var (
maybeCandidate = [...]string{"and", "article", "body", "column", "main", "shadow"} maybeCandidate = [...]string{"and", "article", "body", "column", "main", "shadow"}
unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"} unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"}
positive = [...]string{"article", "blog", "body", "content", "entry", "h-entry", "hentry", "main", "page", "pagination", "post", "story", "text"} positiveKeywords = [...]string{"article", "blog", "body", "content", "entry", "h-entry", "hentry", "main", "page", "pagination", "post", "story", "text"}
negative = [...]string{"author", "banner", "byline", "com-", "combx", "comment", "contact", "dateline", "foot", "hid", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shopping", "shoutbox", "sidebar", "skyscraper", "sponsor", "tags", "tool", "widget", "writtenby"} negativeKeywords = [...]string{"author", "banner", "byline", "com-", "combx", "comment", "contact", "dateline", "foot", "hid", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shopping", "shoutbox", "sidebar", "skyscraper", "sponsor", "tags", "tool", "widget", "writtenby"}
) )
type candidate struct { type candidate struct {
@@ -37,23 +37,31 @@ type candidate struct {
} }
func (c *candidate) Node() *html.Node { func (c *candidate) Node() *html.Node {
if c.selection.Length() == 0 {
return nil
}
return c.selection.Get(0) return c.selection.Get(0)
} }
func (c *candidate) String() string { func (c *candidate) String() string {
node := c.Node()
if node == nil {
return fmt.Sprintf("empty => %f", c.score)
}
id, _ := c.selection.Attr("id") id, _ := c.selection.Attr("id")
class, _ := c.selection.Attr("class") class, _ := c.selection.Attr("class")
switch { switch {
case id != "" && class != "": case id != "" && class != "":
return fmt.Sprintf("%s#%s.%s => %f", c.Node().DataAtom, id, class, c.score) return fmt.Sprintf("%s#%s.%s => %f", node.DataAtom, id, class, c.score)
case id != "": case id != "":
return fmt.Sprintf("%s#%s => %f", c.Node().DataAtom, id, c.score) return fmt.Sprintf("%s#%s => %f", node.DataAtom, id, c.score)
case class != "": case class != "":
return fmt.Sprintf("%s.%s => %f", c.Node().DataAtom, class, c.score) return fmt.Sprintf("%s.%s => %f", node.DataAtom, class, c.score)
} }
return fmt.Sprintf("%s => %f", c.Node().DataAtom, c.score) return fmt.Sprintf("%s => %f", node.DataAtom, c.score)
} }
type candidateList map[*html.Node]*candidate type candidateList map[*html.Node]*candidate
@@ -111,7 +119,8 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
tag := "div" tag := "div"
node := s.Get(0) node := s.Get(0)
if node == topCandidate.Node() { topNode := topCandidate.Node()
if topNode != nil && node == topNode {
append = true append = true
} else if c, ok := candidates[node]; ok && c.score >= siblingScoreThreshold { } else if c, ok := candidates[node]; ok && c.score >= siblingScoreThreshold {
append = true append = true
@@ -147,14 +156,14 @@ func shouldRemoveCandidate(str string) bool {
str = strings.ToLower(str) str = strings.ToLower(str)
// Those candidates have no false-positives, no need to check against `maybeCandidate` // Those candidates have no false-positives, no need to check against `maybeCandidate`
for _, strong := range strongCandidates { for _, strongCandidate := range strongCandidates {
if strings.Contains(str, strong) { if strings.Contains(str, strongCandidate) {
return true return true
} }
} }
for _, unlikely := range unlikelyCandidate { for _, unlikelyCandidate := range unlikelyCandidate {
if strings.Contains(str, unlikely) { if strings.Contains(str, unlikelyCandidate) {
// Do we have a false positive? // Do we have a false positive?
for _, maybe := range maybeCandidate { for _, maybe := range maybeCandidate {
if strings.Contains(str, maybe) { if strings.Contains(str, maybe) {
@@ -268,6 +277,11 @@ func getCandidates(document *goquery.Document) candidateList {
func scoreNode(s *goquery.Selection) *candidate { func scoreNode(s *goquery.Selection) *candidate {
c := &candidate{selection: s, score: 0} c := &candidate{selection: s, score: 0}
// Check if selection is empty to avoid panic
if s.Length() == 0 {
return c
}
switch s.Get(0).DataAtom.String() { switch s.Get(0).DataAtom.String() {
case "div": case "div":
c.score += 5 c.score += 5
@@ -314,13 +328,13 @@ func getClassWeight(s *goquery.Selection) float32 {
func getWeight(s string) int { func getWeight(s string) int {
s = strings.ToLower(s) s = strings.ToLower(s)
for _, pos := range negative { for _, keyword := range negativeKeywords {
if strings.Contains(s, pos) { if strings.Contains(s, keyword) {
return -25 return -25
} }
} }
for _, pos := range positive { for _, keyword := range positiveKeywords {
if strings.Contains(s, pos) { if strings.Contains(s, keyword) {
return +25 return +25
} }
} }

File diff suppressed because it is too large Load diff