mirror of
https://github.com/miniflux/v2.git
synced 2025-06-27 16:36:00 +00:00
Merge daedd26e2b
into fcf86e33b9
This commit is contained in:
commit
2dcd5bece4
1 changed files with 29 additions and 17 deletions
|
@ -23,8 +23,9 @@ const (
|
||||||
var (
|
var (
|
||||||
divToPElementsRegexp = regexp.MustCompile(`(?i)<(?:a|blockquote|dl|div|img|ol|p|pre|table|ul)[ />]`)
|
divToPElementsRegexp = regexp.MustCompile(`(?i)<(?:a|blockquote|dl|div|img|ol|p|pre|table|ul)[ />]`)
|
||||||
|
|
||||||
okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
|
strongCandidates = [...]string{"popupbody", "-ad", "g-plus"}
|
||||||
unlikelyCandidatesRegexp = regexp.MustCompile(`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
|
maybeCandidate = [...]string{"and", "article", "body", "column", "main", "shadow"}
|
||||||
|
unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"}
|
||||||
|
|
||||||
negativeRegexp = regexp.MustCompile(`hid|banner|combx|comment|com-|contact|foot|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby`)
|
negativeRegexp = regexp.MustCompile(`hid|banner|combx|comment|com-|contact|foot|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby`)
|
||||||
positiveRegexp = regexp.MustCompile(`article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
|
positiveRegexp = regexp.MustCompile(`article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
|
||||||
|
@ -145,18 +146,33 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
|
||||||
output.WriteString("</div>")
|
output.WriteString("</div>")
|
||||||
return output.String()
|
return output.String()
|
||||||
}
|
}
|
||||||
|
func shouldRemoveCandidate(str string) bool {
|
||||||
func removeUnlikelyCandidates(document *goquery.Document) {
|
|
||||||
var shouldRemove = func(str string) bool {
|
|
||||||
str = strings.ToLower(str)
|
str = strings.ToLower(str)
|
||||||
if strings.Contains(str, "popupbody") || strings.Contains(str, "-ad") || strings.Contains(str, "g-plus") {
|
|
||||||
|
// Those candidates have no false-positives, no need to check against `maybeCandidate`
|
||||||
|
for _, strong := range strongCandidates {
|
||||||
|
if strings.Contains(str, strong) {
|
||||||
return true
|
return true
|
||||||
} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, unlikely := range unlikelyCandidate {
|
||||||
|
if strings.Contains(str, unlikely) {
|
||||||
|
// Do we have a false positive?
|
||||||
|
for _, maybe := range maybeCandidate {
|
||||||
|
if strings.Contains(str, maybe) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Nope, it's a true positive!
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func removeUnlikelyCandidates(document *goquery.Document) {
|
||||||
document.Find("*").Each(func(i int, s *goquery.Selection) {
|
document.Find("*").Each(func(i int, s *goquery.Selection) {
|
||||||
if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" {
|
if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" {
|
||||||
return
|
return
|
||||||
|
@ -167,15 +183,11 @@ func removeUnlikelyCandidates(document *goquery.Document) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if class, ok := s.Attr("class"); ok {
|
if class, ok := s.Attr("class"); ok && shouldRemoveCandidate(class) {
|
||||||
if shouldRemove(class) {
|
|
||||||
s.Remove()
|
s.Remove()
|
||||||
}
|
} else if id, ok := s.Attr("id"); ok && shouldRemoveCandidate(id) {
|
||||||
} else if id, ok := s.Attr("id"); ok {
|
|
||||||
if shouldRemove(id) {
|
|
||||||
s.Remove()
|
s.Remove()
|
||||||
}
|
}
|
||||||
}
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue