1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00

refactor(readability): simplify the regexes in internal/reader/readability/readability.go

- Use strings.ToLower() on the input instead of case-insensitive regexes
- Remove redundant, overlapping alternatives from the regexes
- Split a condition to increase readability
This commit is contained in:
Julien Voisin 2024-12-08 00:56:19 +00:00 committed by GitHub
parent 2f56ebd3a6
commit 2671f57edd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -26,12 +26,12 @@ var (
divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`) divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
sentenceRegexp = regexp.MustCompile(`\.( |$)`) sentenceRegexp = regexp.MustCompile(`\.( |$)`)
blacklistCandidatesRegexp = regexp.MustCompile(`(?i)popupbody|-ad|g-plus`) blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`)
okMaybeItsACandidateRegexp = regexp.MustCompile(`(?i)and|article|body|column|main|shadow`) okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
unlikelyCandidatesRegexp = regexp.MustCompile(`(?i)banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`) unlikelyCandidatesRegexp = regexp.MustCompile(`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
negativeRegexp = regexp.MustCompile(`(?i)hidden|^hid$|hid$|hid|^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby|p-author`) negativeRegexp = regexp.MustCompile(`hid|banner|combx|comment|com-|contact|foot|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby`)
positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`) positiveRegexp = regexp.MustCompile(`article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
) )
type candidate struct { type candidate struct {
@ -154,9 +154,11 @@ func removeUnlikelyCandidates(document *goquery.Document) {
} }
class, _ := s.Attr("class") class, _ := s.Attr("class")
id, _ := s.Attr("id") id, _ := s.Attr("id")
str := class + id str := strings.ToLower(class + id)
if blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str)) { if blacklistCandidatesRegexp.MatchString(str) {
removeNodes(s)
} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
removeNodes(s) removeNodes(s)
} }
}) })
@ -277,6 +279,9 @@ func getClassWeight(s *goquery.Selection) float32 {
class, _ := s.Attr("class") class, _ := s.Attr("class")
id, _ := s.Attr("id") id, _ := s.Attr("id")
class = strings.ToLower(class)
id = strings.ToLower(id)
if class != "" { if class != "" {
if negativeRegexp.MatchString(class) { if negativeRegexp.MatchString(class) {
weight -= 25 weight -= 25