diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go index 8f10b145..46771eeb 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -23,7 +23,6 @@ const ( var ( divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`) - blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`) okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`) unlikelyCandidatesRegexp = regexp.MustCompile(`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`) @@ -81,9 +80,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er } } - document.Find("script,style").Each(func(i int, s *goquery.Selection) { - s.Remove() - }) + document.Find("script,style").Remove() transformMisusedDivsIntoParagraphs(document) removeUnlikelyCandidates(document) @@ -150,18 +147,29 @@ func getArticle(topCandidate *candidate, candidates candidateList) string { } func removeUnlikelyCandidates(document *goquery.Document) { + var shouldRemove = func(str string) bool { + str = strings.ToLower(str) + if strings.Contains(str, "popupbody") || strings.Contains(str, "-ad") || strings.Contains(str, "g-plus") { + return true + } else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) { + return true + } + return false + } + document.Find("*").Each(func(i int, s *goquery.Selection) { if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" { return } - class, _ := s.Attr("class") - id, _ := s.Attr("id") - str := strings.ToLower(class + id) - if blacklistCandidatesRegexp.MatchString(str) { - s.Remove() - } else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) { - s.Remove() + if class, ok := s.Attr("class"); ok { + if shouldRemove(class) { + s.Remove() + } + } else if id, ok := s.Attr("id"); ok { + if shouldRemove(id) { + s.Remove() + } } }) } @@ -279,10 +287,8 @@ func getLinkDensity(s *goquery.Selection) float32 { // element looks good or bad. func getClassWeight(s *goquery.Selection) float32 { weight := 0 - class, _ := s.Attr("class") - id, _ := s.Attr("id") - if class != "" { + if class, ok := s.Attr("class"); ok { class = strings.ToLower(class) if negativeRegexp.MatchString(class) { weight -= 25 @@ -291,7 +297,7 @@ func getClassWeight(s *goquery.Selection) float32 { } } - if id != "" { + if id, ok := s.Attr("id"); ok { id = strings.ToLower(id) if negativeRegexp.MatchString(id) { weight -= 25 diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go index 8baee1a0..e6deb889 100644 --- a/internal/reader/readability/readability_test.go +++ b/internal/reader/readability/readability_test.go @@ -4,6 +4,8 @@ package readability // import "miniflux.app/v2/internal/reader/readability" import ( + "bytes" + "os" "strings" "testing" ) @@ -161,3 +163,22 @@ func TestRemoveBlacklist(t *testing.T) { t.Errorf(`Invalid content, got %s instead of %s`, content, want) } } + +func BenchmarkExtractContent(b *testing.B) { + var testCases = map[string][]byte{ + "miniflux_github.html": {}, + "miniflux_wikipedia.html": {}, + } + for filename := range testCases { + data, err := os.ReadFile("testdata/" + filename) + if err != nil { + b.Fatalf(`Unable to read file %q: %v`, filename, err) + } + testCases[filename] = data + } + for range b.N { + for _, v := range testCases { + ExtractContent(bytes.NewReader(v)) + } + } +} diff --git a/internal/reader/readability/testdata b/internal/reader/readability/testdata new file mode 120000 index 00000000..83507da4 --- /dev/null +++ b/internal/reader/readability/testdata @@ -0,0 +1 @@ +../../reader/sanitizer/testdata/ \ No newline at end of file