From 2df59b48656459ab172dd01365f776f4ff258c29 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Fri, 13 Dec 2024 23:43:07 +0100 Subject: [PATCH] Refactor internal/reader/readability/testdata - Use chained strings.Contains instead of a regex for blacklistCandidatesRegexp, as this is a bit faster - Simplify a Find.Each.Remove to Find.Remove - Don't concatenate id and class for removeUnlikelyCandidates, as it makes no sense to match on overlaps. It might also marginally improve performances, as regex now have to run on two strings separately, instead of both. - Add a small benchmark --- internal/reader/readability/readability.go | 36 +++++++++++-------- .../reader/readability/readability_test.go | 21 +++++++++++ internal/reader/readability/testdata | 1 + 3 files changed, 43 insertions(+), 15 deletions(-) create mode 120000 internal/reader/readability/testdata diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go index 8f10b145..46771eeb 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -23,7 +23,6 @@ const ( var ( divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`) - blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`) okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`) unlikelyCandidatesRegexp = regexp.MustCompile(`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`) @@ -81,9 +80,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er } } - document.Find("script,style").Each(func(i int, s *goquery.Selection) { - s.Remove() - }) + document.Find("script,style").Remove() transformMisusedDivsIntoParagraphs(document) removeUnlikelyCandidates(document) @@ -150,18 +147,29 @@ func getArticle(topCandidate *candidate, candidates candidateList) string { } func removeUnlikelyCandidates(document *goquery.Document) { + var shouldRemove = func(str string) bool { + str = strings.ToLower(str) + if strings.Contains(str, "popupbody") || strings.Contains(str, "-ad") || strings.Contains(str, "g-plus") { + return true + } else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) { + return true + } + return false + } + document.Find("*").Each(func(i int, s *goquery.Selection) { if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" { return } - class, _ := s.Attr("class") - id, _ := s.Attr("id") - str := strings.ToLower(class + id) - if blacklistCandidatesRegexp.MatchString(str) { - s.Remove() - } else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) { - s.Remove() + if class, ok := s.Attr("class"); ok { + if shouldRemove(class) { + s.Remove() + } + } else if id, ok := s.Attr("id"); ok { + if shouldRemove(id) { + s.Remove() + } } }) } @@ -279,10 +287,8 @@ func getLinkDensity(s *goquery.Selection) float32 { // element looks good or bad. func getClassWeight(s *goquery.Selection) float32 { weight := 0 - class, _ := s.Attr("class") - id, _ := s.Attr("id") - if class != "" { + if class, ok := s.Attr("class"); ok { class = strings.ToLower(class) if negativeRegexp.MatchString(class) { weight -= 25 @@ -291,7 +297,7 @@ func getClassWeight(s *goquery.Selection) float32 { } } - if id != "" { + if id, ok := s.Attr("id"); ok { id = strings.ToLower(id) if negativeRegexp.MatchString(id) { weight -= 25 diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go index 8baee1a0..e6deb889 100644 --- a/internal/reader/readability/readability_test.go +++ b/internal/reader/readability/readability_test.go @@ -4,6 +4,8 @@ package readability // import "miniflux.app/v2/internal/reader/readability" import ( + "bytes" + "os" "strings" "testing" ) @@ -161,3 +163,22 @@ func TestRemoveBlacklist(t *testing.T) { t.Errorf(`Invalid content, got %s instead of %s`, content, want) } } + +func BenchmarkExtractContent(b *testing.B) { + var testCases = map[string][]byte{ + "miniflux_github.html": {}, + "miniflux_wikipedia.html": {}, + } + for filename := range testCases { + data, err := os.ReadFile("testdata/" + filename) + if err != nil { + b.Fatalf(`Unable to read file %q: %v`, filename, err) + } + testCases[filename] = data + } + for range b.N { + for _, v := range testCases { + ExtractContent(bytes.NewReader(v)) + } + } +} diff --git a/internal/reader/readability/testdata b/internal/reader/readability/testdata new file mode 120000 index 00000000..83507da4 --- /dev/null +++ b/internal/reader/readability/testdata @@ -0,0 +1 @@ +../../reader/sanitizer/testdata/ \ No newline at end of file