1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-07-02 16:38:37 +00:00

Refactor internal/reader/readability/testdata

- Use chained strings.Contains instead of a regex for
  blacklistCandidatesRegexp, as this is a bit faster
- Simplify a Find.Each.Remove to Find.Remove
- Don't concatenate id and class for removeUnlikelyCandidates, as it makes no
  sense to match on overlaps. It might also marginally improve performances, as
  regex now have to run on two strings separately, instead of both.
- Add a small benchmark
This commit is contained in:
jvoisin 2024-12-13 23:43:07 +01:00 committed by Frédéric Guillot
parent 777d0dd248
commit 2df59b4865
3 changed files with 43 additions and 15 deletions

View file

@ -23,7 +23,6 @@ const (
var ( var (
divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`) divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`)
okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`) okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
unlikelyCandidatesRegexp = regexp.MustCompile(`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`) unlikelyCandidatesRegexp = regexp.MustCompile(`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
@ -81,9 +80,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
} }
} }
document.Find("script,style").Each(func(i int, s *goquery.Selection) { document.Find("script,style").Remove()
s.Remove()
})
transformMisusedDivsIntoParagraphs(document) transformMisusedDivsIntoParagraphs(document)
removeUnlikelyCandidates(document) removeUnlikelyCandidates(document)
@ -150,18 +147,29 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
} }
func removeUnlikelyCandidates(document *goquery.Document) { func removeUnlikelyCandidates(document *goquery.Document) {
var shouldRemove = func(str string) bool {
str = strings.ToLower(str)
if strings.Contains(str, "popupbody") || strings.Contains(str, "-ad") || strings.Contains(str, "g-plus") {
return true
} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
return true
}
return false
}
document.Find("*").Each(func(i int, s *goquery.Selection) { document.Find("*").Each(func(i int, s *goquery.Selection) {
if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" { if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" {
return return
} }
class, _ := s.Attr("class")
id, _ := s.Attr("id")
str := strings.ToLower(class + id)
if blacklistCandidatesRegexp.MatchString(str) { if class, ok := s.Attr("class"); ok {
s.Remove() if shouldRemove(class) {
} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) { s.Remove()
s.Remove() }
} else if id, ok := s.Attr("id"); ok {
if shouldRemove(id) {
s.Remove()
}
} }
}) })
} }
@ -279,10 +287,8 @@ func getLinkDensity(s *goquery.Selection) float32 {
// element looks good or bad. // element looks good or bad.
func getClassWeight(s *goquery.Selection) float32 { func getClassWeight(s *goquery.Selection) float32 {
weight := 0 weight := 0
class, _ := s.Attr("class")
id, _ := s.Attr("id")
if class != "" { if class, ok := s.Attr("class"); ok {
class = strings.ToLower(class) class = strings.ToLower(class)
if negativeRegexp.MatchString(class) { if negativeRegexp.MatchString(class) {
weight -= 25 weight -= 25
@ -291,7 +297,7 @@ func getClassWeight(s *goquery.Selection) float32 {
} }
} }
if id != "" { if id, ok := s.Attr("id"); ok {
id = strings.ToLower(id) id = strings.ToLower(id)
if negativeRegexp.MatchString(id) { if negativeRegexp.MatchString(id) {
weight -= 25 weight -= 25

View file

@ -4,6 +4,8 @@
package readability // import "miniflux.app/v2/internal/reader/readability" package readability // import "miniflux.app/v2/internal/reader/readability"
import ( import (
"bytes"
"os"
"strings" "strings"
"testing" "testing"
) )
@ -161,3 +163,22 @@ func TestRemoveBlacklist(t *testing.T) {
t.Errorf(`Invalid content, got %s instead of %s`, content, want) t.Errorf(`Invalid content, got %s instead of %s`, content, want)
} }
} }
func BenchmarkExtractContent(b *testing.B) {
var testCases = map[string][]byte{
"miniflux_github.html": {},
"miniflux_wikipedia.html": {},
}
for filename := range testCases {
data, err := os.ReadFile("testdata/" + filename)
if err != nil {
b.Fatalf(`Unable to read file %q: %v`, filename, err)
}
testCases[filename] = data
}
for range b.N {
for _, v := range testCases {
ExtractContent(bytes.NewReader(v))
}
}
}

View file

@ -0,0 +1 @@
../../reader/sanitizer/testdata/