Mirror of https://github.com/miniflux/v2.git, synced 2025-07-02 16:38:37 +00:00
Refactor internal/reader/readability/testdata
- Use chained strings.Contains calls instead of a regex for blacklistCandidatesRegexp, as this is a bit faster (a rough micro-benchmark sketch follows below).
- Simplify a Find.Each.Remove chain into a single Find.Remove.
- Don't concatenate id and class in removeUnlikelyCandidates, as it makes no sense to match on overlaps that span the boundary between the two attribute values (see the short illustration after the removeUnlikelyCandidates hunk below). It might also marginally improve performance, since each regex now runs against the two attribute values separately instead of against their concatenation.
- Add a small benchmark.
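As a rough check of the first point, a minimal micro-benchmark along these lines could compare the removed regex against the chained strings.Contains check. This sketch is not part of the commit; the helper name, sample input, and file placement are made up for illustration.

package readability

import (
	"regexp"
	"strings"
	"testing"
)

// The pattern that the commit removes.
var blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`)

// The equivalent chained strings.Contains check that replaces it.
func isBlacklisted(str string) bool {
	return strings.Contains(str, "popupbody") ||
		strings.Contains(str, "-ad") ||
		strings.Contains(str, "g-plus")
}

// Hypothetical input: a non-matching class/id value, the common case.
const sampleInput = "article-content main-column"

func BenchmarkBlacklistRegexp(b *testing.B) {
	for range b.N {
		blacklistCandidatesRegexp.MatchString(sampleInput)
	}
}

func BenchmarkBlacklistContains(b *testing.B) {
	for range b.N {
		isBlacklisted(sampleInput)
	}
}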
This commit is contained in:
parent 777d0dd248
commit 2df59b4865
3 changed files with 43 additions and 15 deletions
@@ -23,7 +23,6 @@ const (
 var (
 	divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
-	blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`)
 	okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
 	unlikelyCandidatesRegexp = regexp.MustCompile(`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
@@ -81,9 +80,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
 		}
 	}

-	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
-		s.Remove()
-	})
+	document.Find("script,style").Remove()

 	transformMisusedDivsIntoParagraphs(document)
 	removeUnlikelyCandidates(document)
@@ -150,18 +147,29 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
 }

 func removeUnlikelyCandidates(document *goquery.Document) {
+	var shouldRemove = func(str string) bool {
+		str = strings.ToLower(str)
+		if strings.Contains(str, "popupbody") || strings.Contains(str, "-ad") || strings.Contains(str, "g-plus") {
+			return true
+		} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
+			return true
+		}
+		return false
+	}
+
 	document.Find("*").Each(func(i int, s *goquery.Selection) {
 		if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" {
 			return
 		}
-		class, _ := s.Attr("class")
-		id, _ := s.Attr("id")
-		str := strings.ToLower(class + id)
-
-		if blacklistCandidatesRegexp.MatchString(str) {
-			s.Remove()
-		} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
-			s.Remove()
+
+		if class, ok := s.Attr("class"); ok {
+			if shouldRemove(class) {
+				s.Remove()
+			}
+		} else if id, ok := s.Attr("id"); ok {
+			if shouldRemove(id) {
+				s.Remove()
+			}
 		}
 	})
 }
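On the class/id change in the hunk above: a tiny standalone sketch (not part of the commit; the attribute values are made up) of how matching on the concatenated class+id can report a match that neither attribute contains on its own.

package main

import (
	"fmt"
	"regexp"
	"strings"
)

var blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`)

func main() {
	// Hypothetical attribute values: neither one matches the blacklist by itself.
	class, id := "widget-a", "d-unit"

	// Old behavior: the pattern is matched against the concatenation, and
	// "-ad" matches across the boundary between the two attributes.
	concatenated := strings.ToLower(class + id) // "widget-ad-unit"
	fmt.Println(blacklistCandidatesRegexp.MatchString(concatenated)) // true

	// Checking each attribute value on its own, as the refactored
	// removeUnlikelyCandidates does via shouldRemove, avoids this false positive.
	fmt.Println(blacklistCandidatesRegexp.MatchString(strings.ToLower(class))) // false
	fmt.Println(blacklistCandidatesRegexp.MatchString(strings.ToLower(id)))    // false
}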
@@ -279,10 +287,8 @@ func getLinkDensity(s *goquery.Selection) float32 {
 // element looks good or bad.
 func getClassWeight(s *goquery.Selection) float32 {
 	weight := 0
-	class, _ := s.Attr("class")
-	id, _ := s.Attr("id")
-
-	if class != "" {
+	if class, ok := s.Attr("class"); ok {
 		class = strings.ToLower(class)
 		if negativeRegexp.MatchString(class) {
 			weight -= 25

@@ -291,7 +297,7 @@ func getClassWeight(s *goquery.Selection) float32 {
 		}
 	}

-	if id != "" {
+	if id, ok := s.Attr("id"); ok {
 		id = strings.ToLower(id)
 		if negativeRegexp.MatchString(id) {
 			weight -= 25
@@ -4,6 +4,8 @@
 package readability // import "miniflux.app/v2/internal/reader/readability"

 import (
+	"bytes"
+	"os"
 	"strings"
 	"testing"
 )
@@ -161,3 +163,22 @@ func TestRemoveBlacklist(t *testing.T) {
 		t.Errorf(`Invalid content, got %s instead of %s`, content, want)
 	}
 }
+
+func BenchmarkExtractContent(b *testing.B) {
+	var testCases = map[string][]byte{
+		"miniflux_github.html":    {},
+		"miniflux_wikipedia.html": {},
+	}
+	for filename := range testCases {
+		data, err := os.ReadFile("testdata/" + filename)
+		if err != nil {
+			b.Fatalf(`Unable to read file %q: %v`, filename, err)
+		}
+		testCases[filename] = data
+	}
+	for range b.N {
+		for _, v := range testCases {
+			ExtractContent(bytes.NewReader(v))
+		}
+	}
+}
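The new benchmark reads its inputs through the testdata symlink added below; it can be run from the repository root with something like: go test -bench=BenchmarkExtractContent ./internal/reader/readability/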
internal/reader/readability/testdata (symbolic link, 1 addition)
@@ -0,0 +1 @@
+../../reader/sanitizer/testdata/