mirror of
https://github.com/miniflux/v2.git
synced 2025-06-27 16:36:00 +00:00
perf(readability): improve getClassWeight speed
Before:

```console
$ go test -bench=.
goos: linux
goarch: arm64
pkg: miniflux.app/v2/internal/reader/readability
BenchmarkExtractContent-8    34    86102474 ns/op
BenchmarkGetWeight-8         10573 103045 ns/op
PASS
ok   miniflux.app/v2/internal/reader/readability 5.409s
```

After:

```console
$ go test -bench=.
goos: linux
goarch: arm64
pkg: miniflux.app/v2/internal/reader/readability
BenchmarkExtractContent-8    56     83130924 ns/op
BenchmarkGetWeight-8         246541 5241 ns/op
PASS
ok   miniflux.app/v2/internal/reader/readability 6.026s
```

This should make ProcessFeedEntries marginally faster and save a similarly marginal amount of memory.
This commit is contained in:
parent
fcf86e33b9
commit
ef13756b1a
2 changed files with 34 additions and 15 deletions
|
@ -26,8 +26,8 @@ var (
|
|||
okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
|
||||
unlikelyCandidatesRegexp = regexp.MustCompile(`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
|
||||
|
||||
negativeRegexp = regexp.MustCompile(`hid|banner|combx|comment|com-|contact|foot|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby`)
|
||||
positiveRegexp = regexp.MustCompile(`article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
|
||||
positive = [...]string{"article", "blog", "body", "content", "entry", "h-entry", "hentry", "main", "page", "pagination", "post", "story", "text"}
|
||||
negative = [...]string{"author", "banner", "byline", "com-", "combx", "comment", "contact", "dateline", "foot", "hid", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shopping", "shoutbox", "sidebar", "skyscraper", "sponsor", "tags", "tool", "widget", "writtenby"}
|
||||
)
|
||||
|
||||
type candidate struct {
|
||||
|
@ -294,26 +294,29 @@ func getClassWeight(s *goquery.Selection) float32 {
|
|||
weight := 0
|
||||
|
||||
if class, ok := s.Attr("class"); ok {
|
||||
class = strings.ToLower(class)
|
||||
if negativeRegexp.MatchString(class) {
|
||||
weight -= 25
|
||||
} else if positiveRegexp.MatchString(class) {
|
||||
weight += 25
|
||||
}
|
||||
weight += getWeight(class)
|
||||
}
|
||||
|
||||
if id, ok := s.Attr("id"); ok {
|
||||
id = strings.ToLower(id)
|
||||
if negativeRegexp.MatchString(id) {
|
||||
weight -= 25
|
||||
} else if positiveRegexp.MatchString(id) {
|
||||
weight += 25
|
||||
}
|
||||
weight += getWeight(id)
|
||||
}
|
||||
|
||||
return float32(weight)
|
||||
}
|
||||
|
||||
func getWeight(s string) int {
|
||||
for _, pos := range negative {
|
||||
if strings.Contains(s, pos) {
|
||||
return -25
|
||||
}
|
||||
}
|
||||
for _, pos := range positive {
|
||||
if strings.Contains(s, pos) {
|
||||
return +25
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
|
||||
document.Find("div").Each(func(i int, s *goquery.Selection) {
|
||||
html, _ := s.Html()
|
||||
|
|
|
@ -204,3 +204,19 @@ func BenchmarkExtractContent(b *testing.B) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkGetWeight(b *testing.B) {
|
||||
testCases := []string{
|
||||
"p-3 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content",
|
||||
"d-flex flex-column mb-3",
|
||||
"AppHeader-search-control AppHeader-search-control-overflow",
|
||||
"Button Button--iconOnly Button--invisible Button--medium mr-1 px-2 py-0 d-flex flex-items-center rounded-1 color-fg-muted",
|
||||
"sr-only",
|
||||
"validation-12753bbc-b4d1-4e10-bec6-92e585d1699d",
|
||||
}
|
||||
for range b.N {
|
||||
for _, v := range testCases {
|
||||
getWeight(v)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue