From aed99e65c1fbd25be113cec6965c77d20f596af9 Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Fri, 27 Jun 2025 16:11:06 +0200
Subject: [PATCH] perf(readability): improve getClassWeight speed

Before

```console
$ go test -bench=.
goos: linux
goarch: arm64
pkg: miniflux.app/v2/internal/reader/readability
BenchmarkExtractContent-8 34 86102474 ns/op
BenchmarkGetWeight-8 10573 103045 ns/op
PASS
ok miniflux.app/v2/internal/reader/readability 5.409s
```

After

```console
$ go test -bench=.
goos: linux
goarch: arm64
pkg: miniflux.app/v2/internal/reader/readability
BenchmarkExtractContent-8 56 83130924 ns/op
BenchmarkGetWeight-8 246541 5241 ns/op
PASS
ok miniflux.app/v2/internal/reader/readability 6.026s
```

This should make ProcessFeedEntries marginally faster, while saving some memory.
---
 internal/reader/readability/readability.go | 34 +++++++++++--------
 .../reader/readability/readability_test.go | 16 +++++++++
 2 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go
index 4ba4e0de..50c7bb30 100644
--- a/internal/reader/readability/readability.go
+++ b/internal/reader/readability/readability.go
@@ -27,8 +27,8 @@ var (
 	maybeCandidate = [...]string{"and", "article", "body", "column", "main", "shadow"}
 	unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"}
 
-	negativeRegexp = regexp.MustCompile(`hid|banner|combx|comment|com-|contact|foot|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby`)
-	positiveRegexp = regexp.MustCompile(`article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
+	positive = [...]string{"article", "blog", "body", "content", "entry", "h-entry", "hentry", "main", "page", "pagination", "post", "story", "text"}
+	negative = [...]string{"author", "banner", "byline", "com-", "combx", "comment", "contact", "dateline", "foot", "hid", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shopping", "shoutbox", "sidebar", "skyscraper", "sponsor", "tags", "tool", "widget", "writtenby"}
 )
 
 type candidate struct {
@@ -303,26 +303,30 @@ func getClassWeight(s *goquery.Selection) float32 {
 	weight := 0
 
 	if class, ok := s.Attr("class"); ok {
-		class = strings.ToLower(class)
-		if negativeRegexp.MatchString(class) {
-			weight -= 25
-		} else if positiveRegexp.MatchString(class) {
-			weight += 25
-		}
+		weight += getWeight(class)
 	}
-
 	if id, ok := s.Attr("id"); ok {
-		id = strings.ToLower(id)
-		if negativeRegexp.MatchString(id) {
-			weight -= 25
-		} else if positiveRegexp.MatchString(id) {
-			weight += 25
-		}
+		weight += getWeight(id)
 	}
 
 	return float32(weight)
 }
 
+func getWeight(s string) int {
+	s = strings.ToLower(s)
+	for _, pos := range negative {
+		if strings.Contains(s, pos) {
+			return -25
+		}
+	}
+	for _, pos := range positive {
+		if strings.Contains(s, pos) {
+			return +25
+		}
+	}
+	return 0
+}
+
 func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
 	document.Find("div").Each(func(i int, s *goquery.Selection) {
 		html, _ := s.Html()
diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go
index 088054e7..cf3188e6 100644
--- a/internal/reader/readability/readability_test.go
+++ b/internal/reader/readability/readability_test.go
@@ -1314,3 +1314,19 @@ func TestContainsSentence(t *testing.T) {
 		})
 	}
 }
+
+func BenchmarkGetWeight(b *testing.B) {
+	testCases := []string{
+		"p-3 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content",
+		"d-flex flex-column mb-3",
+		"AppHeader-search-control AppHeader-search-control-overflow",
+		"Button Button--iconOnly Button--invisible Button--medium mr-1 px-2 py-0 d-flex flex-items-center rounded-1 color-fg-muted",
+		"sr-only",
+		"validation-12753bbc-b4d1-4e10-bec6-92e585d1699d",
+	}
+	for range b.N {
+		for _, v := range testCases {
+			getWeight(v)
+		}
+	}
+}