From ae64fd7e4b09df901f3fc5295c248fc26cf465df Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Mon, 9 Jun 2025 14:08:55 +0200
Subject: [PATCH] perf(reader): optimize RemoveTrackingParameters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A bit more than 10% of processor.ProcessFeedEntries' CPU time is spent
in urlcleaner.RemoveTrackingParameters, specifically calling url.Parse,
so let's extract this operation outside of it, and do it once before
calling urlcleaner.RemoveTrackingParameters multiple times.

Co-authored-by: Frédéric Guillot
---
 internal/reader/processor/processor.go        |  8 ++++-
 internal/reader/sanitizer/sanitizer.go        |  5 ++-
 internal/reader/urlcleaner/urlcleaner.go      | 33 +++++--------------
 internal/reader/urlcleaner/urlcleaner_test.go |  5 ++-
 4 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/internal/reader/processor/processor.go b/internal/reader/processor/processor.go
index 0915d33c..d45f54c8 100644
--- a/internal/reader/processor/processor.go
+++ b/internal/reader/processor/processor.go
@@ -5,6 +5,7 @@ package processor // import "miniflux.app/v2/internal/reader/processor"
 
 import (
 	"log/slog"
+	"net/url"
 	"regexp"
 	"time"
 
@@ -36,6 +37,10 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64,
 		return
 	}
 
+	// The errors are handled in RemoveTrackingParameters.
+	parsedFeedURL, _ := url.Parse(feed.FeedURL)
+	parsedSiteURL, _ := url.Parse(feed.SiteURL)
+
 	// Process older entries first
 	for i := len(feed.Entries) - 1; i >= 0; i-- {
 		entry := feed.Entries[i]
@@ -52,7 +57,8 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64,
 			continue
 		}
 
-		if cleanedURL, err := urlcleaner.RemoveTrackingParameters(feed.FeedURL, feed.SiteURL, entry.URL); err == nil {
+		parsedInputUrl, _ := url.Parse(entry.URL)
+		if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedFeedURL, parsedSiteURL, parsedInputUrl); err == nil {
 			entry.URL = cleanedURL
 		}
 
diff --git a/internal/reader/sanitizer/sanitizer.go b/internal/reader/sanitizer/sanitizer.go
index 28b9d94b..25773a48 100644
--- a/internal/reader/sanitizer/sanitizer.go
+++ b/internal/reader/sanitizer/sanitizer.go
@@ -227,6 +227,8 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute, sa
 		isImageLargerThanLayout = imgWidth > 750
 	}
 
+	parsedBaseUrl, _ := url.Parse(baseURL)
+
 	for _, attribute := range attributes {
 		value := attribute.Val
 
@@ -267,7 +269,8 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute, sa
 			}
 
 			// TODO use feedURL instead of baseURL twice.
-			if cleanedURL, err := urlcleaner.RemoveTrackingParameters(baseURL, baseURL, value); err == nil {
+			parsedValueUrl, _ := url.Parse(value)
+			if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedBaseUrl, parsedBaseUrl, parsedValueUrl); err == nil {
 				value = cleanedURL
 			}
 		}
diff --git a/internal/reader/urlcleaner/urlcleaner.go b/internal/reader/urlcleaner/urlcleaner.go
index ea4c91b1..cc0e29da 100644
--- a/internal/reader/urlcleaner/urlcleaner.go
+++ b/internal/reader/urlcleaner/urlcleaner.go
@@ -95,26 +95,12 @@ var trackingParamsOutbound = map[string]bool{
 	"ref": true,
 }
 
-func RemoveTrackingParameters(baseUrl, feedUrl, inputURL string) (string, error) {
-	parsedURL, err := url.Parse(inputURL)
-	if err != nil {
-		return "", fmt.Errorf("urlcleaner: error parsing URL: %v", err)
+func RemoveTrackingParameters(parsedFeedURL, parsedSiteURL, parsedInputUrl *url.URL) (string, error) {
+	if parsedFeedURL == nil || parsedSiteURL == nil || parsedInputUrl == nil {
+		return "", fmt.Errorf("urlcleaner: one of the URLs is nil")
 	}
 
-	if !strings.HasPrefix(parsedURL.Scheme, "http") {
-		return inputURL, nil
-	}
-
-	parsedBaseUrl, err := url.Parse(baseUrl)
-	if err != nil {
-		return "", fmt.Errorf("urlcleaner: error parsing base URL: %v", err)
-	}
-	parsedFeedUrl, err := url.Parse(feedUrl)
-	if err != nil {
-		return "", fmt.Errorf("urlcleaner: error parsing feed URL: %v", err)
-	}
-
-	queryParams := parsedURL.Query()
+	queryParams := parsedInputUrl.Query()
 	hasTrackers := false
 
 	// Remove tracking parameters
@@ -127,7 +113,7 @@ func RemoveTrackingParameters(baseUrl, feedUrl, inputURL string) (string, error)
 		if trackingParamsOutbound[lowerParam] {
 			// handle duplicate parameters like ?a=b&a=c&a=d…
 			for _, value := range queryParams[param] {
-				if value == parsedBaseUrl.Hostname() || value == parsedFeedUrl.Hostname() {
+				if value == parsedFeedURL.Hostname() || value == parsedSiteURL.Hostname() {
 					queryParams.Del(param)
 					hasTrackers = true
 					break
@@ -138,14 +124,11 @@ func RemoveTrackingParameters(baseUrl, feedUrl, inputURL string) (string, error)
 
 	// Do not modify the URL if there are no tracking parameters
 	if !hasTrackers {
-		return inputURL, nil
+		return parsedInputUrl.String(), nil
 	}
 
-	parsedURL.RawQuery = queryParams.Encode()
-
-	// Remove trailing "?" if query string is empty
-	cleanedURL := parsedURL.String()
-	cleanedURL = strings.TrimSuffix(cleanedURL, "?")
+	parsedInputUrl.RawQuery = queryParams.Encode()
+	cleanedURL := strings.TrimSuffix(parsedInputUrl.String(), "?")
 
 	return cleanedURL, nil
 }
diff --git a/internal/reader/urlcleaner/urlcleaner_test.go b/internal/reader/urlcleaner/urlcleaner_test.go
index 4905c4de..099708b7 100644
--- a/internal/reader/urlcleaner/urlcleaner_test.go
+++ b/internal/reader/urlcleaner/urlcleaner_test.go
@@ -121,7 +121,10 @@ func TestRemoveTrackingParams(t *testing.T) {
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			result, err := RemoveTrackingParameters(tt.baseUrl, tt.feedUrl, tt.input)
+			parsedBaseUrl, _ := url.Parse(tt.baseUrl)
+			parsedFeedUrl, _ := url.Parse(tt.feedUrl)
+			parsedInputUrl, _ := url.Parse(tt.input)
+			result, err := RemoveTrackingParameters(parsedBaseUrl, parsedFeedUrl, parsedInputUrl)
 			if tt.expected == "" {
 				if err == nil {
 					t.Errorf("Expected an error for invalid URL, but got none")
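
Reviewer note: after this patch, RemoveTrackingParameters takes pre-parsed
*url.URL values, so each call site owns the parsing and the per-feed URLs
can be hoisted out of the entry loop. A minimal, self-contained sketch of
that calling convention follows; the URLs are made up and removeUTM is a
hypothetical stand-in for the real urlcleaner function (which lives in an
internal package and cannot be imported from outside Miniflux):

    package main

    import (
        "fmt"
        "net/url"
        "strings"
    )

    // removeUTM mimics the refactored signature: it takes pre-parsed URLs
    // and strips a single tracking parameter from the input URL.
    func removeUTM(parsedFeedURL, parsedSiteURL, parsedInputURL *url.URL) (string, error) {
        if parsedFeedURL == nil || parsedSiteURL == nil || parsedInputURL == nil {
            return "", fmt.Errorf("one of the URLs is nil")
        }
        q := parsedInputURL.Query()
        q.Del("utm_source")
        parsedInputURL.RawQuery = q.Encode()
        // Drop the trailing "?" left behind when the query becomes empty.
        return strings.TrimSuffix(parsedInputURL.String(), "?"), nil
    }

    func main() {
        // Hoisted: the feed and site URLs never change between entries,
        // so they are parsed exactly once, outside the loop.
        parsedFeedURL, _ := url.Parse("https://example.org/feed.xml")
        parsedSiteURL, _ := url.Parse("https://example.org/")

        for _, raw := range []string{
            "https://example.org/a?utm_source=rss",
            "https://example.org/b",
        } {
            // Only the per-entry URL still has to be parsed per iteration.
            parsedEntryURL, _ := url.Parse(raw)
            if cleaned, err := removeUTM(parsedFeedURL, parsedSiteURL, parsedEntryURL); err == nil {
                fmt.Println(cleaned)
            }
        }
    }

One design consequence worth noting: like the patched function, the sketch
mutates the parsed input URL's RawQuery in place, so a caller that needs
the original query afterwards should clone the URL before passing it in.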
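The ~10% figure in the commit message comes from profiling
processor.ProcessFeedEntries; anyone wanting to reproduce the effect in
isolation could start from a micro-benchmark along these lines (the
benchmark names and URLs are illustrative, not part of this patch):

    package urlcleaner_test

    import (
        "net/url"
        "testing"
    )

    // Old shape: the feed and site URLs were re-parsed on every call,
    // so three url.Parse calls happened per entry.
    func BenchmarkParsePerEntry(b *testing.B) {
        for i := 0; i < b.N; i++ {
            _, _ = url.Parse("https://example.org/feed.xml")
            _, _ = url.Parse("https://example.org/")
            _, _ = url.Parse("https://example.org/article?utm_source=rss")
        }
    }

    // New shape: the feed and site URLs are parsed once up front; only
    // the entry URL is still parsed per iteration.
    func BenchmarkParseOnce(b *testing.B) {
        parsedFeedURL, _ := url.Parse("https://example.org/feed.xml")
        parsedSiteURL, _ := url.Parse("https://example.org/")
        _, _ = parsedFeedURL, parsedSiteURL
        for i := 0; i < b.N; i++ {
            _, _ = url.Parse("https://example.org/article?utm_source=rss")
        }
    }

Run with "go test -bench=."; the per-entry variant should show roughly
three times the parsing work per iteration, which is the saving this
patch captures for feeds with many entries.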