1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00

perf(reader): optimize RemoveTrackingParameters

A bit more than 10% of processor.ProcessFeedEntries' CPU time is spent in
urlcleaner.RemoveTrackingParameters, specifically calling url.Parse, so let's
extract this operation outside of it, and do it once before calling
urlcleaner.RemoveTrackingParameters multiple times.

Co-authored-by: Frédéric Guillot <f@miniflux.net>
This commit is contained in:
jvoisin 2025-06-09 14:08:55 +02:00
parent d59990f1dd
commit ae64fd7e4b
4 changed files with 23 additions and 28 deletions

View file

@@ -5,6 +5,7 @@ package processor // import "miniflux.app/v2/internal/reader/processor"

 import (
 	"log/slog"
+	"net/url"
 	"regexp"
 	"time"
@@ -36,6 +37,10 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64,
 		return
 	}

+	// The errors are handled in RemoveTrackingParameters.
+	parsedFeedURL, _ := url.Parse(feed.FeedURL)
+	parsedSiteURL, _ := url.Parse(feed.SiteURL)
+
 	// Process older entries first
 	for i := len(feed.Entries) - 1; i >= 0; i-- {
 		entry := feed.Entries[i]
@@ -52,7 +57,8 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64,
 			continue
 		}

-		if cleanedURL, err := urlcleaner.RemoveTrackingParameters(feed.FeedURL, feed.SiteURL, entry.URL); err == nil {
+		parsedInputUrl, _ := url.Parse(entry.URL)
+		if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedFeedURL, parsedSiteURL, parsedInputUrl); err == nil {
 			entry.URL = cleanedURL
 		}

View file

@@ -227,6 +227,8 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute, sa
 		isImageLargerThanLayout = imgWidth > 750
 	}

+	parsedBaseUrl, _ := url.Parse(baseURL)
+
 	for _, attribute := range attributes {
 		value := attribute.Val
@ -267,7 +269,8 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute, sa
} }
// TODO use feedURL instead of baseURL twice. // TODO use feedURL instead of baseURL twice.
if cleanedURL, err := urlcleaner.RemoveTrackingParameters(baseURL, baseURL, value); err == nil { parsedValueUrl, _ := url.Parse(value)
if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedBaseUrl, parsedBaseUrl, parsedValueUrl); err == nil {
value = cleanedURL value = cleanedURL
} }
} }

View file

@@ -95,26 +95,12 @@ var trackingParamsOutbound = map[string]bool{
 	"ref": true,
 }

-func RemoveTrackingParameters(baseUrl, feedUrl, inputURL string) (string, error) {
-	parsedURL, err := url.Parse(inputURL)
-	if err != nil {
-		return "", fmt.Errorf("urlcleaner: error parsing URL: %v", err)
+func RemoveTrackingParameters(parsedFeedURL, parsedSiteURL, parsedInputUrl *url.URL) (string, error) {
+	if parsedFeedURL == nil || parsedSiteURL == nil || parsedInputUrl == nil {
+		return "", fmt.Errorf("urlcleaner: one of the URLs is nil")
 	}

-	if !strings.HasPrefix(parsedURL.Scheme, "http") {
-		return inputURL, nil
-	}
-
-	parsedBaseUrl, err := url.Parse(baseUrl)
-	if err != nil {
-		return "", fmt.Errorf("urlcleaner: error parsing base URL: %v", err)
-	}
-
-	parsedFeedUrl, err := url.Parse(feedUrl)
-	if err != nil {
-		return "", fmt.Errorf("urlcleaner: error parsing feed URL: %v", err)
-	}
-
-	queryParams := parsedURL.Query()
+	queryParams := parsedInputUrl.Query()
 	hasTrackers := false

 	// Remove tracking parameters
@@ -127,7 +113,7 @@ func RemoveTrackingParameters(parsedFeedURL, parsedSiteURL, parsedInputUrl *url.
 		if trackingParamsOutbound[lowerParam] {
 			// handle duplicate parameters like ?a=b&a=c&a=d…
 			for _, value := range queryParams[param] {
-				if value == parsedBaseUrl.Hostname() || value == parsedFeedUrl.Hostname() {
+				if value == parsedFeedURL.Hostname() || value == parsedSiteURL.Hostname() {
 					queryParams.Del(param)
 					hasTrackers = true
 					break
@@ -138,14 +124,11 @@ func RemoveTrackingParameters(parsedFeedURL, parsedSiteURL, parsedInputUrl *url.

 	// Do not modify the URL if there are no tracking parameters
 	if !hasTrackers {
-		return inputURL, nil
+		return parsedInputUrl.String(), nil
 	}

-	parsedURL.RawQuery = queryParams.Encode()
-
-	// Remove trailing "?" if query string is empty
-	cleanedURL := parsedURL.String()
-	cleanedURL = strings.TrimSuffix(cleanedURL, "?")
+	parsedInputUrl.RawQuery = queryParams.Encode()
+	cleanedURL := strings.TrimSuffix(parsedInputUrl.String(), "?")

 	return cleanedURL, nil
 }

View file

@@ -121,7 +121,10 @@ func TestRemoveTrackingParams(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			result, err := RemoveTrackingParameters(tt.baseUrl, tt.feedUrl, tt.input)
+			parsedBaseUrl, _ := url.Parse(tt.baseUrl)
+			parsedFeedUrl, _ := url.Parse(tt.feedUrl)
+			parsedInputUrl, _ := url.Parse(tt.input)
+			result, err := RemoveTrackingParameters(parsedBaseUrl, parsedFeedUrl, parsedInputUrl)
 			if tt.expected == "" {
 				if err == nil {
 					t.Errorf("Expected an error for invalid URL, but got none")