Mirror of https://github.com/miniflux/v2.git (synced 2025-06-27 16:36:00 +00:00)
perf(reader): optimize RemoveTrackingParameters
A bit more than 10% of processor.ProcessFeedEntries' CPU time is spent in urlcleaner.RemoveTrackingParameters, most of it in url.Parse. Extract the parsing out of that function and do it once, before calling urlcleaner.RemoveTrackingParameters multiple times.

Co-authored-by: Frédéric Guillot <f@miniflux.net>
Parent: 0caadf82f2
Commit: 7c857bdc72

4 changed files with 23 additions and 28 deletions
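To put the commit message in context, here is a minimal, standalone benchmark sketch (not part of this commit; the package name, URLs, and entry list are made up) contrasting the old shape, which re-parses the feed and site URLs for every entry, with the new shape, which parses them once per feed:

package bench

import (
	"net/url"
	"testing"
)

// Hypothetical entry URLs; a real feed would contain many more.
var entryURLs = []string{
	"https://example.org/a?utm_source=rss",
	"https://example.org/b?ref=example.org",
	"https://example.org/c",
}

// Old shape: the feed and site URLs are re-parsed for every entry.
func BenchmarkParsePerEntry(b *testing.B) {
	for i := 0; i < b.N; i++ {
		for _, raw := range entryURLs {
			url.Parse("https://example.org/feed.xml")
			url.Parse("https://example.org")
			url.Parse(raw)
		}
	}
}

// New shape: the feed and site URLs are parsed once; only entry URLs are parsed per entry.
func BenchmarkParseOncePerFeed(b *testing.B) {
	for i := 0; i < b.N; i++ {
		url.Parse("https://example.org/feed.xml")
		url.Parse("https://example.org")
		for _, raw := range entryURLs {
			url.Parse(raw)
		}
	}
}

Run with "go test -bench=." in the directory holding the sketch; the gap between the two benchmarks grows with the number of entries per feed.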
internal/reader/processor:

@@ -5,6 +5,7 @@ package processor // import "miniflux.app/v2/internal/reader/processor"
 
 import (
 	"log/slog"
+	"net/url"
 	"regexp"
 	"time"
 
@@ -36,6 +37,10 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64,
 		return
 	}
 
+	// The errors are handled in RemoveTrackingParameters.
+	parsedFeedURL, _ := url.Parse(feed.FeedURL)
+	parsedSiteURL, _ := url.Parse(feed.SiteURL)
+
 	// Process older entries first
 	for i := len(feed.Entries) - 1; i >= 0; i-- {
 		entry := feed.Entries[i]
@@ -52,7 +57,8 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64,
 			continue
 		}
 
-		if cleanedURL, err := urlcleaner.RemoveTrackingParameters(feed.FeedURL, feed.SiteURL, entry.URL); err == nil {
+		parsedInputUrl, _ := url.Parse(entry.URL)
+		if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedFeedURL, parsedSiteURL, parsedInputUrl); err == nil {
 			entry.URL = cleanedURL
 		}
 
internal/reader/sanitizer:

@@ -227,6 +227,8 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute, sa
 			isImageLargerThanLayout = imgWidth > 750
 		}
 
+	parsedBaseUrl, _ := url.Parse(baseURL)
+
 	for _, attribute := range attributes {
 		value := attribute.Val
 
@@ -283,7 +285,8 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute, sa
 		}
 
 		// TODO use feedURL instead of baseURL twice.
-		if cleanedURL, err := urlcleaner.RemoveTrackingParameters(baseURL, baseURL, value); err == nil {
+		parsedValueUrl, _ := url.Parse(value)
+		if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedBaseUrl, parsedBaseUrl, parsedValueUrl); err == nil {
 			value = cleanedURL
 		}
 	}
internal/reader/urlcleaner:

@@ -95,26 +95,12 @@ var trackingParamsOutbound = map[string]bool{
 	"ref": true,
 }
 
-func RemoveTrackingParameters(baseUrl, feedUrl, inputURL string) (string, error) {
-	parsedURL, err := url.Parse(inputURL)
-	if err != nil {
-		return "", fmt.Errorf("urlcleaner: error parsing URL: %v", err)
+func RemoveTrackingParameters(parsedFeedURL, parsedSiteURL, parsedInputUrl *url.URL) (string, error) {
+	if parsedFeedURL == nil || parsedSiteURL == nil || parsedInputUrl == nil {
+		return "", fmt.Errorf("urlcleaner: one of the URLs is nil")
 	}
 
-	if !strings.HasPrefix(parsedURL.Scheme, "http") {
-		return inputURL, nil
-	}
-
-	parsedBaseUrl, err := url.Parse(baseUrl)
-	if err != nil {
-		return "", fmt.Errorf("urlcleaner: error parsing base URL: %v", err)
-	}
-	parsedFeedUrl, err := url.Parse(feedUrl)
-	if err != nil {
-		return "", fmt.Errorf("urlcleaner: error parsing feed URL: %v", err)
-	}
-
-	queryParams := parsedURL.Query()
+	queryParams := parsedInputUrl.Query()
 	hasTrackers := false
 
 	// Remove tracking parameters
@@ -127,7 +113,7 @@ func RemoveTrackingParameters(baseUrl, feedUrl, inputURL string) (string, error)
 		if trackingParamsOutbound[lowerParam] {
 			// handle duplicate parameters like ?a=b&a=c&a=d…
 			for _, value := range queryParams[param] {
-				if value == parsedBaseUrl.Hostname() || value == parsedFeedUrl.Hostname() {
+				if value == parsedFeedURL.Hostname() || value == parsedSiteURL.Hostname() {
 					queryParams.Del(param)
 					hasTrackers = true
 					break
@@ -138,14 +124,11 @@ func RemoveTrackingParameters(baseUrl, feedUrl, inputURL string) (string, error)
 
 	// Do not modify the URL if there are no tracking parameters
 	if !hasTrackers {
-		return inputURL, nil
+		return parsedInputUrl.String(), nil
 	}
 
-	parsedURL.RawQuery = queryParams.Encode()
-
-	// Remove trailing "?" if query string is empty
-	cleanedURL := parsedURL.String()
-	cleanedURL = strings.TrimSuffix(cleanedURL, "?")
+	parsedInputUrl.RawQuery = queryParams.Encode()
+	cleanedURL := strings.TrimSuffix(parsedInputUrl.String(), "?")
 
 	return cleanedURL, nil
 }
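With the new signature, callers parse each URL once and pass *url.URL values; a failed url.Parse yields a nil pointer, which RemoveTrackingParameters now rejects with an error instead of parsing and validating strings itself. A small illustrative sketch of the intended call pattern (the helper name is made up; model.Feed and the urlcleaner package are the miniflux types used above):

// cleanEntryURLs parses the feed and site URLs once per feed,
// then reuses the parsed values for every entry.
func cleanEntryURLs(feed *model.Feed) {
	parsedFeedURL, _ := url.Parse(feed.FeedURL) // a parse failure leaves the pointer nil...
	parsedSiteURL, _ := url.Parse(feed.SiteURL) // ...and RemoveTrackingParameters returns an error for nil inputs

	for _, entry := range feed.Entries {
		parsedEntryURL, _ := url.Parse(entry.URL)
		if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedFeedURL, parsedSiteURL, parsedEntryURL); err == nil {
			entry.URL = cleanedURL
		}
	}
}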
internal/reader/urlcleaner (tests):

@@ -121,7 +121,10 @@ func TestRemoveTrackingParams(t *testing.T) {
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			result, err := RemoveTrackingParameters(tt.baseUrl, tt.feedUrl, tt.input)
+			parsedBaseUrl, _ := url.Parse(tt.baseUrl)
+			parsedFeedUrl, _ := url.Parse(tt.feedUrl)
+			parsedInputUrl, _ := url.Parse(tt.input)
+			result, err := RemoveTrackingParameters(parsedBaseUrl, parsedFeedUrl, parsedInputUrl)
 			if tt.expected == "" {
 				if err == nil {
 					t.Errorf("Expected an error for invalid URL, but got none")