diff --git a/internal/reader/processor/processor.go b/internal/reader/processor/processor.go index 7c5bb40f..e2099d2c 100644 --- a/internal/reader/processor/processor.go +++ b/internal/reader/processor/processor.go @@ -52,7 +52,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64, continue } - if cleanedURL, err := urlcleaner.RemoveTrackingParameters(entry.URL); err == nil { + if cleanedURL, err := urlcleaner.RemoveTrackingParameters(feed.FeedURL, feed.SiteURL, entry.URL); err == nil { entry.URL = cleanedURL } diff --git a/internal/reader/sanitizer/sanitizer.go b/internal/reader/sanitizer/sanitizer.go index 6bbffe5e..f9116618 100644 --- a/internal/reader/sanitizer/sanitizer.go +++ b/internal/reader/sanitizer/sanitizer.go @@ -217,7 +217,8 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([ continue } - if cleanedURL, err := urlcleaner.RemoveTrackingParameters(value); err == nil { + // TODO: pass the feed URL as the second argument instead of passing baseURL twice. + if cleanedURL, err := urlcleaner.RemoveTrackingParameters(baseURL, baseURL, value); err == nil { value = cleanedURL } } diff --git a/internal/reader/urlcleaner/urlcleaner.go b/internal/reader/urlcleaner/urlcleaner.go index b38f7780..ea4c91b1 100644 --- a/internal/reader/urlcleaner/urlcleaner.go +++ b/internal/reader/urlcleaner/urlcleaner.go @@ -89,7 +89,13 @@ var trackingParams = map[string]bool{ "_branch_referrer": true, } -func RemoveTrackingParameters(inputURL string) (string, error) { +// Outbound tracking parameters are query parameters that a website appends to its outbound links, carrying the website's own hostname as their value. 
+var trackingParamsOutbound = map[string]bool{ + // Ghost + "ref": true, +} + +func RemoveTrackingParameters(baseUrl, feedUrl, inputURL string) (string, error) { parsedURL, err := url.Parse(inputURL) if err != nil { return "", fmt.Errorf("urlcleaner: error parsing URL: %v", err) @@ -99,6 +105,15 @@ func RemoveTrackingParameters(inputURL string) (string, error) { return inputURL, nil } + parsedBaseUrl, err := url.Parse(baseUrl) + if err != nil { + return "", fmt.Errorf("urlcleaner: error parsing base URL: %v", err) + } + parsedFeedUrl, err := url.Parse(feedUrl) + if err != nil { + return "", fmt.Errorf("urlcleaner: error parsing feed URL: %v", err) + } + queryParams := parsedURL.Query() hasTrackers := false @@ -109,6 +124,16 @@ func RemoveTrackingParameters(inputURL string) (string, error) { queryParams.Del(param) hasTrackers = true } + if trackingParamsOutbound[lowerParam] { + // handle duplicate parameters like ?a=b&a=c&a=d… + for _, value := range queryParams[param] { + if value == parsedBaseUrl.Hostname() || value == parsedFeedUrl.Hostname() { + queryParams.Del(param) + hasTrackers = true + break + } + } + } } // Do not modify the URL if there are no tracking parameters diff --git a/internal/reader/urlcleaner/urlcleaner_test.go b/internal/reader/urlcleaner/urlcleaner_test.go index e2f3f670..4905c4de 100644 --- a/internal/reader/urlcleaner/urlcleaner_test.go +++ b/internal/reader/urlcleaner/urlcleaner_test.go @@ -14,6 +14,8 @@ func TestRemoveTrackingParams(t *testing.T) { name string input string expected string + baseUrl string + feedUrl string strictComparison bool }{ { @@ -62,28 +64,64 @@ func TestRemoveTrackingParams(t *testing.T) { input: "https://example.com/page?name=John%20Doe&utm_source=newsletter", expected: "https://example.com/page?name=John+Doe", }, + { + name: "ref parameter for another url", + input: "https://example.com/page?ref=test.com", + baseUrl: "https://example.com/page", + expected: "https://example.com/page?ref=test.com", + }, + { + name: 
"ref parameter for feed url", + input: "https://example.com/page?ref=feed.com", + baseUrl: "https://example.com/page", + expected: "https://example.com/page", + feedUrl: "http://feed.com", + }, + { + name: "ref parameter for site url", + input: "https://example.com/page?ref=example.com", + baseUrl: "https://example.com/page", + expected: "https://example.com/page", + }, + { + name: "ref parameter for base url", + input: "https://example.com/page?ref=example.com", + expected: "https://example.com/page", + baseUrl: "https://example.com", + feedUrl: "https://feedburned.com/example", + }, + { + name: "ref parameter for base url on subdomain", + input: "https://blog.exploits.club/some-path?ref=blog.exploits.club", + expected: "https://blog.exploits.club/some-path", + baseUrl: "https://blog.exploits.club/some-path", + feedUrl: "https://feedburned.com/exploit.club", + }, { name: "Non-standard URL parameter with no tracker", input: "https://example.com/foo.jpg?crop/1420x708/format/webp", expected: "https://example.com/foo.jpg?crop/1420x708/format/webp", + baseUrl: "https://example.com/page", strictComparison: true, }, { name: "Invalid URL", input: "https://example|org/", + baseUrl: "https://example.com/page", expected: "", }, { name: "Non-HTTP URL", input: "mailto:user@example.org", expected: "mailto:user@example.org", + baseUrl: "https://example.com/page", strictComparison: true, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - result, err := RemoveTrackingParameters(tt.input) + result, err := RemoveTrackingParameters(tt.baseUrl, tt.feedUrl, tt.input) if tt.expected == "" { if err == nil { t.Errorf("Expected an error for invalid URL, but got none")