From c0f6e32a990b701d62e6a7186ce58b0c86bb545e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?=
Date: Fri, 19 Jul 2024 21:03:33 -0700
Subject: [PATCH] feat: remove well-known URL parameter trackers

---
 internal/reader/processor/processor.go        |  40 ++++---
 internal/reader/sanitizer/sanitizer.go        |   5 +
 internal/reader/sanitizer/sanitizer_test.go   |  20 ++++
 internal/reader/urlcleaner/urlcleaner.go      |  96 ++++++++++++++++
 internal/reader/urlcleaner/urlcleaner_test.go | 108 ++++++++++++++++++
 5 files changed, 252 insertions(+), 17 deletions(-)
 create mode 100644 internal/reader/urlcleaner/urlcleaner.go
 create mode 100644 internal/reader/urlcleaner/urlcleaner_test.go

diff --git a/internal/reader/processor/processor.go b/internal/reader/processor/processor.go
index 06a5cbc5..30679d11 100644
--- a/internal/reader/processor/processor.go
+++ b/internal/reader/processor/processor.go
@@ -21,6 +21,7 @@ import (
 	"miniflux.app/v2/internal/reader/rewrite"
 	"miniflux.app/v2/internal/reader/sanitizer"
 	"miniflux.app/v2/internal/reader/scraper"
+	"miniflux.app/v2/internal/reader/urlcleaner"
 	"miniflux.app/v2/internal/storage"
 
 	"github.com/PuerkitoBio/goquery"
@@ -56,7 +57,11 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 			continue
 		}
 
-		websiteURL := getUrlFromEntry(feed, entry)
+		if cleanedURL, err := urlcleaner.RemoveTrackingParameters(entry.URL); err == nil {
+			entry.URL = cleanedURL
+		}
+
+		rewrittenURL := rewriteEntryURL(feed, entry)
 		entryIsNew := store.IsNewEntry(feed.ID, entry.Hash)
 		if feed.Crawler && (entryIsNew || forceRefresh) {
 			slog.Debug("Scraping entry",
@@ -68,7 +73,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 				slog.String("feed_url", feed.FeedURL),
 				slog.Bool("entry_is_new", entryIsNew),
 				slog.Bool("force_refresh", forceRefresh),
-				slog.String("website_url", websiteURL),
+				slog.String("rewritten_url", rewrittenURL),
 			)
 
 			startTime := time.Now()
@@ -84,7 +89,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 
 			content, scraperErr := scraper.ScrapeWebsite(
 				requestBuilder,
-				websiteURL,
+				rewrittenURL,
 				feed.ScraperRules,
 			)
 
@@ -110,10 +115,10 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 			}
 		}
 
-		rewrite.Rewriter(websiteURL, entry, feed.RewriteRules)
+		rewrite.Rewriter(rewrittenURL, entry, feed.RewriteRules)
 
-		// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
-		entry.Content = sanitizer.Sanitize(websiteURL, entry.Content)
+		// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered out.
+		entry.Content = sanitizer.Sanitize(rewrittenURL, entry.Content)
 
 		updateEntryReadingTime(store, feed, entry, entryIsNew, user)
 		filteredEntries = append(filteredEntries, entry)
@@ -264,7 +269,7 @@ func isAllowedEntry(feed *model.Feed, entry *model.Entry, user *model.User) bool
 // ProcessEntryWebPage downloads the entry web page and apply rewrite rules.
 func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User) error {
 	startTime := time.Now()
-	websiteURL := getUrlFromEntry(feed, entry)
+	rewrittenEntryURL := rewriteEntryURL(feed, entry)
 
 	requestBuilder := fetcher.NewRequestBuilder()
 	requestBuilder.WithUserAgent(feed.UserAgent, config.Opts.HTTPClientUserAgent())
@@ -277,7 +282,7 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
 
 	content, scraperErr := scraper.ScrapeWebsite(
 		requestBuilder,
-		websiteURL,
+		rewrittenEntryURL,
 		feed.ScraperRules,
 	)
 
@@ -300,14 +305,14 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
 		}
 	}
 
-	rewrite.Rewriter(websiteURL, entry, entry.Feed.RewriteRules)
-	entry.Content = sanitizer.Sanitize(websiteURL, entry.Content)
+	rewrite.Rewriter(rewrittenEntryURL, entry, entry.Feed.RewriteRules)
+	entry.Content = sanitizer.Sanitize(rewrittenEntryURL, entry.Content)
 
 	return nil
 }
 
-func getUrlFromEntry(feed *model.Feed, entry *model.Entry) string {
-	var url = entry.URL
+func rewriteEntryURL(feed *model.Feed, entry *model.Entry) string {
+	var rewrittenURL = entry.URL
 
 	if feed.UrlRewriteRules != "" {
 		parts := customReplaceRuleRegex.FindStringSubmatch(feed.UrlRewriteRules)
@@ -318,26 +323,27 @@ func getUrlFromEntry(feed *model.Feed, entry *model.Entry) string {
 					slog.String("url_rewrite_rules", feed.UrlRewriteRules),
 					slog.Any("error", err),
 				)
-				return url
+				return rewrittenURL
 			}
-			url = re.ReplaceAllString(entry.URL, parts[2])
+			rewrittenURL = re.ReplaceAllString(entry.URL, parts[2])
 			slog.Debug("Rewriting entry URL",
 				slog.String("original_entry_url", entry.URL),
-				slog.String("rewritten_entry_url", url),
+				slog.String("rewritten_entry_url", rewrittenURL),
 				slog.Int64("feed_id", feed.ID),
 				slog.String("feed_url", feed.FeedURL),
 			)
 		} else {
 			slog.Debug("Cannot find search and replace terms for replace rule",
 				slog.String("original_entry_url", entry.URL),
-				slog.String("rewritten_entry_url", url),
+				slog.String("rewritten_entry_url", rewrittenURL),
 				slog.Int64("feed_id", feed.ID),
 				slog.String("feed_url", feed.FeedURL),
 				slog.String("url_rewrite_rules", feed.UrlRewriteRules),
 			)
 		}
 	}
-	return url
+
+	return rewrittenURL
 }
 
 func updateEntryReadingTime(store *storage.Storage, feed *model.Feed, entry *model.Entry, entryIsNew bool, user *model.User) {
diff --git a/internal/reader/sanitizer/sanitizer.go b/internal/reader/sanitizer/sanitizer.go
index 962111b2..3af138b2 100644
--- a/internal/reader/sanitizer/sanitizer.go
+++ b/internal/reader/sanitizer/sanitizer.go
@@ -12,6 +12,7 @@ import (
 	"strings"
 
 	"miniflux.app/v2/internal/config"
+	"miniflux.app/v2/internal/reader/urlcleaner"
 	"miniflux.app/v2/internal/urllib"
 
 	"golang.org/x/net/html"
@@ -211,6 +212,10 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([
 				if !hasValidURIScheme(value) || isBlockedResource(value) {
 					continue
 				}
+
+				if cleanedURL, err := urlcleaner.RemoveTrackingParameters(value); err == nil {
+					value = cleanedURL
+				}
 			}
 		}
 
diff --git a/internal/reader/sanitizer/sanitizer_test.go b/internal/reader/sanitizer/sanitizer_test.go
index 8289e602..a924c430 100644
--- a/internal/reader/sanitizer/sanitizer_test.go
+++ b/internal/reader/sanitizer/sanitizer_test.go
@@ -490,6 +490,26 @@ func TestBlacklistedLink(t *testing.T) {
 	}
 }
 
+func TestLinkWithTrackers(t *testing.T) {
+	input := `<p>This link has trackers <a href="https://example.com/page?id=123&utm_source=newsletter&utm_medium=email">Test</a></p>`
+	expected := `<p>This link has trackers <a href="https://example.com/page?id=123" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Test</a></p>`
+	output := Sanitize("http://example.org/", input)
+
+	if expected != output {
+		t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+	}
+}
+
+func TestImageSrcWithTrackers(t *testing.T) {
+	input := `<p>This image has trackers <img src="https://example.com/image.png?id=123&utm_source=newsletter&utm_medium=email&fbclid=abc123"></p>`
+	expected := `<p>This image has trackers <img src="https://example.com/image.png?id=123" loading="lazy"></p>`
+	output := Sanitize("http://example.org/", input)
+
+	if expected != output {
+		t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+	}
+}
+
 func TestPixelTracker(t *testing.T) {
 	input := `<p><img src="https://tracker1.example.org/" height="1" width="1"> and <img src="https://tracker2.example.org/" height="1" width="1"/></p>`
 	expected := `<p> and </p>`
diff --git a/internal/reader/urlcleaner/urlcleaner.go b/internal/reader/urlcleaner/urlcleaner.go
new file mode 100644
index 00000000..1c33f637
--- /dev/null
+++ b/internal/reader/urlcleaner/urlcleaner.go
@@ -0,0 +1,96 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package urlcleaner // import "miniflux.app/v2/internal/reader/urlcleaner"
+
+import (
+	"fmt"
+	"net/url"
+	"strings"
+)
+
+// Interesting lists:
+// https://raw.githubusercontent.com/AdguardTeam/AdguardFilters/master/TrackParamFilter/sections/general_url.txt
+// https://firefox.settings.services.mozilla.com/v1/buckets/main/collections/query-stripping/records
+var trackingParams = map[string]bool{
+	// https://en.wikipedia.org/wiki/UTM_parameters#Parameters
+	"utm_source":   true,
+	"utm_medium":   true,
+	"utm_campaign": true,
+	"utm_term":     true,
+	"utm_content":  true,
+
+	// Facebook and Openstat Click Identifiers
+	"fbclid":    true,
+	"_openstat": true,
+
+	// Google Click Identifiers
+	"gclid":  true,
+	"dclid":  true,
+	"gbraid": true,
+	"wbraid": true,
+
+	// Yandex Click Identifiers
+	"yclid":  true,
+	"ysclid": true,
+
+	// Twitter Click Identifier
+	"twclid": true,
+
+	// Microsoft Click Identifier
+	"msclkid": true,
+
+	// Mailchimp Click Identifiers
+	"mc_cid": true,
+	"mc_eid": true,
+
+	// Wicked Reports click tracking
+	"wickedid": true,
+
+	// Hubspot Click Identifiers
+	"hsa_cam":       true,
+	"_hsenc":        true,
+	"__hssc":        true,
+	"__hstc":        true,
+	"__hsfp":        true,
+	"hsctatracking": true,
+
+	// Olytics
+	"rb_clickid":  true,
+	"oly_anon_id": true,
+	"oly_enc_id":  true,
+
+	// Vero Click Identifier
+	"vero_id": true,
+
+	// Marketo email tracking
+	"mkt_tok": true,
+}
+
+func RemoveTrackingParameters(inputURL string) (string, error) {
+	parsedURL, err := url.Parse(inputURL)
+	if err != nil {
+		return "", fmt.Errorf("urlcleaner: error parsing URL: %v", err)
+	}
+
+	if !strings.HasPrefix(parsedURL.Scheme, "http") {
+		return inputURL, nil
+	}
+
+	queryParams := parsedURL.Query()
+
+	// Remove tracking parameters
+	for param := range queryParams {
+		if trackingParams[strings.ToLower(param)] {
+			queryParams.Del(param)
+		}
+	}
+
+	parsedURL.RawQuery = queryParams.Encode()
+
+	// Remove trailing "?" if query string is empty
+	cleanedURL := parsedURL.String()
+	cleanedURL = strings.TrimSuffix(cleanedURL, "?")
+
+	return cleanedURL, nil
+}
diff --git a/internal/reader/urlcleaner/urlcleaner_test.go b/internal/reader/urlcleaner/urlcleaner_test.go
new file mode 100644
index 00000000..c1f754e8
--- /dev/null
+++ b/internal/reader/urlcleaner/urlcleaner_test.go
@@ -0,0 +1,108 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package urlcleaner // import "miniflux.app/v2/internal/reader/urlcleaner"
+
+import (
+	"net/url"
+	"reflect"
+	"testing"
+)
+
+func TestRemoveTrackingParams(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "URL with tracking parameters",
+			input:    "https://example.com/page?id=123&utm_source=newsletter&utm_medium=email&fbclid=abc123",
+			expected: "https://example.com/page?id=123",
+		},
+		{
+			name:     "URL with only tracking parameters",
+			input:    "https://example.com/page?utm_source=newsletter&utm_medium=email",
+			expected: "https://example.com/page",
+		},
+		{
+			name:     "URL with no tracking parameters",
+			input:    "https://example.com/page?id=123&foo=bar",
+			expected: "https://example.com/page?id=123&foo=bar",
+		},
+		{
+			name:     "URL with no parameters",
+			input:    "https://example.com/page",
+			expected: "https://example.com/page",
+		},
+		{
+			name:     "URL with mixed case tracking parameters",
+			input:    "https://example.com/page?UTM_SOURCE=newsletter&utm_MEDIUM=email",
+			expected: "https://example.com/page",
+		},
+		{
+			name:     "URL with tracking parameters and fragments",
+			input:    "https://example.com/page?id=123&utm_source=newsletter#section1",
+			expected: "https://example.com/page?id=123#section1",
+		},
+		{
+			name:     "URL with only tracking parameters and fragments",
+			input:    "https://example.com/page?utm_source=newsletter#section1",
+			expected: "https://example.com/page#section1",
+		},
+		{
+			name:     "URL with only one tracking parameter",
+			input:    "https://example.com/page?utm_source=newsletter",
+			expected: "https://example.com/page",
+		},
+		{
+			name:     "URL with encoded characters",
+			input:    "https://example.com/page?name=John%20Doe&utm_source=newsletter",
+			expected: "https://example.com/page?name=John+Doe",
+		},
+		{
+			name:     "Invalid URL",
+			input:    "https://example|org/",
+			expected: "",
+		},
+		{
+			name:     "Non-HTTP URL",
+			input:    "mailto:user@example.org",
+			expected: "mailto:user@example.org",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := RemoveTrackingParameters(tt.input)
+			if tt.expected == "" {
+				if err == nil {
+					t.Errorf("Expected an error for invalid URL, but got none")
+				}
+			} else {
+				if err != nil {
+					t.Errorf("Unexpected error: %v", err)
+				}
+				if !urlsEqual(result, tt.expected) {
+					t.Errorf("removeTrackingParams(%q) = %q, want %q", tt.input, result, tt.expected)
+				}
+			}
+		})
+	}
+}
+
+// urlsEqual compares two URLs for equality, ignoring the order of query parameters
+func urlsEqual(url1, url2 string) bool {
+	u1, err1 := url.Parse(url1)
+	u2, err2 := url.Parse(url2)
+
+	if err1 != nil || err2 != nil {
+		return false
+	}
+
+	if u1.Scheme != u2.Scheme || u1.Host != u2.Host || u1.Path != u2.Path || u1.Fragment != u2.Fragment {
+		return false
+	}
+
+	return reflect.DeepEqual(u1.Query(), u2.Query())
+}
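
The behaviour added by this patch can be summarised with a small, self-contained sketch. The snippet below is illustrative only and is not part of the commit: it assumes it is compiled inside the miniflux.app/v2 module (internal/reader/urlcleaner cannot be imported from outside it), and the expected outputs are taken from the test cases above.

// urlcleaner usage sketch (hypothetical example, not shipped with the patch).
package main

import (
	"fmt"

	"miniflux.app/v2/internal/reader/urlcleaner"
)

func main() {
	// Known tracking parameters are removed; regular parameters are kept.
	cleaned, err := urlcleaner.RemoveTrackingParameters("https://example.com/page?id=123&utm_source=newsletter&fbclid=abc123")
	if err != nil {
		fmt.Println("invalid URL:", err)
		return
	}
	fmt.Println(cleaned) // https://example.com/page?id=123

	// Non-HTTP URLs are returned unchanged.
	cleaned, _ = urlcleaner.RemoveTrackingParameters("mailto:user@example.org")
	fmt.Println(cleaned) // mailto:user@example.org
}

Because the surviving parameters are re-encoded with url.Values.Encode(), their order in the cleaned URL may differ from the input, which is why the test file compares URLs with urlsEqual rather than with a plain string comparison.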