Mirror of https://github.com/miniflux/v2.git (synced 2025-06-27 16:36:00 +00:00)
feat: remove well-known URL parameter trackers
Commit c0f6e32a99 (parent 11cafec863)
5 changed files with 252 additions and 17 deletions
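
In short, the commit adds a new urlcleaner package and wires it into both the feed processor and the HTML sanitizer, so that well-known tracking parameters (utm_*, fbclid, gclid, and similar) are stripped from entry URLs and from URLs inside entry content. As a minimal sketch of the resulting behavior, based on the RemoveTrackingParameters function and the test cases shown further below (the package lives under internal/, so this only compiles from within the miniflux module):

package main

import (
	"fmt"

	"miniflux.app/v2/internal/reader/urlcleaner"
)

func main() {
	// utm_source and fbclid are on the tracker list added by this commit; id survives.
	cleaned, err := urlcleaner.RemoveTrackingParameters("https://example.com/page?id=123&utm_source=newsletter&fbclid=abc123")
	if err != nil {
		panic(err)
	}
	fmt.Println(cleaned) // Output: https://example.com/page?id=123
}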
internal/reader/processor/processor.go

@@ -21,6 +21,7 @@ import (
 	"miniflux.app/v2/internal/reader/rewrite"
 	"miniflux.app/v2/internal/reader/sanitizer"
 	"miniflux.app/v2/internal/reader/scraper"
+	"miniflux.app/v2/internal/reader/urlcleaner"
 	"miniflux.app/v2/internal/storage"

 	"github.com/PuerkitoBio/goquery"
@@ -56,7 +57,11 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.User…
 			continue
 		}

-		websiteURL := getUrlFromEntry(feed, entry)
+		if cleanedURL, err := urlcleaner.RemoveTrackingParameters(entry.URL); err == nil {
+			entry.URL = cleanedURL
+		}
+
+		rewrittenURL := rewriteEntryURL(feed, entry)
 		entryIsNew := store.IsNewEntry(feed.ID, entry.Hash)
 		if feed.Crawler && (entryIsNew || forceRefresh) {
 			slog.Debug("Scraping entry",
@@ -68,7 +73,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.User…
 				slog.String("feed_url", feed.FeedURL),
 				slog.Bool("entry_is_new", entryIsNew),
 				slog.Bool("force_refresh", forceRefresh),
-				slog.String("website_url", websiteURL),
+				slog.String("rewritten_url", rewrittenURL),
 			)

 			startTime := time.Now()
@@ -84,7 +89,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.User…

 			content, scraperErr := scraper.ScrapeWebsite(
 				requestBuilder,
-				websiteURL,
+				rewrittenURL,
 				feed.ScraperRules,
 			)

@@ -110,10 +115,10 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.User…
 			}
 		}

-		rewrite.Rewriter(websiteURL, entry, feed.RewriteRules)
+		rewrite.Rewriter(rewrittenURL, entry, feed.RewriteRules)

-		// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
-		entry.Content = sanitizer.Sanitize(websiteURL, entry.Content)
+		// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered out.
+		entry.Content = sanitizer.Sanitize(rewrittenURL, entry.Content)

 		updateEntryReadingTime(store, feed, entry, entryIsNew, user)
 		filteredEntries = append(filteredEntries, entry)
@@ -264,7 +269,7 @@ func isAllowedEntry(feed *model.Feed, entry *model.Entry, user *model.User) bool
 // ProcessEntryWebPage downloads the entry web page and apply rewrite rules.
 func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User) error {
 	startTime := time.Now()
-	websiteURL := getUrlFromEntry(feed, entry)
+	rewrittenEntryURL := rewriteEntryURL(feed, entry)

 	requestBuilder := fetcher.NewRequestBuilder()
 	requestBuilder.WithUserAgent(feed.UserAgent, config.Opts.HTTPClientUserAgent())
@@ -277,7 +282,7 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User) error {

 	content, scraperErr := scraper.ScrapeWebsite(
 		requestBuilder,
-		websiteURL,
+		rewrittenEntryURL,
 		feed.ScraperRules,
 	)

@@ -300,14 +305,14 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User) error {
 		}
 	}

-	rewrite.Rewriter(websiteURL, entry, entry.Feed.RewriteRules)
-	entry.Content = sanitizer.Sanitize(websiteURL, entry.Content)
+	rewrite.Rewriter(rewrittenEntryURL, entry, entry.Feed.RewriteRules)
+	entry.Content = sanitizer.Sanitize(rewrittenEntryURL, entry.Content)

 	return nil
 }

-func getUrlFromEntry(feed *model.Feed, entry *model.Entry) string {
-	var url = entry.URL
+func rewriteEntryURL(feed *model.Feed, entry *model.Entry) string {
+	var rewrittenURL = entry.URL
 	if feed.UrlRewriteRules != "" {
 		parts := customReplaceRuleRegex.FindStringSubmatch(feed.UrlRewriteRules)

@@ -318,26 +323,27 @@ func getUrlFromEntry(feed *model.Feed, entry *model.Entry) string {
 					slog.String("url_rewrite_rules", feed.UrlRewriteRules),
 					slog.Any("error", err),
 				)
-				return url
+				return rewrittenURL
 			}
-			url = re.ReplaceAllString(entry.URL, parts[2])
+			rewrittenURL = re.ReplaceAllString(entry.URL, parts[2])
 			slog.Debug("Rewriting entry URL",
 				slog.String("original_entry_url", entry.URL),
-				slog.String("rewritten_entry_url", url),
+				slog.String("rewritten_entry_url", rewrittenURL),
 				slog.Int64("feed_id", feed.ID),
 				slog.String("feed_url", feed.FeedURL),
 			)
 		} else {
 			slog.Debug("Cannot find search and replace terms for replace rule",
 				slog.String("original_entry_url", entry.URL),
-				slog.String("rewritten_entry_url", url),
+				slog.String("rewritten_entry_url", rewrittenURL),
 				slog.Int64("feed_id", feed.ID),
 				slog.String("feed_url", feed.FeedURL),
 				slog.String("url_rewrite_rules", feed.UrlRewriteRules),
 			)
 		}
 	}
-	return url
+
+	return rewrittenURL
 }

 func updateEntryReadingTime(store *storage.Storage, feed *model.Feed, entry *model.Entry, entryIsNew bool, user *model.User) {
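
Two details worth noting in the processor changes above: tracker removal mutates entry.URL in place, so the cleaned URL is what the rest of the pipeline sees and stores, while the renamed rewriteEntryURL (formerly getUrlFromEntry) only derives the URL used for scraping, rewriting, and sanitizing from the feed's UrlRewriteRules without touching the entry itself. And because the cleaned value is applied only when err == nil, an entry URL that fails to parse is simply left as-is rather than dropped.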
internal/reader/sanitizer/sanitizer.go

@@ -12,6 +12,7 @@ import (
 	"strings"

 	"miniflux.app/v2/internal/config"
+	"miniflux.app/v2/internal/reader/urlcleaner"
 	"miniflux.app/v2/internal/urllib"

 	"golang.org/x/net/html"
@@ -211,6 +212,10 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([…
 			if !hasValidURIScheme(value) || isBlockedResource(value) {
 				continue
 			}

+			if cleanedURL, err := urlcleaner.RemoveTrackingParameters(value); err == nil {
+				value = cleanedURL
+			}
+
 		}
 	}
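
With this hook in sanitizeAttributes, the same cleaner also runs on every URL attribute that survives sanitization (after the scheme and blocklist checks), so trackers are stripped from links and images inside entry content, not just from the entry URL itself. The tests below exercise both the anchor href and the img src cases.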
internal/reader/sanitizer/sanitizer_test.go

@@ -490,6 +490,26 @@ func TestBlacklistedLink(t *testing.T) {
 	}
 }

+func TestLinkWithTrackers(t *testing.T) {
+	input := `<p>This link has trackers <a href="https://example.com/page?utm_source=newsletter">Test</a></p>`
+	expected := `<p>This link has trackers <a href="https://example.com/page" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Test</a></p>`
+	output := Sanitize("http://example.org/", input)
+
+	if expected != output {
+		t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+	}
+}
+
+func TestImageSrcWithTrackers(t *testing.T) {
+	input := `<p>This image has trackers <img src="https://example.org/?id=123&utm_source=newsletter&utm_medium=email&fbclid=abc123"></p>`
+	expected := `<p>This image has trackers <img src="https://example.org/?id=123" loading="lazy"></p>`
+	output := Sanitize("http://example.org/", input)
+
+	if expected != output {
+		t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+	}
+}
+
 func TestPixelTracker(t *testing.T) {
 	input := `<p><img src="https://tracker1.example.org/" height="1" width="1"> and <img src="https://tracker2.example.org/" height="1" width="1"/></p>`
 	expected := `<p> and </p>`
internal/reader/urlcleaner/urlcleaner.go (new file, 96 lines)
@@ -0,0 +1,96 @@
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package urlcleaner // import "miniflux.app/v2/internal/reader/urlcleaner"

import (
	"fmt"
	"net/url"
	"strings"
)

// Interesting lists:
// https://raw.githubusercontent.com/AdguardTeam/AdguardFilters/master/TrackParamFilter/sections/general_url.txt
// https://firefox.settings.services.mozilla.com/v1/buckets/main/collections/query-stripping/records
var trackingParams = map[string]bool{
	// https://en.wikipedia.org/wiki/UTM_parameters#Parameters
	"utm_source":   true,
	"utm_medium":   true,
	"utm_campaign": true,
	"utm_term":     true,
	"utm_content":  true,

	// Facebook Click Identifiers
	"fbclid":    true,
	"_openstat": true,

	// Google Click Identifiers
	"gclid":  true,
	"dclid":  true,
	"gbraid": true,
	"wbraid": true,

	// Yandex Click Identifiers
	"yclid":  true,
	"ysclid": true,

	// Twitter Click Identifier
	"twclid": true,

	// Microsoft Click Identifier
	"msclkid": true,

	// Mailchimp Click Identifiers
	"mc_cid": true,
	"mc_eid": true,

	// Wicked Reports click tracking
	"wickedid": true,

	// Hubspot Click Identifiers
	"hsa_cam":       true,
	"_hsenc":        true,
	"__hssc":        true,
	"__hstc":        true,
	"__hsfp":        true,
	"hsctatracking": true,

	// Olytics
	"rb_clickid":  true,
	"oly_anon_id": true,
	"oly_enc_id":  true,

	// Vero Click Identifier
	"vero_id": true,

	// Marketo email tracking
	"mkt_tok": true,
}

func RemoveTrackingParameters(inputURL string) (string, error) {
	parsedURL, err := url.Parse(inputURL)
	if err != nil {
		return "", fmt.Errorf("urlcleaner: error parsing URL: %v", err)
	}

	if !strings.HasPrefix(parsedURL.Scheme, "http") {
		return inputURL, nil
	}

	queryParams := parsedURL.Query()

	// Remove tracking parameters
	for param := range queryParams {
		if trackingParams[strings.ToLower(param)] {
			queryParams.Del(param)
		}
	}

	parsedURL.RawQuery = queryParams.Encode()

	// Remove trailing "?" if query string is empty
	cleanedURL := parsedURL.String()
	cleanedURL = strings.TrimSuffix(cleanedURL, "?")

	return cleanedURL, nil
}
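
Two behaviors of RemoveTrackingParameters that are easy to miss in the listing above: parameter names are matched case-insensitively via strings.ToLower, and non-HTTP(S) schemes are returned unchanged because of the strings.HasPrefix(parsedURL.Scheme, "http") guard. A small sketch (again only compilable from inside the module, since the package is internal):

package main

import (
	"fmt"

	"miniflux.app/v2/internal/reader/urlcleaner"
)

func main() {
	// Mixed-case tracker names are still stripped, and the now-empty
	// query string loses its trailing "?".
	out, _ := urlcleaner.RemoveTrackingParameters("https://example.com/page?UTM_SOURCE=newsletter")
	fmt.Println(out) // Output: https://example.com/page

	// Non-HTTP URLs pass through untouched.
	out, _ = urlcleaner.RemoveTrackingParameters("mailto:user@example.org")
	fmt.Println(out) // Output: mailto:user@example.org
}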
internal/reader/urlcleaner/urlcleaner_test.go (new file, 108 lines)

@@ -0,0 +1,108 @@
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package urlcleaner // import "miniflux.app/v2/internal/reader/urlcleaner"

import (
	"net/url"
	"reflect"
	"testing"
)

func TestRemoveTrackingParams(t *testing.T) {
	tests := []struct {
		name     string
		input    string
		expected string
	}{
		{
			name:     "URL with tracking parameters",
			input:    "https://example.com/page?id=123&utm_source=newsletter&utm_medium=email&fbclid=abc123",
			expected: "https://example.com/page?id=123",
		},
		{
			name:     "URL with only tracking parameters",
			input:    "https://example.com/page?utm_source=newsletter&utm_medium=email",
			expected: "https://example.com/page",
		},
		{
			name:     "URL with no tracking parameters",
			input:    "https://example.com/page?id=123&foo=bar",
			expected: "https://example.com/page?id=123&foo=bar",
		},
		{
			name:     "URL with no parameters",
			input:    "https://example.com/page",
			expected: "https://example.com/page",
		},
		{
			name:     "URL with mixed case tracking parameters",
			input:    "https://example.com/page?UTM_SOURCE=newsletter&utm_MEDIUM=email",
			expected: "https://example.com/page",
		},
		{
			name:     "URL with tracking parameters and fragments",
			input:    "https://example.com/page?id=123&utm_source=newsletter#section1",
			expected: "https://example.com/page?id=123#section1",
		},
		{
			name:     "URL with only tracking parameters and fragments",
			input:    "https://example.com/page?utm_source=newsletter#section1",
			expected: "https://example.com/page#section1",
		},
		{
			name:     "URL with only one tracking parameter",
			input:    "https://example.com/page?utm_source=newsletter",
			expected: "https://example.com/page",
		},
		{
			name:     "URL with encoded characters",
			input:    "https://example.com/page?name=John%20Doe&utm_source=newsletter",
			expected: "https://example.com/page?name=John+Doe",
		},
		{
			name:     "Invalid URL",
			input:    "https://example|org/",
			expected: "",
		},
		{
			name:     "Non-HTTP URL",
			input:    "mailto:user@example.org",
			expected: "mailto:user@example.org",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result, err := RemoveTrackingParameters(tt.input)
			if tt.expected == "" {
				if err == nil {
					t.Errorf("Expected an error for invalid URL, but got none")
				}
			} else {
				if err != nil {
					t.Errorf("Unexpected error: %v", err)
				}
				if !urlsEqual(result, tt.expected) {
					t.Errorf("removeTrackingParams(%q) = %q, want %q", tt.input, result, tt.expected)
				}
			}
		})
	}
}

// urlsEqual compares two URLs for equality, ignoring the order of query parameters
func urlsEqual(url1, url2 string) bool {
	u1, err1 := url.Parse(url1)
	u2, err2 := url.Parse(url2)

	if err1 != nil || err2 != nil {
		return false
	}

	if u1.Scheme != u2.Scheme || u1.Host != u2.Host || u1.Path != u2.Path || u1.Fragment != u2.Fragment {
		return false
	}

	return reflect.DeepEqual(u1.Query(), u2.Query())
}
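
Note why the urlsEqual helper exists: url.Values.Encode rebuilds the query string with keys sorted alphabetically and spaces encoded as "+", so a cleaned URL can differ byte-for-byte from its input even when no tracker was removed (the "URL with encoded characters" case expects John+Doe for exactly this reason). Comparing parsed components keeps the tests robust to that re-encoding. The new package's tests can be run in isolation with something like go test ./internal/reader/urlcleaner/ from the repository root.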