mirror of
https://github.com/miniflux/v2.git
synced 2025-09-15 18:57:04 +00:00
feat: remove well-known URL parameter trackers
This commit is contained in:
parent
11cafec863
commit
c0f6e32a99
5 changed files with 252 additions and 17 deletions
96
internal/reader/urlcleaner/urlcleaner.go
Normal file
96
internal/reader/urlcleaner/urlcleaner.go
Normal file
|
@ -0,0 +1,96 @@
|
|||
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
package urlcleaner // import "miniflux.app/v2/internal/reader/urlcleaner"
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/url"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// trackingParams is a set of query-parameter names, all spelled in lower
// case, that are considered click or campaign trackers. A name is a member
// of the set when its entry is true.
//
// Lists worth consulting when extending it:
//   - https://raw.githubusercontent.com/AdguardTeam/AdguardFilters/master/TrackParamFilter/sections/general_url.txt
//   - https://firefox.settings.services.mozilla.com/v1/buckets/main/collections/query-stripping/records
var trackingParams = map[string]bool{
	// UTM campaign parameters:
	// https://en.wikipedia.org/wiki/UTM_parameters#Parameters
	"utm_campaign": true,
	"utm_content":  true,
	"utm_medium":   true,
	"utm_source":   true,
	"utm_term":     true,

	// Facebook click identifiers.
	"_openstat": true,
	"fbclid":    true,

	// Google click identifiers.
	"dclid":  true,
	"gbraid": true,
	"gclid":  true,
	"wbraid": true,

	// Yandex click identifiers.
	"yclid":  true,
	"ysclid": true,

	// Twitter click identifier.
	"twclid": true,

	// Microsoft click identifier.
	"msclkid": true,

	// Mailchimp click identifiers.
	"mc_cid": true,
	"mc_eid": true,

	// Wicked Reports click tracking.
	"wickedid": true,

	// Hubspot click identifiers.
	"__hsfp":        true,
	"__hssc":        true,
	"__hstc":        true,
	"_hsenc":        true,
	"hsa_cam":       true,
	"hsctatracking": true,

	// Olytics.
	"oly_anon_id": true,
	"oly_enc_id":  true,
	"rb_clickid":  true,

	// Vero click identifier.
	"vero_id": true,

	// Marketo email tracking.
	"mkt_tok": true,
}
|
||||
|
||||
func RemoveTrackingParameters(inputURL string) (string, error) {
|
||||
parsedURL, err := url.Parse(inputURL)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("urlcleaner: error parsing URL: %v", err)
|
||||
}
|
||||
|
||||
if !strings.HasPrefix(parsedURL.Scheme, "http") {
|
||||
return inputURL, nil
|
||||
}
|
||||
|
||||
queryParams := parsedURL.Query()
|
||||
|
||||
// Remove tracking parameters
|
||||
for param := range queryParams {
|
||||
if trackingParams[strings.ToLower(param)] {
|
||||
queryParams.Del(param)
|
||||
}
|
||||
}
|
||||
|
||||
parsedURL.RawQuery = queryParams.Encode()
|
||||
|
||||
// Remove trailing "?" if query string is empty
|
||||
cleanedURL := parsedURL.String()
|
||||
cleanedURL = strings.TrimSuffix(cleanedURL, "?")
|
||||
|
||||
return cleanedURL, nil
|
||||
}
|
108
internal/reader/urlcleaner/urlcleaner_test.go
Normal file
108
internal/reader/urlcleaner/urlcleaner_test.go
Normal file
|
@ -0,0 +1,108 @@
|
|||
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
package urlcleaner // import "miniflux.app/v2/internal/reader/urlcleaner"
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestRemoveTrackingParams(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
name: "URL with tracking parameters",
|
||||
input: "https://example.com/page?id=123&utm_source=newsletter&utm_medium=email&fbclid=abc123",
|
||||
expected: "https://example.com/page?id=123",
|
||||
},
|
||||
{
|
||||
name: "URL with only tracking parameters",
|
||||
input: "https://example.com/page?utm_source=newsletter&utm_medium=email",
|
||||
expected: "https://example.com/page",
|
||||
},
|
||||
{
|
||||
name: "URL with no tracking parameters",
|
||||
input: "https://example.com/page?id=123&foo=bar",
|
||||
expected: "https://example.com/page?id=123&foo=bar",
|
||||
},
|
||||
{
|
||||
name: "URL with no parameters",
|
||||
input: "https://example.com/page",
|
||||
expected: "https://example.com/page",
|
||||
},
|
||||
{
|
||||
name: "URL with mixed case tracking parameters",
|
||||
input: "https://example.com/page?UTM_SOURCE=newsletter&utm_MEDIUM=email",
|
||||
expected: "https://example.com/page",
|
||||
},
|
||||
{
|
||||
name: "URL with tracking parameters and fragments",
|
||||
input: "https://example.com/page?id=123&utm_source=newsletter#section1",
|
||||
expected: "https://example.com/page?id=123#section1",
|
||||
},
|
||||
{
|
||||
name: "URL with only tracking parameters and fragments",
|
||||
input: "https://example.com/page?utm_source=newsletter#section1",
|
||||
expected: "https://example.com/page#section1",
|
||||
},
|
||||
{
|
||||
name: "URL with only one tracking parameter",
|
||||
input: "https://example.com/page?utm_source=newsletter",
|
||||
expected: "https://example.com/page",
|
||||
},
|
||||
{
|
||||
name: "URL with encoded characters",
|
||||
input: "https://example.com/page?name=John%20Doe&utm_source=newsletter",
|
||||
expected: "https://example.com/page?name=John+Doe",
|
||||
},
|
||||
{
|
||||
name: "Invalid URL",
|
||||
input: "https://example|org/",
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "Non-HTTP URL",
|
||||
input: "mailto:user@example.org",
|
||||
expected: "mailto:user@example.org",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result, err := RemoveTrackingParameters(tt.input)
|
||||
if tt.expected == "" {
|
||||
if err == nil {
|
||||
t.Errorf("Expected an error for invalid URL, but got none")
|
||||
}
|
||||
} else {
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error: %v", err)
|
||||
}
|
||||
if !urlsEqual(result, tt.expected) {
|
||||
t.Errorf("removeTrackingParams(%q) = %q, want %q", tt.input, result, tt.expected)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// urlsEqual reports whether url1 and url2 parse successfully and share the
// same scheme, host, path, fragment and decoded query parameters. The order
// in which query parameters appear is irrelevant. If either argument fails
// to parse, the URLs are reported as different.
func urlsEqual(url1, url2 string) bool {
	first, err := url.Parse(url1)
	if err != nil {
		return false
	}

	second, err := url.Parse(url2)
	if err != nil {
		return false
	}

	switch {
	case first.Scheme != second.Scheme,
		first.Host != second.Host,
		first.Path != second.Path,
		first.Fragment != second.Fragment:
		return false
	}

	// url.Values is a map, so DeepEqual compares keys and value lists
	// without regard to their position in the raw query string.
	firstQuery, secondQuery := first.Query(), second.Query()
	return reflect.DeepEqual(firstQuery, secondQuery)
}
|
Loading…
Add table
Add a link
Reference in a new issue