From cb59944d6b80724cf2411bf92d0e807d8a5f3b86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Thu, 19 Jun 2025 13:14:22 -0700 Subject: [PATCH] refactor(processor): move `RewriteEntryURL` function to `rewrite` package --- internal/reader/processor/processor.go | 42 +-- .../{rewriter.go => content_rewrite.go} | 0 ...ctions.go => content_rewrite_functions.go} | 0 .../{rules.go => content_rewrite_rules.go} | 0 ...writer_test.go => content_rewrite_test.go} | 0 internal/reader/rewrite/url_rewrite.go | 48 +++ internal/reader/rewrite/url_rewrite_test.go | 297 ++++++++++++++++++ 7 files changed, 347 insertions(+), 40 deletions(-) rename internal/reader/rewrite/{rewriter.go => content_rewrite.go} (100%) rename internal/reader/rewrite/{rewrite_functions.go => content_rewrite_functions.go} (100%) rename internal/reader/rewrite/{rules.go => content_rewrite_rules.go} (100%) rename internal/reader/rewrite/{rewriter_test.go => content_rewrite_test.go} (100%) create mode 100644 internal/reader/rewrite/url_rewrite.go create mode 100644 internal/reader/rewrite/url_rewrite_test.go diff --git a/internal/reader/processor/processor.go b/internal/reader/processor/processor.go index eddfeecb..7155cf44 100644 --- a/internal/reader/processor/processor.go +++ b/internal/reader/processor/processor.go @@ -6,7 +6,6 @@ package processor // import "miniflux.app/v2/internal/reader/processor" import ( "log/slog" "net/url" - "regexp" "slices" "time" @@ -24,8 +23,6 @@ import ( "miniflux.app/v2/internal/storage" ) -var customReplaceRuleRegex = regexp.MustCompile(`rewrite\("([^"]+)"\|"([^"]+)"\)`) - // ProcessFeedEntries downloads original web page for entries and apply filters. func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64, forceRefresh bool) { var filteredEntries model.Entries @@ -60,7 +57,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64, } webpageBaseURL := "" - entry.URL = rewriteEntryURL(feed, entry) + entry.URL = rewrite.RewriteEntryURL(feed, entry) entryIsNew := store.IsNewEntry(feed.ID, entry.Hash) if feed.Crawler && (entryIsNew || forceRefresh) { slog.Debug("Scraping entry", @@ -143,7 +140,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64, // ProcessEntryWebPage downloads the entry web page and apply rewrite rules. func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User) error { startTime := time.Now() - entry.URL = rewriteEntryURL(feed, entry) + entry.URL = rewrite.RewriteEntryURL(feed, entry) requestBuilder := fetcher.NewRequestBuilder() requestBuilder.WithUserAgent(feed.UserAgent, config.Opts.HTTPClientUserAgent()) @@ -187,41 +184,6 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User) return nil } -func rewriteEntryURL(feed *model.Feed, entry *model.Entry) string { - var rewrittenURL = entry.URL - if feed.UrlRewriteRules != "" { - parts := customReplaceRuleRegex.FindStringSubmatch(feed.UrlRewriteRules) - - if len(parts) >= 3 { - re, err := regexp.Compile(parts[1]) - if err != nil { - slog.Error("Failed on regexp compilation", - slog.String("url_rewrite_rules", feed.UrlRewriteRules), - slog.Any("error", err), - ) - return rewrittenURL - } - rewrittenURL = re.ReplaceAllString(entry.URL, parts[2]) - slog.Debug("Rewriting entry URL", - slog.String("original_entry_url", entry.URL), - slog.String("rewritten_entry_url", rewrittenURL), - slog.Int64("feed_id", feed.ID), - slog.String("feed_url", feed.FeedURL), - ) - } else { - slog.Debug("Cannot find search and replace terms for replace rule", - slog.String("original_entry_url", entry.URL), - slog.String("rewritten_entry_url", rewrittenURL), - slog.Int64("feed_id", feed.ID), - slog.String("feed_url", feed.FeedURL), - slog.String("url_rewrite_rules", feed.UrlRewriteRules), - ) - } - } - - return rewrittenURL -} - func isRecentEntry(entry *model.Entry) bool { if config.Opts.FilterEntryMaxAgeDays() == 0 || entry.Date.After(time.Now().AddDate(0, 0, -config.Opts.FilterEntryMaxAgeDays())) { return true diff --git a/internal/reader/rewrite/rewriter.go b/internal/reader/rewrite/content_rewrite.go similarity index 100% rename from internal/reader/rewrite/rewriter.go rename to internal/reader/rewrite/content_rewrite.go diff --git a/internal/reader/rewrite/rewrite_functions.go b/internal/reader/rewrite/content_rewrite_functions.go similarity index 100% rename from internal/reader/rewrite/rewrite_functions.go rename to internal/reader/rewrite/content_rewrite_functions.go diff --git a/internal/reader/rewrite/rules.go b/internal/reader/rewrite/content_rewrite_rules.go similarity index 100% rename from internal/reader/rewrite/rules.go rename to internal/reader/rewrite/content_rewrite_rules.go diff --git a/internal/reader/rewrite/rewriter_test.go b/internal/reader/rewrite/content_rewrite_test.go similarity index 100% rename from internal/reader/rewrite/rewriter_test.go rename to internal/reader/rewrite/content_rewrite_test.go diff --git a/internal/reader/rewrite/url_rewrite.go b/internal/reader/rewrite/url_rewrite.go new file mode 100644 index 00000000..05c01eb2 --- /dev/null +++ b/internal/reader/rewrite/url_rewrite.go @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package rewrite // import "miniflux.app/v2/internal/reader/rewrite" + +import ( + "log/slog" + "regexp" + + "miniflux.app/v2/internal/model" +) + +var customReplaceRuleRegex = regexp.MustCompile(`rewrite\("([^"]+)"\|"([^"]+)"\)`) + +func RewriteEntryURL(feed *model.Feed, entry *model.Entry) string { + var rewrittenURL = entry.URL + if feed.UrlRewriteRules != "" { + parts := customReplaceRuleRegex.FindStringSubmatch(feed.UrlRewriteRules) + + if len(parts) >= 3 { + re, err := regexp.Compile(parts[1]) + if err != nil { + slog.Error("Failed on regexp compilation", + slog.String("url_rewrite_rules", feed.UrlRewriteRules), + slog.Any("error", err), + ) + return rewrittenURL + } + rewrittenURL = re.ReplaceAllString(entry.URL, parts[2]) + slog.Debug("Rewriting entry URL", + slog.String("original_entry_url", entry.URL), + slog.String("rewritten_entry_url", rewrittenURL), + slog.Int64("feed_id", feed.ID), + slog.String("feed_url", feed.FeedURL), + ) + } else { + slog.Debug("Cannot find search and replace terms for replace rule", + slog.String("original_entry_url", entry.URL), + slog.String("rewritten_entry_url", rewrittenURL), + slog.Int64("feed_id", feed.ID), + slog.String("feed_url", feed.FeedURL), + slog.String("url_rewrite_rules", feed.UrlRewriteRules), + ) + } + } + + return rewrittenURL +} diff --git a/internal/reader/rewrite/url_rewrite_test.go b/internal/reader/rewrite/url_rewrite_test.go new file mode 100644 index 00000000..f975078d --- /dev/null +++ b/internal/reader/rewrite/url_rewrite_test.go @@ -0,0 +1,297 @@ +// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package rewrite // import "miniflux.app/v2/internal/reader/rewrite" + +import ( + "testing" + + "miniflux.app/v2/internal/model" +) + +func TestRewriteEntryURL(t *testing.T) { + scenarios := []struct { + name string + feed *model.Feed + entry *model.Entry + expectedURL string + description string + }{ + { + name: "NoRewriteRules", + feed: &model.Feed{ + ID: 1, + FeedURL: "https://example.com/feed.xml", + UrlRewriteRules: "", + }, + entry: &model.Entry{ + URL: "https://example.com/article/123", + }, + expectedURL: "https://example.com/article/123", + description: "Should return original URL when no rewrite rules are specified", + }, + { + name: "EmptyRewriteRules", + feed: &model.Feed{ + ID: 1, + FeedURL: "https://example.com/feed.xml", + UrlRewriteRules: " ", + }, + entry: &model.Entry{ + URL: "https://example.com/article/123", + }, + expectedURL: "https://example.com/article/123", + description: "Should return original URL when rewrite rules are empty/whitespace", + }, + { + name: "ValidRewriteRule", + feed: &model.Feed{ + ID: 1, + FeedURL: "https://example.com/feed.xml", + UrlRewriteRules: `rewrite("^https://example.com/article/(.+)"|"https://example.com/full-article/$1")`, + }, + entry: &model.Entry{ + URL: "https://example.com/article/123", + }, + expectedURL: "https://example.com/full-article/123", + description: "Should rewrite URL according to the regex pattern", + }, + { + name: "ComplexRegexRewrite", + feed: &model.Feed{ + ID: 1, + FeedURL: "https://news.ycombinator.com/rss", + UrlRewriteRules: `rewrite("^https://news\.ycombinator\.com/item\?id=(.+)"|"https://hn.algolia.com/api/v1/items/$1")`, + }, + entry: &model.Entry{ + URL: "https://news.ycombinator.com/item?id=12345", + }, + expectedURL: "https://hn.algolia.com/api/v1/items/12345", + description: "Should handle complex regex patterns with escaped characters", + }, + { + name: "NoMatchingPattern", + feed: &model.Feed{ + ID: 1, + FeedURL: "https://example.com/feed.xml", + UrlRewriteRules: `rewrite("^https://different.com/(.+)"|"https://rewritten.com/$1")`, + }, + entry: &model.Entry{ + URL: "https://example.com/article/123", + }, + expectedURL: "https://example.com/article/123", + description: "Should return original URL when regex pattern doesn't match", + }, + { + name: "InvalidRegexPattern", + feed: &model.Feed{ + ID: 1, + FeedURL: "https://example.com/feed.xml", + UrlRewriteRules: `rewrite("^https://example.com/[invalid"|"https://rewritten.com/$1")`, + }, + entry: &model.Entry{ + URL: "https://example.com/article/123", + }, + expectedURL: "https://example.com/article/123", + description: "Should return original URL when regex pattern is invalid", + }, + { + name: "MalformedRewriteRule", + feed: &model.Feed{ + ID: 1, + FeedURL: "https://example.com/feed.xml", + UrlRewriteRules: `rewrite("invalid format")`, + }, + entry: &model.Entry{ + URL: "https://example.com/article/123", + }, + expectedURL: "https://example.com/article/123", + description: "Should return original URL when rewrite rule format is malformed", + }, + { + name: "MultipleGroups", + feed: &model.Feed{ + ID: 1, + FeedURL: "https://example.com/feed.xml", + UrlRewriteRules: `rewrite("^https://example.com/([^/]+)/article/(.+)"|"https://example.com/full/$1/story/$2")`, + }, + entry: &model.Entry{ + URL: "https://example.com/tech/article/ai-news", + }, + expectedURL: "https://example.com/full/tech/story/ai-news", + description: "Should handle multiple capture groups in regex", + }, + { + name: "URLWithSpecialCharacters", + feed: &model.Feed{ + ID: 1, + FeedURL: "https://example.com/feed.xml", + UrlRewriteRules: `rewrite("^https://example.com/(.+)"|"https://proxy.example.com/$1")`, + }, + entry: &model.Entry{ + URL: "https://example.com/article/test?param=value&other=123#section", + }, + expectedURL: "https://proxy.example.com/article/test?param=value&other=123#section", + description: "Should handle URLs with query parameters and fragments", + }, + { + name: "ReplaceWithStaticURL", + feed: &model.Feed{ + ID: 1, + FeedURL: "https://example.com/feed.xml", + UrlRewriteRules: `rewrite("^https://example.com/(.+)"|"https://static.example.com/reader")`, + }, + entry: &model.Entry{ + URL: "https://example.com/article/123", + }, + expectedURL: "https://static.example.com/reader", + description: "Should replace with static URL when no capture groups are used in replacement", + }, + { + name: "EmptyReplacementString", + feed: &model.Feed{ + ID: 1, + FeedURL: "https://example.com/feed.xml", + UrlRewriteRules: `rewrite("^https://example.com/(.+)"|"x")`, + }, + entry: &model.Entry{ + URL: "https://example.com/article/123", + }, + expectedURL: "x", + description: "Should replace with specified string", + }, + { + name: "EmptyReplacementNotSupported", + feed: &model.Feed{ + ID: 1, + FeedURL: "https://example.com/feed.xml", + UrlRewriteRules: `rewrite("^https://example.com/(.+)"|"")`, + }, + entry: &model.Entry{ + URL: "https://example.com/article/123", + }, + expectedURL: "https://example.com/article/123", + description: "Should return original URL when replacement is empty string (not supported by regex pattern)", + }, + { + name: "InvalidRewriteRuleFormat", + feed: &model.Feed{ + ID: 1, + FeedURL: "https://example.com/feed.xml", + UrlRewriteRules: `not-a-rewrite-rule`, + }, + entry: &model.Entry{ + URL: "https://example.com/article/123", + }, + expectedURL: "https://example.com/article/123", + description: "Should return original URL when rewrite rule doesn't match expected format", + }, + } + + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + result := RewriteEntryURL(scenario.feed, scenario.entry) + if result != scenario.expectedURL { + t.Errorf("Expected URL %q, got %q. Description: %s", scenario.expectedURL, result, scenario.description) + } + }) + } +} + +func TestRewriteEntryURLWithNilValues(t *testing.T) { + t.Run("NilFeed", func(t *testing.T) { + entry := &model.Entry{URL: "https://example.com/article/123"} + + // This should panic or handle gracefully - let's see what happens + defer func() { + if r := recover(); r == nil { + t.Error("Expected panic when feed is nil, but function completed normally") + } + }() + + RewriteEntryURL(nil, entry) + }) + + t.Run("NilEntry", func(t *testing.T) { + feed := &model.Feed{ + ID: 1, + FeedURL: "https://example.com/feed.xml", + UrlRewriteRules: `rewrite("^https://example.com/(.+)"|"https://rewritten.com/$1")`, + } + + // This should panic or handle gracefully - let's see what happens + defer func() { + if r := recover(); r == nil { + t.Error("Expected panic when entry is nil, but function completed normally") + } + }() + + RewriteEntryURL(feed, nil) + }) +} + +func TestCustomReplaceRuleRegex(t *testing.T) { + scenarios := []struct { + name string + input string + expected []string + matches bool + }{ + { + name: "ValidRule", + input: `rewrite("^https://example.com/(.+)"|"https://rewritten.com/$1")`, + expected: []string{`rewrite("^https://example.com/(.+)"|"https://rewritten.com/$1")`, `^https://example.com/(.+)`, `https://rewritten.com/$1`}, + matches: true, + }, + { + name: "ValidRuleWithEscapedCharacters", + input: `rewrite("^https://news\\.ycombinator\\.com/item\\?id=(.+)"|"https://hn.algolia.com/api/v1/items/$1")`, + expected: []string{`rewrite("^https://news\\.ycombinator\\.com/item\\?id=(.+)"|"https://hn.algolia.com/api/v1/items/$1")`, `^https://news\\.ycombinator\\.com/item\\?id=(.+)`, `https://hn.algolia.com/api/v1/items/$1`}, + matches: true, + }, + { + name: "InvalidFormat", + input: `rewrite("invalid")`, + expected: nil, + matches: false, + }, + { + name: "EmptyString", + input: ``, + expected: nil, + matches: false, + }, + { + name: "RandomText", + input: `some random text`, + expected: nil, + matches: false, + }, + } + + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + parts := customReplaceRuleRegex.FindStringSubmatch(scenario.input) + + if scenario.matches { + if len(parts) < 3 { + t.Errorf("Expected regex to match and return at least 3 parts, got %d parts: %v", len(parts), parts) + return + } + + // Check the full match and captured groups + if parts[0] != scenario.expected[0] { + t.Errorf("Expected full match %q, got %q", scenario.expected[0], parts[0]) + } + if parts[1] != scenario.expected[1] { + t.Errorf("Expected first capture group %q, got %q", scenario.expected[1], parts[1]) + } + if parts[2] != scenario.expected[2] { + t.Errorf("Expected second capture group %q, got %q", scenario.expected[2], parts[2]) + } + } else if len(parts) >= 3 { + t.Errorf("Expected regex not to match, but got %d parts: %v", len(parts), parts) + } + }) + } +}