1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00

refactor(processor): move RewriteEntryURL function to rewrite package

This commit is contained in:
Frédéric Guillot 2025-06-19 13:14:22 -07:00
parent c12476c1a9
commit cb59944d6b
7 changed files with 347 additions and 40 deletions

View file

@ -6,7 +6,6 @@ package processor // import "miniflux.app/v2/internal/reader/processor"
import (
"log/slog"
"net/url"
"regexp"
"slices"
"time"
@ -24,8 +23,6 @@ import (
"miniflux.app/v2/internal/storage"
)
var customReplaceRuleRegex = regexp.MustCompile(`rewrite\("([^"]+)"\|"([^"]+)"\)`)
// ProcessFeedEntries downloads original web page for entries and apply filters.
func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64, forceRefresh bool) {
var filteredEntries model.Entries
@ -60,7 +57,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64,
}
webpageBaseURL := ""
entry.URL = rewriteEntryURL(feed, entry)
entry.URL = rewrite.RewriteEntryURL(feed, entry)
entryIsNew := store.IsNewEntry(feed.ID, entry.Hash)
if feed.Crawler && (entryIsNew || forceRefresh) {
slog.Debug("Scraping entry",
@ -143,7 +140,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64,
// ProcessEntryWebPage downloads the entry web page and apply rewrite rules.
func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User) error {
startTime := time.Now()
entry.URL = rewriteEntryURL(feed, entry)
entry.URL = rewrite.RewriteEntryURL(feed, entry)
requestBuilder := fetcher.NewRequestBuilder()
requestBuilder.WithUserAgent(feed.UserAgent, config.Opts.HTTPClientUserAgent())
@ -187,41 +184,6 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
return nil
}
func rewriteEntryURL(feed *model.Feed, entry *model.Entry) string {
var rewrittenURL = entry.URL
if feed.UrlRewriteRules != "" {
parts := customReplaceRuleRegex.FindStringSubmatch(feed.UrlRewriteRules)
if len(parts) >= 3 {
re, err := regexp.Compile(parts[1])
if err != nil {
slog.Error("Failed on regexp compilation",
slog.String("url_rewrite_rules", feed.UrlRewriteRules),
slog.Any("error", err),
)
return rewrittenURL
}
rewrittenURL = re.ReplaceAllString(entry.URL, parts[2])
slog.Debug("Rewriting entry URL",
slog.String("original_entry_url", entry.URL),
slog.String("rewritten_entry_url", rewrittenURL),
slog.Int64("feed_id", feed.ID),
slog.String("feed_url", feed.FeedURL),
)
} else {
slog.Debug("Cannot find search and replace terms for replace rule",
slog.String("original_entry_url", entry.URL),
slog.String("rewritten_entry_url", rewrittenURL),
slog.Int64("feed_id", feed.ID),
slog.String("feed_url", feed.FeedURL),
slog.String("url_rewrite_rules", feed.UrlRewriteRules),
)
}
}
return rewrittenURL
}
func isRecentEntry(entry *model.Entry) bool {
if config.Opts.FilterEntryMaxAgeDays() == 0 || entry.Date.After(time.Now().AddDate(0, 0, -config.Opts.FilterEntryMaxAgeDays())) {
return true

View file

@ -0,0 +1,48 @@
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package rewrite // import "miniflux.app/v2/internal/reader/rewrite"
import (
"log/slog"
"regexp"
"miniflux.app/v2/internal/model"
)
var customReplaceRuleRegex = regexp.MustCompile(`rewrite\("([^"]+)"\|"([^"]+)"\)`)
func RewriteEntryURL(feed *model.Feed, entry *model.Entry) string {
var rewrittenURL = entry.URL
if feed.UrlRewriteRules != "" {
parts := customReplaceRuleRegex.FindStringSubmatch(feed.UrlRewriteRules)
if len(parts) >= 3 {
re, err := regexp.Compile(parts[1])
if err != nil {
slog.Error("Failed on regexp compilation",
slog.String("url_rewrite_rules", feed.UrlRewriteRules),
slog.Any("error", err),
)
return rewrittenURL
}
rewrittenURL = re.ReplaceAllString(entry.URL, parts[2])
slog.Debug("Rewriting entry URL",
slog.String("original_entry_url", entry.URL),
slog.String("rewritten_entry_url", rewrittenURL),
slog.Int64("feed_id", feed.ID),
slog.String("feed_url", feed.FeedURL),
)
} else {
slog.Debug("Cannot find search and replace terms for replace rule",
slog.String("original_entry_url", entry.URL),
slog.String("rewritten_entry_url", rewrittenURL),
slog.Int64("feed_id", feed.ID),
slog.String("feed_url", feed.FeedURL),
slog.String("url_rewrite_rules", feed.UrlRewriteRules),
)
}
}
return rewrittenURL
}

View file

@ -0,0 +1,297 @@
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package rewrite // import "miniflux.app/v2/internal/reader/rewrite"
import (
"testing"
"miniflux.app/v2/internal/model"
)
func TestRewriteEntryURL(t *testing.T) {
scenarios := []struct {
name string
feed *model.Feed
entry *model.Entry
expectedURL string
description string
}{
{
name: "NoRewriteRules",
feed: &model.Feed{
ID: 1,
FeedURL: "https://example.com/feed.xml",
UrlRewriteRules: "",
},
entry: &model.Entry{
URL: "https://example.com/article/123",
},
expectedURL: "https://example.com/article/123",
description: "Should return original URL when no rewrite rules are specified",
},
{
name: "EmptyRewriteRules",
feed: &model.Feed{
ID: 1,
FeedURL: "https://example.com/feed.xml",
UrlRewriteRules: " ",
},
entry: &model.Entry{
URL: "https://example.com/article/123",
},
expectedURL: "https://example.com/article/123",
description: "Should return original URL when rewrite rules are empty/whitespace",
},
{
name: "ValidRewriteRule",
feed: &model.Feed{
ID: 1,
FeedURL: "https://example.com/feed.xml",
UrlRewriteRules: `rewrite("^https://example.com/article/(.+)"|"https://example.com/full-article/$1")`,
},
entry: &model.Entry{
URL: "https://example.com/article/123",
},
expectedURL: "https://example.com/full-article/123",
description: "Should rewrite URL according to the regex pattern",
},
{
name: "ComplexRegexRewrite",
feed: &model.Feed{
ID: 1,
FeedURL: "https://news.ycombinator.com/rss",
UrlRewriteRules: `rewrite("^https://news\.ycombinator\.com/item\?id=(.+)"|"https://hn.algolia.com/api/v1/items/$1")`,
},
entry: &model.Entry{
URL: "https://news.ycombinator.com/item?id=12345",
},
expectedURL: "https://hn.algolia.com/api/v1/items/12345",
description: "Should handle complex regex patterns with escaped characters",
},
{
name: "NoMatchingPattern",
feed: &model.Feed{
ID: 1,
FeedURL: "https://example.com/feed.xml",
UrlRewriteRules: `rewrite("^https://different.com/(.+)"|"https://rewritten.com/$1")`,
},
entry: &model.Entry{
URL: "https://example.com/article/123",
},
expectedURL: "https://example.com/article/123",
description: "Should return original URL when regex pattern doesn't match",
},
{
name: "InvalidRegexPattern",
feed: &model.Feed{
ID: 1,
FeedURL: "https://example.com/feed.xml",
UrlRewriteRules: `rewrite("^https://example.com/[invalid"|"https://rewritten.com/$1")`,
},
entry: &model.Entry{
URL: "https://example.com/article/123",
},
expectedURL: "https://example.com/article/123",
description: "Should return original URL when regex pattern is invalid",
},
{
name: "MalformedRewriteRule",
feed: &model.Feed{
ID: 1,
FeedURL: "https://example.com/feed.xml",
UrlRewriteRules: `rewrite("invalid format")`,
},
entry: &model.Entry{
URL: "https://example.com/article/123",
},
expectedURL: "https://example.com/article/123",
description: "Should return original URL when rewrite rule format is malformed",
},
{
name: "MultipleGroups",
feed: &model.Feed{
ID: 1,
FeedURL: "https://example.com/feed.xml",
UrlRewriteRules: `rewrite("^https://example.com/([^/]+)/article/(.+)"|"https://example.com/full/$1/story/$2")`,
},
entry: &model.Entry{
URL: "https://example.com/tech/article/ai-news",
},
expectedURL: "https://example.com/full/tech/story/ai-news",
description: "Should handle multiple capture groups in regex",
},
{
name: "URLWithSpecialCharacters",
feed: &model.Feed{
ID: 1,
FeedURL: "https://example.com/feed.xml",
UrlRewriteRules: `rewrite("^https://example.com/(.+)"|"https://proxy.example.com/$1")`,
},
entry: &model.Entry{
URL: "https://example.com/article/test?param=value&other=123#section",
},
expectedURL: "https://proxy.example.com/article/test?param=value&other=123#section",
description: "Should handle URLs with query parameters and fragments",
},
{
name: "ReplaceWithStaticURL",
feed: &model.Feed{
ID: 1,
FeedURL: "https://example.com/feed.xml",
UrlRewriteRules: `rewrite("^https://example.com/(.+)"|"https://static.example.com/reader")`,
},
entry: &model.Entry{
URL: "https://example.com/article/123",
},
expectedURL: "https://static.example.com/reader",
description: "Should replace with static URL when no capture groups are used in replacement",
},
{
name: "EmptyReplacementString",
feed: &model.Feed{
ID: 1,
FeedURL: "https://example.com/feed.xml",
UrlRewriteRules: `rewrite("^https://example.com/(.+)"|"x")`,
},
entry: &model.Entry{
URL: "https://example.com/article/123",
},
expectedURL: "x",
description: "Should replace with specified string",
},
{
name: "EmptyReplacementNotSupported",
feed: &model.Feed{
ID: 1,
FeedURL: "https://example.com/feed.xml",
UrlRewriteRules: `rewrite("^https://example.com/(.+)"|"")`,
},
entry: &model.Entry{
URL: "https://example.com/article/123",
},
expectedURL: "https://example.com/article/123",
description: "Should return original URL when replacement is empty string (not supported by regex pattern)",
},
{
name: "InvalidRewriteRuleFormat",
feed: &model.Feed{
ID: 1,
FeedURL: "https://example.com/feed.xml",
UrlRewriteRules: `not-a-rewrite-rule`,
},
entry: &model.Entry{
URL: "https://example.com/article/123",
},
expectedURL: "https://example.com/article/123",
description: "Should return original URL when rewrite rule doesn't match expected format",
},
}
for _, scenario := range scenarios {
t.Run(scenario.name, func(t *testing.T) {
result := RewriteEntryURL(scenario.feed, scenario.entry)
if result != scenario.expectedURL {
t.Errorf("Expected URL %q, got %q. Description: %s", scenario.expectedURL, result, scenario.description)
}
})
}
}
func TestRewriteEntryURLWithNilValues(t *testing.T) {
t.Run("NilFeed", func(t *testing.T) {
entry := &model.Entry{URL: "https://example.com/article/123"}
// This should panic or handle gracefully - let's see what happens
defer func() {
if r := recover(); r == nil {
t.Error("Expected panic when feed is nil, but function completed normally")
}
}()
RewriteEntryURL(nil, entry)
})
t.Run("NilEntry", func(t *testing.T) {
feed := &model.Feed{
ID: 1,
FeedURL: "https://example.com/feed.xml",
UrlRewriteRules: `rewrite("^https://example.com/(.+)"|"https://rewritten.com/$1")`,
}
// This should panic or handle gracefully - let's see what happens
defer func() {
if r := recover(); r == nil {
t.Error("Expected panic when entry is nil, but function completed normally")
}
}()
RewriteEntryURL(feed, nil)
})
}
func TestCustomReplaceRuleRegex(t *testing.T) {
scenarios := []struct {
name string
input string
expected []string
matches bool
}{
{
name: "ValidRule",
input: `rewrite("^https://example.com/(.+)"|"https://rewritten.com/$1")`,
expected: []string{`rewrite("^https://example.com/(.+)"|"https://rewritten.com/$1")`, `^https://example.com/(.+)`, `https://rewritten.com/$1`},
matches: true,
},
{
name: "ValidRuleWithEscapedCharacters",
input: `rewrite("^https://news\\.ycombinator\\.com/item\\?id=(.+)"|"https://hn.algolia.com/api/v1/items/$1")`,
expected: []string{`rewrite("^https://news\\.ycombinator\\.com/item\\?id=(.+)"|"https://hn.algolia.com/api/v1/items/$1")`, `^https://news\\.ycombinator\\.com/item\\?id=(.+)`, `https://hn.algolia.com/api/v1/items/$1`},
matches: true,
},
{
name: "InvalidFormat",
input: `rewrite("invalid")`,
expected: nil,
matches: false,
},
{
name: "EmptyString",
input: ``,
expected: nil,
matches: false,
},
{
name: "RandomText",
input: `some random text`,
expected: nil,
matches: false,
},
}
for _, scenario := range scenarios {
t.Run(scenario.name, func(t *testing.T) {
parts := customReplaceRuleRegex.FindStringSubmatch(scenario.input)
if scenario.matches {
if len(parts) < 3 {
t.Errorf("Expected regex to match and return at least 3 parts, got %d parts: %v", len(parts), parts)
return
}
// Check the full match and captured groups
if parts[0] != scenario.expected[0] {
t.Errorf("Expected full match %q, got %q", scenario.expected[0], parts[0])
}
if parts[1] != scenario.expected[1] {
t.Errorf("Expected first capture group %q, got %q", scenario.expected[1], parts[1])
}
if parts[2] != scenario.expected[2] {
t.Errorf("Expected second capture group %q, got %q", scenario.expected[2], parts[2])
}
} else if len(parts) >= 3 {
t.Errorf("Expected regex not to match, but got %d parts: %v", len(parts), parts)
}
})
}
}