
Add rewrite rules for article URL before fetching content

Commit 2659883ce5 (parent b1f6f2e7ac) by Carsten, 2022-07-12 06:12:26 +02:00, committed by GitHub
30 changed files with 96 additions and 21 deletions


@@ -32,8 +32,9 @@ import (
 )
 
 var (
-	youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
-	iso8601Regex = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`)
+	youtubeRegex           = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
+	iso8601Regex           = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`)
+	customReplaceRuleRegex = regexp.MustCompile(`rewrite\("(.*)"\|"(.*)"\)`)
 )
 
 // ProcessFeedEntries downloads original web page for entries and apply filters.
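The new customReplaceRuleRegex parses a feed's URL rewrite rule, expected in the form rewrite("search"|"replace") where both halves are Go regular expressions: capture group 1 is the search pattern, group 2 the replacement. A purely illustrative rule (not part of the commit) that maps a mobile host back to the canonical one:

	rewrite("//m\.example\.org/"|"//example.org/")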
@@ -47,13 +48,14 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
 			continue
 		}
 
+		url := getUrlFromEntry(feed, entry)
 		entryIsNew := !store.EntryURLExists(feed.ID, entry.URL)
 		if feed.Crawler && entryIsNew {
-			logger.Debug("[Processor] Crawling entry %q from feed %q", entry.URL, feed.FeedURL)
+			logger.Debug("[Processor] Crawling entry %q from feed %q", url, feed.FeedURL)
 
 			startTime := time.Now()
 			content, scraperErr := scraper.Fetch(
-				entry.URL,
+				url,
 				feed.ScraperRules,
 				feed.UserAgent,
 				feed.Cookie,
@@ -77,10 +79,10 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
 			}
 		}
 
-		entry.Content = rewrite.Rewriter(entry.URL, entry.Content, feed.RewriteRules)
+		entry.Content = rewrite.Rewriter(url, entry.Content, feed.RewriteRules)
 
 		// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
-		entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
+		entry.Content = sanitizer.Sanitize(url, entry.Content)
 
 		if entryIsNew {
 			intg, err := store.Integration(feed.UserID)
@@ -127,8 +129,10 @@ func isAllowedEntry(feed *model.Feed, entry *model.Entry) bool {
 // ProcessEntryWebPage downloads the entry web page and apply rewrite rules.
 func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry) error {
 	startTime := time.Now()
+	url := getUrlFromEntry(feed, entry)
+
 	content, scraperErr := scraper.Fetch(
-		entry.URL,
+		url,
 		entry.Feed.ScraperRules,
 		entry.Feed.UserAgent,
 		entry.Feed.Cookie,
@@ -148,8 +152,8 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry) error {
 		return scraperErr
 	}
 
-	content = rewrite.Rewriter(entry.URL, content, entry.Feed.RewriteRules)
-	content = sanitizer.Sanitize(entry.URL, content)
+	content = rewrite.Rewriter(url, content, entry.Feed.RewriteRules)
+	content = sanitizer.Sanitize(url, content)
 
 	if content != "" {
 		entry.Content = content
@@ -159,6 +163,22 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry) error {
 	return nil
 }
 
+func getUrlFromEntry(feed *model.Feed, entry *model.Entry) string {
+	var url = entry.URL
+	if feed.UrlRewriteRules != "" {
+		parts := customReplaceRuleRegex.FindStringSubmatch(feed.UrlRewriteRules)
+
+		if len(parts) >= 3 {
+			re := regexp.MustCompile(parts[1])
+			url = re.ReplaceAllString(entry.URL, parts[2])
+			logger.Debug(`[Processor] Rewriting entry URL %s to %s`, entry.URL, url)
+		} else {
+			logger.Debug("[Processor] Cannot find search and replace terms for replace rule %s", feed.UrlRewriteRules)
+		}
+	}
+	return url
+}
+
 func updateEntryReadingTime(store *storage.Storage, feed *model.Feed, entry *model.Entry, entryIsNew bool) {
 	if shouldFetchYouTubeWatchTime(entry) {
 		if entryIsNew {
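For a sense of what getUrlFromEntry does end to end, here is a minimal standalone sketch of the same extract-and-replace logic; the rule and URL below are hypothetical, and the miniflux model and logger types are omitted:

package main

import (
	"fmt"
	"regexp"
)

// Same pattern the commit introduces: group 1 is the search regex,
// group 2 the replacement.
var customReplaceRuleRegex = regexp.MustCompile(`rewrite\("(.*)"\|"(.*)"\)`)

func main() {
	// Hypothetical feed.UrlRewriteRules value and entry URL.
	rule := `rewrite("//m\.example\.org/"|"//example.org/")`
	entryURL := "https://m.example.org/articles/42"

	parts := customReplaceRuleRegex.FindStringSubmatch(rule)
	if len(parts) >= 3 {
		re := regexp.MustCompile(parts[1])
		// Prints: https://example.org/articles/42
		fmt.Println(re.ReplaceAllString(entryURL, parts[2]))
	}
}

Note that the greedy captures treat the whole UrlRewriteRules string as a single rewrite("…"|"…") clause, so one rule per feed is supported; a malformed rule falls through to the debug log and the original entry URL is kept.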