mirror of
https://github.com/miniflux/v2.git
synced 2025-06-27 16:36:00 +00:00
Add rewrite rules for article URL before fetching content
This commit is contained in:
parent
b1f6f2e7ac
commit
2659883ce5
30 changed files with 96 additions and 21 deletions
|
@ -32,8 +32,9 @@ import (
|
|||
)
|
||||
|
||||
var (
|
||||
youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
|
||||
iso8601Regex = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`)
|
||||
youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
|
||||
iso8601Regex = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`)
|
||||
customReplaceRuleRegex = regexp.MustCompile(`rewrite\("(.*)"\|"(.*)"\)`)
|
||||
)
|
||||
|
||||
// ProcessFeedEntries downloads original web page for entries and apply filters.
|
||||
|
@ -47,13 +48,14 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
|
|||
continue
|
||||
}
|
||||
|
||||
url := getUrlFromEntry(feed, entry)
|
||||
entryIsNew := !store.EntryURLExists(feed.ID, entry.URL)
|
||||
if feed.Crawler && entryIsNew {
|
||||
logger.Debug("[Processor] Crawling entry %q from feed %q", entry.URL, feed.FeedURL)
|
||||
logger.Debug("[Processor] Crawling entry %q from feed %q", url, feed.FeedURL)
|
||||
|
||||
startTime := time.Now()
|
||||
content, scraperErr := scraper.Fetch(
|
||||
entry.URL,
|
||||
url,
|
||||
feed.ScraperRules,
|
||||
feed.UserAgent,
|
||||
feed.Cookie,
|
||||
|
@ -77,10 +79,10 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
|
|||
}
|
||||
}
|
||||
|
||||
entry.Content = rewrite.Rewriter(entry.URL, entry.Content, feed.RewriteRules)
|
||||
entry.Content = rewrite.Rewriter(url, entry.Content, feed.RewriteRules)
|
||||
|
||||
// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
|
||||
entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
|
||||
entry.Content = sanitizer.Sanitize(url, entry.Content)
|
||||
|
||||
if entryIsNew {
|
||||
intg, err := store.Integration(feed.UserID)
|
||||
|
@ -127,8 +129,10 @@ func isAllowedEntry(feed *model.Feed, entry *model.Entry) bool {
|
|||
// ProcessEntryWebPage downloads the entry web page and apply rewrite rules.
|
||||
func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry) error {
|
||||
startTime := time.Now()
|
||||
url := getUrlFromEntry(feed, entry)
|
||||
|
||||
content, scraperErr := scraper.Fetch(
|
||||
entry.URL,
|
||||
url,
|
||||
entry.Feed.ScraperRules,
|
||||
entry.Feed.UserAgent,
|
||||
entry.Feed.Cookie,
|
||||
|
@ -148,8 +152,8 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry) error {
|
|||
return scraperErr
|
||||
}
|
||||
|
||||
content = rewrite.Rewriter(entry.URL, content, entry.Feed.RewriteRules)
|
||||
content = sanitizer.Sanitize(entry.URL, content)
|
||||
content = rewrite.Rewriter(url, content, entry.Feed.RewriteRules)
|
||||
content = sanitizer.Sanitize(url, content)
|
||||
|
||||
if content != "" {
|
||||
entry.Content = content
|
||||
|
@ -159,6 +163,22 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func getUrlFromEntry(feed *model.Feed, entry *model.Entry) string {
|
||||
var url = entry.URL
|
||||
if feed.UrlRewriteRules != "" {
|
||||
parts := customReplaceRuleRegex.FindStringSubmatch(feed.UrlRewriteRules)
|
||||
|
||||
if len(parts) >= 3 {
|
||||
re := regexp.MustCompile(parts[1])
|
||||
url = re.ReplaceAllString(entry.URL, parts[2])
|
||||
logger.Debug(`[Processor] Rewriting entry URL %s to %s`, entry.URL, url)
|
||||
} else {
|
||||
logger.Debug("[Processor] Cannot find search and replace terms for replace rule %s", feed.UrlRewriteRules)
|
||||
}
|
||||
}
|
||||
return url
|
||||
}
|
||||
|
||||
func updateEntryReadingTime(store *storage.Storage, feed *model.Feed, entry *model.Entry, entryIsNew bool) {
|
||||
if shouldFetchYouTubeWatchTime(entry) {
|
||||
if entryIsNew {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue