Add the possibility to define rewrite rules for each feed

2025-09-30 19:22:11 +00:00 · 2017-12-11 22:16:32 -08:00 · 2017-12-11 22:16:32 -08:00 · 33445e5b68
commit 33445e5b68
parent 87ccad5c7f
29 changed files with 214 additions and 72 deletions
--- a/reader/atom/atom.go
+++ b/reader/atom/atom.go
@ -14,7 +14,6 @@ import (
 	"github.com/miniflux/miniflux2/helper"
 	"github.com/miniflux/miniflux2/model"
 	"github.com/miniflux/miniflux2/reader/date"
-	"github.com/miniflux/miniflux2/reader/processor"
 )

 type atomFeed struct {
@ -87,7 +86,7 @@ func (a *atomEntry) Transform() *model.Entry {
 	entry.Date = getDate(a)
 	entry.Author = getAuthor(a.Author)
 	entry.Hash = getHash(a)
-	entry.Content = processor.ItemContentProcessor(entry.URL, getContent(a))
+	entry.Content = getContent(a)
 	entry.Title = strings.TrimSpace(a.Title)
 	entry.Enclosures = getEnclosures(a)

--- a/reader/feed/handler.go
+++ b/reader/feed/handler.go
@ -14,6 +14,7 @@ import (
 	"github.com/miniflux/miniflux2/http"
 	"github.com/miniflux/miniflux2/model"
 	"github.com/miniflux/miniflux2/reader/icon"
+	"github.com/miniflux/miniflux2/reader/processor"
 	"github.com/miniflux/miniflux2/storage"
 )

@ -63,6 +64,9 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed,
 		return nil, err
 	}

+	feedProcessor := processor.NewFeedProcessor(subscription)
+	feedProcessor.Process()
+
 	subscription.Category = &model.Category{ID: categoryID}
 	subscription.EtagHeader = response.ETag
 	subscription.LastModifiedHeader = response.LastModified
@ -136,6 +140,11 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error {
 			return err
 		}

+		feedProcessor := processor.NewFeedProcessor(subscription)
+		feedProcessor.WithScraperRules(originalFeed.ScraperRules)
+		feedProcessor.WithRewriteRules(originalFeed.RewriteRules)
+		feedProcessor.Process()
+
 		originalFeed.EtagHeader = response.ETag
 		originalFeed.LastModifiedHeader = response.LastModified

--- a/reader/json/json.go
+++ b/reader/json/json.go
@ -9,12 +9,10 @@ import (
 	"strings"
 	"time"

-	"github.com/miniflux/miniflux2/reader/sanitizer"
-
 	"github.com/miniflux/miniflux2/helper"
 	"github.com/miniflux/miniflux2/model"
 	"github.com/miniflux/miniflux2/reader/date"
-	"github.com/miniflux/miniflux2/reader/processor"
+	"github.com/miniflux/miniflux2/reader/sanitizer"
 )

 type jsonFeed struct {
@ -148,7 +146,7 @@ func (j *jsonItem) Transform() *model.Entry {
 	entry.Date = j.GetDate()
 	entry.Author = j.GetAuthor()
 	entry.Hash = j.GetHash()
-	entry.Content = processor.ItemContentProcessor(entry.URL, j.GetContent())
+	entry.Content = j.GetContent()
 	entry.Title = strings.TrimSpace(j.GetTitle())
 	entry.Enclosures = j.GetEnclosures()
 	return entry
--- a/reader/json/parser_test.go
+++ b/reader/json/parser_test.go
@ -148,7 +148,7 @@ func TestParsePodcast(t *testing.T) {
 		t.Errorf(`Incorrect entry title, got: "%s"`, feed.Entries[0].Title)
 	}

-	if feed.Entries[0].Content != `Chris has worked at <a href="http://adobe.com/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Adobe</a> and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged &amp; Distilled with Guy English — which shipped <a href="http://aged-and-distilled.com/napkin/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Napkin</a>, a Mac app for visual collaboration. Chris is also the co-host of The Record. He lives on <a href="http://www.ci.bainbridge-isl.wa.us/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Bainbridge Island</a>, a quick ferry ride from Seattle.` {
+	if feed.Entries[0].Content != `Chris has worked at <a href="http://adobe.com/">Adobe</a> and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged & Distilled with Guy English — which shipped <a href="http://aged-and-distilled.com/napkin/">Napkin</a>, a Mac app for visual collaboration. Chris is also the co-host of The Record. He lives on <a href="http://www.ci.bainbridge-isl.wa.us/">Bainbridge Island</a>, a quick ferry ride from Seattle.` {
 		t.Errorf(`Incorrect entry content, got: "%s"`, feed.Entries[0].Content)
 	}

--- a/reader/processor/processor.go
+++ b/reader/processor/processor.go
@ -5,12 +5,37 @@
 package processor

 import (
+	"github.com/miniflux/miniflux2/model"
 	"github.com/miniflux/miniflux2/reader/rewrite"
 	"github.com/miniflux/miniflux2/reader/sanitizer"
 )

-// ItemContentProcessor executes a set of functions to sanitize and alter item contents.
-func ItemContentProcessor(url, content string) string {
-	content = sanitizer.Sanitize(url, content)
-	return rewrite.Rewriter(url, content)
+// FeedProcessor handles the processing of feed contents.
+type FeedProcessor struct {
+	feed         *model.Feed // feed whose entries Process iterates over
+	scraperRules string      // set by WithScraperRules; not read by Process in this revision
+	rewriteRules string      // set by WithRewriteRules; passed to rewrite.Rewriter for every entry
+}
+
+// WithScraperRules adds scraper rules to the processing.
+// NOTE(review): Process, as written in this change, never reads scraperRules,
+// so the stored value has no effect yet.
+func (f *FeedProcessor) WithScraperRules(rules string) {
+	f.scraperRules = rules
+}
+
+// WithRewriteRules adds rewrite rules to the processing.
+// The rules string is handed to rewrite.Rewriter as its custom rules
+// argument for every entry when Process runs.
+func (f *FeedProcessor) WithRewriteRules(rules string) {
+	f.rewriteRules = rules
+}
+
+// Process sanitizes the content of every feed entry and then applies the
+// rewrite rules to it. Scraper rules are not applied here.
+func (f *FeedProcessor) Process() {
+	for _, entry := range f.feed.Entries {
+		entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
+		entry.Content = rewrite.Rewriter(entry.URL, entry.Content, f.rewriteRules)
+	}
+}
+
+// NewFeedProcessor returns a new FeedProcessor for the given feed, with no
+// scraper or rewrite rules set.
+func NewFeedProcessor(feed *model.Feed) *FeedProcessor {
+	return &FeedProcessor{feed: feed}
 }
--- a/reader/rdf/rdf.go
+++ b/reader/rdf/rdf.go
@ -10,10 +10,8 @@ import (
 	"time"

 	"github.com/miniflux/miniflux2/helper"
-	"github.com/miniflux/miniflux2/reader/processor"
-	"github.com/miniflux/miniflux2/reader/sanitizer"
-
 	"github.com/miniflux/miniflux2/model"
+	"github.com/miniflux/miniflux2/reader/sanitizer"
 )

 type rdfFeed struct {
@ -58,7 +56,7 @@ func (r *rdfItem) Transform() *model.Entry {
 	entry.Title = strings.TrimSpace(r.Title)
 	entry.Author = strings.TrimSpace(r.Creator)
 	entry.URL = r.Link
-	entry.Content = processor.ItemContentProcessor(entry.URL, r.Description)
+	entry.Content = r.Description
 	entry.Hash = getHash(r)
 	entry.Date = time.Now()
 	return entry
--- a/reader/rewrite/rewrite_functions.go
+++ b/reader/rewrite/rewrite_functions.go
@ -0,0 +1,40 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package rewrite
+
+import (
+	"html"
+	"regexp"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+var (
+	// youtubeRegex captures everything following "youtube.com/watch?v=" in a URL.
+	// NOTE(review): the greedy (.*) also captures any trailing query parameters
+	// (e.g. "&t=10s"), which addYoutubeVideo then places in the embed URL;
+	// confirm this is intended.
+	youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
+)
+
+// addImageTitle appends the title attribute of the first <img> element found
+// in entryContent as a <blockquote> citing entryURL. The content is returned
+// unchanged when it cannot be parsed or when the first image has no title
+// attribute.
+func addImageTitle(entryURL, entryContent string) string {
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
+	if err != nil {
+		return entryContent
+	}
+
+	imgTag := doc.Find("img").First()
+	if titleAttr, found := imgTag.Attr("title"); found {
+		// The HTML parser hands back the attribute value with entities decoded,
+		// and this rewrite runs after sanitization, so both the title and the
+		// URL must be escaped before being written back into markup.
+		return entryContent + `<blockquote cite="` + html.EscapeString(entryURL) + `">` + html.EscapeString(titleAttr) + "</blockquote>"
+	}
+
+	return entryContent
+}
+
+// addYoutubeVideo prepends a youtube-nocookie.com <iframe> player to the
+// entry and wraps the original content in a <p> when entryURL matches
+// youtubeRegex; otherwise the content is returned unchanged.
+func addYoutubeVideo(entryURL, entryContent string) string {
+	matches := youtubeRegex.FindStringSubmatch(entryURL)
+
+	if len(matches) == 2 {
+		video := `<iframe width="650" height="350" frameborder="0" src="https://www.youtube-nocookie.com/embed/` + matches[1] + `" allowfullscreen></iframe>`
+		return video + "<p>" + entryContent + "</p>"
+	}
+	return entryContent
+}
--- a/reader/rewrite/rewriter.go
+++ b/reader/rewrite/rewriter.go
@ -5,44 +5,39 @@
 package rewrite

 import (
-	"regexp"
 	"strings"

-	"github.com/PuerkitoBio/goquery"
+	"github.com/miniflux/miniflux2/url"
 )

-var rewriteRules = []func(string, string) string{
-	func(url, content string) string {
-		re := regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
-		matches := re.FindStringSubmatch(url)
-
-		if len(matches) == 2 {
-			video := `<iframe width="650" height="350" frameborder="0" src="https://www.youtube-nocookie.com/embed/` + matches[1] + `" allowfullscreen></iframe>`
-			return video + "<p>" + content + "</p>"
-		}
-		return content
-	},
-	func(url, content string) string {
-		if strings.HasPrefix(url, "https://xkcd.com") {
-			doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
-			if err != nil {
-				return content
-			}
-
-			imgTag := doc.Find("img").First()
-			if titleAttr, found := imgTag.Attr("title"); found {
-				return content + `<blockquote cite="` + url + `">` + titleAttr + "</blockquote>"
-			}
-		}
-		return content
-	},
-}
-
 // Rewriter modifies item contents with a set of rewriting rules.
-func Rewriter(url, content string) string {
-	for _, rewriteRule := range rewriteRules {
-		content = rewriteRule(url, content)
+func Rewriter(entryURL, entryContent, customRewriteRules string) string {
+	rulesList := getPredefinedRewriteRules(entryURL)
+	if customRewriteRules != "" {
+		rulesList = customRewriteRules
 	}

-	return content
+	rules := strings.Split(rulesList, ",")
+	for _, rule := range rules {
+		switch strings.TrimSpace(rule) {
+		case "add_image_title":
+			entryContent = addImageTitle(entryURL, entryContent)
+		case "add_youtube_video":
+			entryContent = addYoutubeVideo(entryURL, entryContent)
+		}
+	}
+
+	return entryContent
+}
+
+// getPredefinedRewriteRules returns the rules of the first predefinedRules
+// key found as a substring of the value url.Domain yields for entryURL, or an
+// empty string when no key matches.
+// NOTE(review): matching is by substring, so a host such as "notxkcd.com"
+// would also match the "xkcd.com" key; and since Go map iteration order is
+// unspecified, the result is not deterministic if several keys match.
+func getPredefinedRewriteRules(entryURL string) string {
+	urlDomain := url.Domain(entryURL)
+
+	for domain, rules := range predefinedRules {
+		if strings.Contains(urlDomain, domain) {
+			return rules
+		}
+	}
+
+	return ""
 }
--- a/reader/rewrite/rewriter_test.go
+++ b/reader/rewrite/rewriter_test.go
@ -7,7 +7,7 @@ package rewrite
 import "testing"

 func TestRewriteWithNoMatchingRule(t *testing.T) {
-	output := Rewriter("https://example.org/article", `Some text.`)
+	output := Rewriter("https://example.org/article", `Some text.`, ``)
 	expected := `Some text.`

 	if expected != output {
@ -16,7 +16,7 @@ func TestRewriteWithNoMatchingRule(t *testing.T) {
 }

 func TestRewriteWithYoutubeLink(t *testing.T) {
-	output := Rewriter("https://www.youtube.com/watch?v=1234", `Video Description`)
+	output := Rewriter("https://www.youtube.com/watch?v=1234", `Video Description`, ``)
 	expected := `<iframe width="650" height="350" frameborder="0" src="https://www.youtube-nocookie.com/embed/1234" allowfullscreen></iframe><p>Video Description</p>`

 	if expected != output {
@ -24,11 +24,37 @@ func TestRewriteWithYoutubeLink(t *testing.T) {
 	}
 }

+// TestRewriteWithInexistingCustomRule checks that an unknown custom rule
+// takes the place of the predefined rules and leaves the content untouched.
+func TestRewriteWithInexistingCustomRule(t *testing.T) {
+	output := Rewriter("https://www.youtube.com/watch?v=1234", `Video Description`, `some rule`)
+	expected := `Video Description`
+	if expected != output {
+		t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
+	}
+}
+
 func TestRewriteWithXkcdLink(t *testing.T) {
 	description := `<img src="https://imgs.xkcd.com/comics/thermostat.png" title="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." alt="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." />`
-	output := Rewriter("https://xkcd.com/1912/", description)
+	output := Rewriter("https://xkcd.com/1912/", description, ``)
 	expected := description + `<blockquote cite="https://xkcd.com/1912/">Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you.</blockquote>`
 	if expected != output {
 		t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
 	}
 }
+// TestRewriteWithXkcdLinkAndNoImage checks that content without an <img>
+// element is returned unchanged for an xkcd.com entry URL.
+func TestRewriteWithXkcdLinkAndNoImage(t *testing.T) {
+	description := "test"
+	output := Rewriter("https://xkcd.com/1912/", description, ``)
+	expected := description
+	if expected != output {
+		t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
+	}
+}
+
+// TestRewriteWithXkcdAndNoImage checks that content without an <img> element
+// is returned unchanged for an xkcd.com entry URL.
+// NOTE(review): this uses the same inputs and assertion as
+// TestRewriteWithXkcdLinkAndNoImage; one of the two is likely redundant.
+func TestRewriteWithXkcdAndNoImage(t *testing.T) {
+	description := "test"
+	output := Rewriter("https://xkcd.com/1912/", description, ``)
+	expected := description
+
+	if expected != output {
+		t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
+	}
+}
--- a/reader/rewrite/rules.go
+++ b/reader/rewrite/rules.go
@ -0,0 +1,30 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package rewrite
+
+// predefinedRules lists the rewrite rules applied to an entry when its feed
+// has no custom rewrite rules (keep the list alphabetically sorted).
+// NOTE(review): "youtube.com" is currently listed before "xkcd.com".
+// Available rules: "add_image_title", "add_youtube_video"
+// domain => comma-separated rule names
+var predefinedRules = map[string]string{
+	"abstrusegoose.com":      "add_image_title",
+	"amazingsuperpowers.com": "add_image_title",
+	"cowbirdsinlove.com":     "add_image_title",
+	"drawingboardcomic.com":  "add_image_title",
+	"exocomics.com":          "add_image_title",
+	"happletea.com":          "add_image_title",
+	"imogenquest.net":        "add_image_title",
+	"lukesurl.com":           "add_image_title",
+	"mercworks.net":          "add_image_title",
+	"mrlovenstein.com":       "add_image_title",
+	"nedroid.com":            "add_image_title",
+	"oglaf.com":              "add_image_title",
+	"optipess.com":           "add_image_title",
+	"peebleslab.com":         "add_image_title",
+	"sentfromthemoon.com":    "add_image_title",
+	"thedoghousediaries.com": "add_image_title",
+	"treelobsters.com":       "add_image_title",
+	"youtube.com":            "add_youtube_video",
+	"xkcd.com":               "add_image_title",
+}
--- a/reader/rss/parser_test.go
+++ b/reader/rss/parser_test.go
@ -94,7 +94,7 @@ func TestParseRss2Sample(t *testing.T) {
 		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
 	}

-	if feed.Entries[0].Content != `How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia&#39;s <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Star City</a>.` {
+	if feed.Entries[0].Content != `How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.` {
 		t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content)
 	}
 }
@ -383,7 +383,7 @@ func TestParseEntryWithContentEncoded(t *testing.T) {
 		t.Error(err)
 	}

-	if feed.Entries[0].Content != `<p><a href="http://www.example.org/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Example</a>.</p>` {
+	if feed.Entries[0].Content != `<p><a href="http://www.example.org/">Example</a>.</p>` {
 		t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content)
 	}
 }
--- a/reader/rss/rss.go
+++ b/reader/rss/rss.go
@ -15,7 +15,6 @@ import (
 	"github.com/miniflux/miniflux2/helper"
 	"github.com/miniflux/miniflux2/model"
 	"github.com/miniflux/miniflux2/reader/date"
-	"github.com/miniflux/miniflux2/reader/processor"
 )

 type rssFeed struct {
@ -211,7 +210,7 @@ func (r *rssItem) Transform() *model.Entry {
 	entry.Date = r.GetDate()
 	entry.Author = r.GetAuthor()
 	entry.Hash = r.GetHash()
-	entry.Content = processor.ItemContentProcessor(entry.URL, r.GetContent())
+	entry.Content = r.GetContent()
 	entry.Title = strings.TrimSpace(r.Title)
 	entry.Enclosures = r.GetEnclosures()