mirror of
https://github.com/miniflux/v2.git
synced 2025-06-27 16:36:00 +00:00
Add the possibility to define rewrite rules for each feed
This commit is contained in:
parent
87ccad5c7f
commit
33445e5b68
29 changed files with 214 additions and 72 deletions
|
@ -14,7 +14,6 @@ import (
|
|||
"github.com/miniflux/miniflux2/helper"
|
||||
"github.com/miniflux/miniflux2/model"
|
||||
"github.com/miniflux/miniflux2/reader/date"
|
||||
"github.com/miniflux/miniflux2/reader/processor"
|
||||
)
|
||||
|
||||
type atomFeed struct {
|
||||
|
@ -87,7 +86,7 @@ func (a *atomEntry) Transform() *model.Entry {
|
|||
entry.Date = getDate(a)
|
||||
entry.Author = getAuthor(a.Author)
|
||||
entry.Hash = getHash(a)
|
||||
entry.Content = processor.ItemContentProcessor(entry.URL, getContent(a))
|
||||
entry.Content = getContent(a)
|
||||
entry.Title = strings.TrimSpace(a.Title)
|
||||
entry.Enclosures = getEnclosures(a)
|
||||
|
||||
|
|
|
@ -14,6 +14,7 @@ import (
|
|||
"github.com/miniflux/miniflux2/http"
|
||||
"github.com/miniflux/miniflux2/model"
|
||||
"github.com/miniflux/miniflux2/reader/icon"
|
||||
"github.com/miniflux/miniflux2/reader/processor"
|
||||
"github.com/miniflux/miniflux2/storage"
|
||||
)
|
||||
|
||||
|
@ -63,6 +64,9 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed,
|
|||
return nil, err
|
||||
}
|
||||
|
||||
feedProcessor := processor.NewFeedProcessor(subscription)
|
||||
feedProcessor.Process()
|
||||
|
||||
subscription.Category = &model.Category{ID: categoryID}
|
||||
subscription.EtagHeader = response.ETag
|
||||
subscription.LastModifiedHeader = response.LastModified
|
||||
|
@ -136,6 +140,11 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error {
|
|||
return err
|
||||
}
|
||||
|
||||
feedProcessor := processor.NewFeedProcessor(subscription)
|
||||
feedProcessor.WithScraperRules(originalFeed.ScraperRules)
|
||||
feedProcessor.WithRewriteRules(originalFeed.RewriteRules)
|
||||
feedProcessor.Process()
|
||||
|
||||
originalFeed.EtagHeader = response.ETag
|
||||
originalFeed.LastModifiedHeader = response.LastModified
|
||||
|
||||
|
|
|
@ -9,12 +9,10 @@ import (
|
|||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/miniflux/miniflux2/reader/sanitizer"
|
||||
|
||||
"github.com/miniflux/miniflux2/helper"
|
||||
"github.com/miniflux/miniflux2/model"
|
||||
"github.com/miniflux/miniflux2/reader/date"
|
||||
"github.com/miniflux/miniflux2/reader/processor"
|
||||
"github.com/miniflux/miniflux2/reader/sanitizer"
|
||||
)
|
||||
|
||||
type jsonFeed struct {
|
||||
|
@ -148,7 +146,7 @@ func (j *jsonItem) Transform() *model.Entry {
|
|||
entry.Date = j.GetDate()
|
||||
entry.Author = j.GetAuthor()
|
||||
entry.Hash = j.GetHash()
|
||||
entry.Content = processor.ItemContentProcessor(entry.URL, j.GetContent())
|
||||
entry.Content = j.GetContent()
|
||||
entry.Title = strings.TrimSpace(j.GetTitle())
|
||||
entry.Enclosures = j.GetEnclosures()
|
||||
return entry
|
||||
|
|
|
@ -148,7 +148,7 @@ func TestParsePodcast(t *testing.T) {
|
|||
t.Errorf(`Incorrect entry title, got: "%s"`, feed.Entries[0].Title)
|
||||
}
|
||||
|
||||
if feed.Entries[0].Content != `Chris has worked at <a href="http://adobe.com/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Adobe</a> and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged & Distilled with Guy English — which shipped <a href="http://aged-and-distilled.com/napkin/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Napkin</a>, a Mac app for visual collaboration. Chris is also the co-host of The Record. He lives on <a href="http://www.ci.bainbridge-isl.wa.us/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Bainbridge Island</a>, a quick ferry ride from Seattle.` {
|
||||
if feed.Entries[0].Content != `Chris has worked at <a href="http://adobe.com/">Adobe</a> and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged & Distilled with Guy English — which shipped <a href="http://aged-and-distilled.com/napkin/">Napkin</a>, a Mac app for visual collaboration. Chris is also the co-host of The Record. He lives on <a href="http://www.ci.bainbridge-isl.wa.us/">Bainbridge Island</a>, a quick ferry ride from Seattle.` {
|
||||
t.Errorf(`Incorrect entry content, got: "%s"`, feed.Entries[0].Content)
|
||||
}
|
||||
|
||||
|
|
|
@ -5,12 +5,37 @@
|
|||
package processor
|
||||
|
||||
import (
|
||||
"github.com/miniflux/miniflux2/model"
|
||||
"github.com/miniflux/miniflux2/reader/rewrite"
|
||||
"github.com/miniflux/miniflux2/reader/sanitizer"
|
||||
)
|
||||
|
||||
// ItemContentProcessor executes a set of functions to sanitize and alter item contents.
|
||||
func ItemContentProcessor(url, content string) string {
|
||||
content = sanitizer.Sanitize(url, content)
|
||||
return rewrite.Rewriter(url, content)
|
||||
// FeedProcessor handles the processing of feed contents.
|
||||
type FeedProcessor struct {
|
||||
feed *model.Feed
|
||||
scraperRules string
|
||||
rewriteRules string
|
||||
}
|
||||
|
||||
// WithScraperRules adds scraper rules to the processing.
|
||||
func (f *FeedProcessor) WithScraperRules(rules string) {
|
||||
f.scraperRules = rules
|
||||
}
|
||||
|
||||
// WithRewriteRules adds rewrite rules to the processing.
|
||||
func (f *FeedProcessor) WithRewriteRules(rules string) {
|
||||
f.rewriteRules = rules
|
||||
}
|
||||
|
||||
// Process applies rewrite and scraper rules.
|
||||
func (f *FeedProcessor) Process() {
|
||||
for _, entry := range f.feed.Entries {
|
||||
entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
|
||||
entry.Content = rewrite.Rewriter(entry.URL, entry.Content, f.rewriteRules)
|
||||
}
|
||||
}
|
||||
|
||||
// NewFeedProcessor returns a new FeedProcessor.
|
||||
func NewFeedProcessor(feed *model.Feed) *FeedProcessor {
|
||||
return &FeedProcessor{feed: feed}
|
||||
}
|
||||
|
|
|
@ -10,10 +10,8 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/miniflux/miniflux2/helper"
|
||||
"github.com/miniflux/miniflux2/reader/processor"
|
||||
"github.com/miniflux/miniflux2/reader/sanitizer"
|
||||
|
||||
"github.com/miniflux/miniflux2/model"
|
||||
"github.com/miniflux/miniflux2/reader/sanitizer"
|
||||
)
|
||||
|
||||
type rdfFeed struct {
|
||||
|
@ -58,7 +56,7 @@ func (r *rdfItem) Transform() *model.Entry {
|
|||
entry.Title = strings.TrimSpace(r.Title)
|
||||
entry.Author = strings.TrimSpace(r.Creator)
|
||||
entry.URL = r.Link
|
||||
entry.Content = processor.ItemContentProcessor(entry.URL, r.Description)
|
||||
entry.Content = r.Description
|
||||
entry.Hash = getHash(r)
|
||||
entry.Date = time.Now()
|
||||
return entry
|
||||
|
|
40
reader/rewrite/rewrite_functions.go
Normal file
40
reader/rewrite/rewrite_functions.go
Normal file
|
@ -0,0 +1,40 @@
|
|||
// Copyright 2017 Frédéric Guillot. All rights reserved.
|
||||
// Use of this source code is governed by the Apache 2.0
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package rewrite
|
||||
|
||||
import (
	"html"
	"regexp"
	"strings"

	"github.com/PuerkitoBio/goquery"
)
|
||||
|
||||
var (
	// youtubeRegex extracts the video ID from a YouTube watch URL.
	//
	// Only ID characters (letters, digits, "_" and "-") are captured, so
	// trailing query parameters such as "&t=42s" are left out of the match,
	// and a URL with an empty "v=" value does not match at all.
	youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=([\w-]+)`)
)
|
||||
|
||||
func addImageTitle(entryURL, entryContent string) string {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
||||
if err != nil {
|
||||
return entryContent
|
||||
}
|
||||
|
||||
imgTag := doc.Find("img").First()
|
||||
if titleAttr, found := imgTag.Attr("title"); found {
|
||||
return entryContent + `<blockquote cite="` + entryURL + `">` + titleAttr + "</blockquote>"
|
||||
}
|
||||
|
||||
return entryContent
|
||||
}
|
||||
|
||||
func addYoutubeVideo(entryURL, entryContent string) string {
|
||||
matches := youtubeRegex.FindStringSubmatch(entryURL)
|
||||
|
||||
if len(matches) == 2 {
|
||||
video := `<iframe width="650" height="350" frameborder="0" src="https://www.youtube-nocookie.com/embed/` + matches[1] + `" allowfullscreen></iframe>`
|
||||
return video + "<p>" + entryContent + "</p>"
|
||||
}
|
||||
return entryContent
|
||||
}
|
|
@ -5,44 +5,39 @@
|
|||
package rewrite
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/miniflux/miniflux2/url"
|
||||
)
|
||||
|
||||
var rewriteRules = []func(string, string) string{
|
||||
func(url, content string) string {
|
||||
re := regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
|
||||
matches := re.FindStringSubmatch(url)
|
||||
|
||||
if len(matches) == 2 {
|
||||
video := `<iframe width="650" height="350" frameborder="0" src="https://www.youtube-nocookie.com/embed/` + matches[1] + `" allowfullscreen></iframe>`
|
||||
return video + "<p>" + content + "</p>"
|
||||
}
|
||||
return content
|
||||
},
|
||||
func(url, content string) string {
|
||||
if strings.HasPrefix(url, "https://xkcd.com") {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
|
||||
if err != nil {
|
||||
return content
|
||||
}
|
||||
|
||||
imgTag := doc.Find("img").First()
|
||||
if titleAttr, found := imgTag.Attr("title"); found {
|
||||
return content + `<blockquote cite="` + url + `">` + titleAttr + "</blockquote>"
|
||||
}
|
||||
}
|
||||
return content
|
||||
},
|
||||
}
|
||||
|
||||
// Rewriter modify item contents with a set of rewriting rules.
|
||||
func Rewriter(url, content string) string {
|
||||
for _, rewriteRule := range rewriteRules {
|
||||
content = rewriteRule(url, content)
|
||||
func Rewriter(entryURL, entryContent, customRewriteRules string) string {
|
||||
rulesList := getPredefinedRewriteRules(entryURL)
|
||||
if customRewriteRules != "" {
|
||||
rulesList = customRewriteRules
|
||||
}
|
||||
|
||||
return content
|
||||
rules := strings.Split(rulesList, ",")
|
||||
for _, rule := range rules {
|
||||
switch strings.TrimSpace(rule) {
|
||||
case "add_image_title":
|
||||
entryContent = addImageTitle(entryURL, entryContent)
|
||||
case "add_youtube_video":
|
||||
entryContent = addYoutubeVideo(entryURL, entryContent)
|
||||
}
|
||||
}
|
||||
|
||||
return entryContent
|
||||
}
|
||||
|
||||
func getPredefinedRewriteRules(entryURL string) string {
|
||||
urlDomain := url.Domain(entryURL)
|
||||
|
||||
for domain, rules := range predefinedRules {
|
||||
if strings.Contains(urlDomain, domain) {
|
||||
return rules
|
||||
}
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
|
|
@ -7,7 +7,7 @@ package rewrite
|
|||
import "testing"
|
||||
|
||||
func TestRewriteWithNoMatchingRule(t *testing.T) {
|
||||
output := Rewriter("https://example.org/article", `Some text.`)
|
||||
output := Rewriter("https://example.org/article", `Some text.`, ``)
|
||||
expected := `Some text.`
|
||||
|
||||
if expected != output {
|
||||
|
@ -16,7 +16,7 @@ func TestRewriteWithNoMatchingRule(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestRewriteWithYoutubeLink(t *testing.T) {
|
||||
output := Rewriter("https://www.youtube.com/watch?v=1234", `Video Description`)
|
||||
output := Rewriter("https://www.youtube.com/watch?v=1234", `Video Description`, ``)
|
||||
expected := `<iframe width="650" height="350" frameborder="0" src="https://www.youtube-nocookie.com/embed/1234" allowfullscreen></iframe><p>Video Description</p>`
|
||||
|
||||
if expected != output {
|
||||
|
@ -24,11 +24,37 @@ func TestRewriteWithYoutubeLink(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestRewriteWithInexistingCustomRule(t *testing.T) {
|
||||
output := Rewriter("https://www.youtube.com/watch?v=1234", `Video Description`, `some rule`)
|
||||
expected := `Video Description`
|
||||
if expected != output {
|
||||
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRewriteWithXkcdLink(t *testing.T) {
|
||||
description := `<img src="https://imgs.xkcd.com/comics/thermostat.png" title="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." alt="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." />`
|
||||
output := Rewriter("https://xkcd.com/1912/", description)
|
||||
output := Rewriter("https://xkcd.com/1912/", description, ``)
|
||||
expected := description + `<blockquote cite="https://xkcd.com/1912/">Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you.</blockquote>`
|
||||
if expected != output {
|
||||
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
|
||||
}
|
||||
}
|
||||
func TestRewriteWithXkcdLinkAndNoImage(t *testing.T) {
|
||||
description := "test"
|
||||
output := Rewriter("https://xkcd.com/1912/", description, ``)
|
||||
expected := description
|
||||
if expected != output {
|
||||
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRewriteWithXkcdAndNoImage(t *testing.T) {
|
||||
description := "test"
|
||||
output := Rewriter("https://xkcd.com/1912/", description, ``)
|
||||
expected := description
|
||||
|
||||
if expected != output {
|
||||
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
|
||||
}
|
||||
}
|
||||
|
|
30
reader/rewrite/rules.go
Normal file
30
reader/rewrite/rules.go
Normal file
|
@ -0,0 +1,30 @@
|
|||
// Copyright 2017 Frédéric Guillot. All rights reserved.
|
||||
// Use of this source code is governed by the Apache 2.0
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package rewrite
|
||||
|
||||
// predefinedRules lists the rewrite rules applied by default, keyed by domain.
// Keep the entries sorted alphabetically by domain.
//
// Available rules: "add_image_title", "add_youtube_video"
// domain => rule name
var predefinedRules = map[string]string{
	"abstrusegoose.com":      "add_image_title",
	"amazingsuperpowers.com": "add_image_title",
	"cowbirdsinlove.com":     "add_image_title",
	"drawingboardcomic.com":  "add_image_title",
	"exocomics.com":          "add_image_title",
	"happletea.com":          "add_image_title",
	"imogenquest.net":        "add_image_title",
	"lukesurl.com":           "add_image_title",
	"mercworks.net":          "add_image_title",
	"mrlovenstein.com":       "add_image_title",
	"nedroid.com":            "add_image_title",
	"oglaf.com":              "add_image_title",
	"optipess.com":           "add_image_title",
	"peebleslab.com":         "add_image_title",
	"sentfromthemoon.com":    "add_image_title",
	"thedoghousediaries.com": "add_image_title",
	"treelobsters.com":       "add_image_title",
	"xkcd.com":               "add_image_title",
	"youtube.com":            "add_youtube_video",
}
|
|
@ -94,7 +94,7 @@ func TestParseRss2Sample(t *testing.T) {
|
|||
t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
|
||||
}
|
||||
|
||||
if feed.Entries[0].Content != `How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Star City</a>.` {
|
||||
if feed.Entries[0].Content != `How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.` {
|
||||
t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content)
|
||||
}
|
||||
}
|
||||
|
@ -383,7 +383,7 @@ func TestParseEntryWithContentEncoded(t *testing.T) {
|
|||
t.Error(err)
|
||||
}
|
||||
|
||||
if feed.Entries[0].Content != `<p><a href="http://www.example.org/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Example</a>.</p>` {
|
||||
if feed.Entries[0].Content != `<p><a href="http://www.example.org/">Example</a>.</p>` {
|
||||
t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -15,7 +15,6 @@ import (
|
|||
"github.com/miniflux/miniflux2/helper"
|
||||
"github.com/miniflux/miniflux2/model"
|
||||
"github.com/miniflux/miniflux2/reader/date"
|
||||
"github.com/miniflux/miniflux2/reader/processor"
|
||||
)
|
||||
|
||||
type rssFeed struct {
|
||||
|
@ -211,7 +210,7 @@ func (r *rssItem) Transform() *model.Entry {
|
|||
entry.Date = r.GetDate()
|
||||
entry.Author = r.GetAuthor()
|
||||
entry.Hash = r.GetHash()
|
||||
entry.Content = processor.ItemContentProcessor(entry.URL, r.GetContent())
|
||||
entry.Content = r.GetContent()
|
||||
entry.Title = strings.TrimSpace(r.Title)
|
||||
entry.Enclosures = r.GetEnclosures()
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue