mirror of https://github.com/miniflux/v2.git (synced 2025-08-06 17:41:00 +00:00)

Add the possibility to enable crawler for feeds

commit ef097f02fe (parent 33445e5b68)
22 changed files with 77 additions and 25 deletions
@@ -33,7 +33,7 @@ type Handler struct {
 }
 
 // CreateFeed fetch, parse and store a new feed.
-func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed, error) {
+func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool) (*model.Feed, error) {
 	defer helper.ExecutionTime(time.Now(), fmt.Sprintf("[Handler:CreateFeed] feedUrl=%s", url))
 
 	if !h.store.CategoryExists(userID, categoryID) {
@@ -65,6 +65,7 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed,
 	}
 
 	feedProcessor := processor.NewFeedProcessor(subscription)
+	feedProcessor.WithCrawler(crawler)
 	feedProcessor.Process()
 
 	subscription.Category = &model.Category{ID: categoryID}
@@ -72,6 +73,7 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed,
 	subscription.LastModifiedHeader = response.LastModified
 	subscription.FeedURL = response.EffectiveURL
 	subscription.UserID = userID
+	subscription.Crawler = crawler
 
 	err = h.store.CreateFeed(subscription)
 	if err != nil {
@@ -143,6 +145,7 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error {
 	feedProcessor := processor.NewFeedProcessor(subscription)
 	feedProcessor.WithScraperRules(originalFeed.ScraperRules)
 	feedProcessor.WithRewriteRules(originalFeed.RewriteRules)
+	feedProcessor.WithCrawler(originalFeed.Crawler)
 	feedProcessor.Process()
 
 	originalFeed.EtagHeader = response.ETag
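
The crawler flag enters through CreateFeed, is persisted on the subscription (subscription.Crawler = crawler), and is read back by RefreshFeed via originalFeed.Crawler, so refreshing needs no extra parameter. Below is a call-shape sketch of the updated signature; the reader/feed import path and an already-constructed Handler are assumptions, since neither the file paths nor the UI/API plumbing appear in the hunks above.

// Sketch only: the reader/feed import path and a Handler wired up elsewhere
// are assumed; the callers that pass the new flag are among the changed
// files not shown in this excerpt.
package example

import (
	"log"

	"github.com/miniflux/miniflux2/reader/feed"
)

func createFeedWithCrawler(h *feed.Handler, userID, categoryID int64, feedURL string) {
	// The fourth argument is new in this commit: enable the crawler for this feed.
	subscription, err := h.CreateFeed(userID, categoryID, feedURL, true)
	if err != nil {
		log.Println("[CreateFeed]", err)
		return
	}
	log.Printf("subscribed to %s with crawler enabled", subscription.FeedURL)
}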
@@ -5,9 +5,12 @@
 package processor
 
 import (
+	"log"
+
 	"github.com/miniflux/miniflux2/model"
 	"github.com/miniflux/miniflux2/reader/rewrite"
 	"github.com/miniflux/miniflux2/reader/sanitizer"
+	"github.com/miniflux/miniflux2/reader/scraper"
 )
 
 // FeedProcessor handles the processing of feed contents.
@@ -15,6 +18,12 @@ type FeedProcessor struct {
 	feed         *model.Feed
 	scraperRules string
 	rewriteRules string
+	crawler      bool
 }
 
+// WithCrawler enables the crawler.
+func (f *FeedProcessor) WithCrawler(value bool) {
+	f.crawler = value
+}
+
 // WithScraperRules adds scraper rules to the processing.
@@ -30,6 +39,15 @@ func (f *FeedProcessor) WithRewriteRules(rules string) {
 // Process applies rewrite and scraper rules.
 func (f *FeedProcessor) Process() {
 	for _, entry := range f.feed.Entries {
+		if f.crawler {
+			content, err := scraper.Fetch(entry.URL, f.scraperRules)
+			if err != nil {
+				log.Println("[FeedProcessor]", err)
+			} else {
+				entry.Content = content
+			}
+		}
+
 		entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
 		entry.Content = rewrite.Rewriter(entry.URL, entry.Content, f.rewriteRules)
 	}
@@ -37,5 +55,5 @@ func (f *FeedProcessor) Process() {
 
 // NewFeedProcessor returns a new FeedProcessor.
 func NewFeedProcessor(feed *model.Feed) *FeedProcessor {
-	return &FeedProcessor{feed: feed}
+	return &FeedProcessor{feed: feed, crawler: false}
 }
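
The new branch in Process is the core of the feature: with the crawler enabled, the entry's feed-provided content is replaced by the scraped page, a fetch error only logs and keeps the original content, and in every case the result still passes through sanitization and rewriting. Here is a minimal, self-contained sketch of that control flow, using stub fetch/sanitize/rewrite functions in place of the real miniflux packages:

package main

import (
	"errors"
	"fmt"
)

// Illustrative stand-ins for the entry and processor types; not the real miniflux model.
type entry struct {
	URL     string
	Content string
}

type feedProcessor struct {
	entries []*entry
	crawler bool
}

func (f *feedProcessor) withCrawler(value bool) { f.crawler = value }

// Stubs standing in for scraper.Fetch, sanitizer.Sanitize and rewrite.Rewriter.
func fetchFullContent(url string) (string, error) {
	if url == "" {
		return "", errors.New("empty URL")
	}
	return "<article>full page for " + url + "</article>", nil
}

func sanitize(url, content string) string       { return content } // no-op stub
func rewriteContent(url, content string) string { return content } // no-op stub

func (f *feedProcessor) process() {
	for _, e := range f.entries {
		if f.crawler {
			content, err := fetchFullContent(e.URL)
			if err != nil {
				fmt.Println("[FeedProcessor]", err) // keep the feed's own content on error
			} else {
				e.Content = content
			}
		}
		// Scraped or not, the content is always sanitized and rewritten afterwards.
		e.Content = sanitize(e.URL, e.Content)
		e.Content = rewriteContent(e.URL, e.Content)
	}
}

func main() {
	p := &feedProcessor{entries: []*entry{{URL: "https://example.org/post", Content: "summary only"}}}
	p.withCrawler(true) // mirrors feedProcessor.WithCrawler(crawler) in the handler
	p.process()
	fmt.Println(p.entries[0].Content)
}

Running the sketch replaces the short feed summary with the stub "full page" content before the (no-op) sanitize and rewrite steps, which is exactly the behavior the real Process gains here.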
@@ -13,7 +13,6 @@ import (
 	"github.com/PuerkitoBio/goquery"
 	"github.com/miniflux/miniflux2/http"
 	"github.com/miniflux/miniflux2/reader/readability"
-	"github.com/miniflux/miniflux2/reader/sanitizer"
 	"github.com/miniflux/miniflux2/url"
 )
 
@@ -34,11 +33,11 @@ func Fetch(websiteURL, rules string) (string, error) {
 		return "", err
 	}
 
-	var content string
 	if rules == "" {
 		rules = getPredefinedScraperRules(websiteURL)
 	}
 
+	var content string
 	if rules != "" {
 		log.Printf(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
 		content, err = scrapContent(page, rules)
@@ -51,7 +50,7 @@ func Fetch(websiteURL, rules string) (string, error) {
 		return "", err
 	}
 
-	return sanitizer.Sanitize(websiteURL, content), nil
+	return content, nil
 }
 
 func scrapContent(page io.Reader, rules string) (string, error) {
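
Fetch stops sanitizing its result because FeedProcessor.Process now runs sanitizer.Sanitize on whatever content it ends up with, scraped or not; keeping the call inside Fetch would sanitize the same content twice. A hypothetical direct caller of scraper.Fetch would therefore sanitize the output itself, roughly as sketched below (this snippet is not part of the commit; the package paths and signatures are taken from the hunks above):

package main

import (
	"fmt"
	"log"

	"github.com/miniflux/miniflux2/reader/sanitizer"
	"github.com/miniflux/miniflux2/reader/scraper"
)

func main() {
	websiteURL := "https://example.org/article"

	// Empty rules: Fetch falls back to the predefined per-site rules
	// (getPredefinedScraperRules) seen in the diff above.
	content, err := scraper.Fetch(websiteURL, "")
	if err != nil {
		log.Fatal(err)
	}

	// As of this change, Fetch returns unsanitized HTML; sanitize before use.
	fmt.Println(sanitizer.Sanitize(websiteURL, content))
}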