
Add the possibility to enable crawler for feeds

Frédéric Guillot 2017-12-12 19:19:36 -08:00
parent 33445e5b68
commit ef097f02fe
22 changed files with 77 additions and 25 deletions


@@ -33,7 +33,7 @@ type Handler struct {
 }
 
 // CreateFeed fetch, parse and store a new feed.
-func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed, error) {
+func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool) (*model.Feed, error) {
     defer helper.ExecutionTime(time.Now(), fmt.Sprintf("[Handler:CreateFeed] feedUrl=%s", url))
 
     if !h.store.CategoryExists(userID, categoryID) {
@@ -65,6 +65,7 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed,
     }
 
     feedProcessor := processor.NewFeedProcessor(subscription)
+    feedProcessor.WithCrawler(crawler)
     feedProcessor.Process()
 
     subscription.Category = &model.Category{ID: categoryID}
@@ -72,6 +73,7 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed,
     subscription.LastModifiedHeader = response.LastModified
     subscription.FeedURL = response.EffectiveURL
     subscription.UserID = userID
+    subscription.Crawler = crawler
 
     err = h.store.CreateFeed(subscription)
     if err != nil {
@@ -143,6 +145,7 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error {
     feedProcessor := processor.NewFeedProcessor(subscription)
     feedProcessor.WithScraperRules(originalFeed.ScraperRules)
     feedProcessor.WithRewriteRules(originalFeed.RewriteRules)
+    feedProcessor.WithCrawler(originalFeed.Crawler)
     feedProcessor.Process()
 
     originalFeed.EtagHeader = response.ETag
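
Callers of CreateFeed now have to supply the crawler flag explicitly. The actual call sites (UI and API handlers) are among the other files changed by this commit and are not shown here; the snippet below is only an illustrative sketch, assuming a handler value of the type defined above, an enableCrawler boolean taken from the subscription form, and the standard log package:

    // Hypothetical call site: forward the user's crawler choice to the feed handler.
    newFeed, err := handler.CreateFeed(userID, categoryID, "https://example.org/feed.xml", enableCrawler)
    if err != nil {
        log.Println("unable to create feed:", err)
        return
    }
    log.Println("created feed", newFeed.ID)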


@@ -5,9 +5,12 @@
 package processor
 
 import (
+    "log"
+
     "github.com/miniflux/miniflux2/model"
     "github.com/miniflux/miniflux2/reader/rewrite"
     "github.com/miniflux/miniflux2/reader/sanitizer"
+    "github.com/miniflux/miniflux2/reader/scraper"
 )
 
 // FeedProcessor handles the processing of feed contents.
@@ -15,6 +18,12 @@ type FeedProcessor struct {
     feed         *model.Feed
     scraperRules string
     rewriteRules string
+    crawler      bool
 }
 
+// WithCrawler enables the crawler.
+func (f *FeedProcessor) WithCrawler(value bool) {
+    f.crawler = value
+}
+
 // WithScraperRules adds scraper rules to the processing.
@@ -30,6 +39,15 @@ func (f *FeedProcessor) WithRewriteRules(rules string) {
 // Process applies rewrite and scraper rules.
 func (f *FeedProcessor) Process() {
     for _, entry := range f.feed.Entries {
+        if f.crawler {
+            content, err := scraper.Fetch(entry.URL, f.scraperRules)
+            if err != nil {
+                log.Println("[FeedProcessor]", err)
+            } else {
+                entry.Content = content
+            }
+        }
+
         entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
         entry.Content = rewrite.Rewriter(entry.URL, entry.Content, f.rewriteRules)
     }
@@ -37,5 +55,5 @@ func (f *FeedProcessor) Process() {
 
 // NewFeedProcessor returns a new FeedProcessor.
 func NewFeedProcessor(feed *model.Feed) *FeedProcessor {
-    return &FeedProcessor{feed: feed}
+    return &FeedProcessor{feed: feed, crawler: false}
 }
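
Put together, a FeedProcessor with the crawler enabled is used roughly as follows (a sketch using only the names visible in the diff; parsedFeed stands for a hypothetical *model.Feed obtained from the feed parser):

    p := NewFeedProcessor(parsedFeed)
    p.WithScraperRules("") // empty rules let the scraper fall back to its predefined per-site rules
    p.WithCrawler(true)    // download the full article content for every entry
    p.Process()            // scrape (when enabled), then sanitize and rewrite each entry

Note that a scraper failure is only logged and the entry keeps its original feed content, so a broken article page never aborts the whole refresh.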


@@ -13,7 +13,6 @@ import (
 
     "github.com/PuerkitoBio/goquery"
     "github.com/miniflux/miniflux2/http"
     "github.com/miniflux/miniflux2/reader/readability"
-    "github.com/miniflux/miniflux2/reader/sanitizer"
     "github.com/miniflux/miniflux2/url"
 )
@@ -34,11 +33,11 @@ func Fetch(websiteURL, rules string) (string, error) {
         return "", err
     }
 
-    var content string
     if rules == "" {
         rules = getPredefinedScraperRules(websiteURL)
     }
 
+    var content string
     if rules != "" {
         log.Printf(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
         content, err = scrapContent(page, rules)
@@ -51,7 +50,7 @@ func Fetch(websiteURL, rules string) (string, error) {
         return "", err
     }
 
-    return sanitizer.Sanitize(websiteURL, content), nil
+    return content, nil
 }
 
 func scrapContent(page io.Reader, rules string) (string, error) {
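
Because Fetch no longer sanitizes its result, callers are responsible for cleaning the returned HTML; FeedProcessor.Process does this by running sanitizer.Sanitize on the entry content right after the crawl, which avoids sanitizing twice. A minimal sketch of calling the scraper directly under that contract (the article URL is only an example, and the log, fmt, scraper and sanitizer packages are assumed to be imported):

    raw, err := scraper.Fetch("https://example.org/article.html", "")
    if err != nil {
        log.Println("[Scraper]", err)
        return
    }
    // Fetch now returns unsanitized markup, so sanitize before storing or rendering it.
    clean := sanitizer.Sanitize("https://example.org/article.html", raw)
    fmt.Println(clean)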