mirror of https://github.com/miniflux/v2.git (synced 2025-08-06 17:41:00 +00:00)

Add the possibility to enable crawler for feeds

commit ef097f02fe (parent 33445e5b68)
22 changed files with 77 additions and 25 deletions
@@ -33,7 +33,7 @@ type Handler struct {
 }
 
 // CreateFeed fetch, parse and store a new feed.
-func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed, error) {
+func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool) (*model.Feed, error) {
 	defer helper.ExecutionTime(time.Now(), fmt.Sprintf("[Handler:CreateFeed] feedUrl=%s", url))
 
 	if !h.store.CategoryExists(userID, categoryID) {
@@ -65,6 +65,7 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed,
 	}
 
 	feedProcessor := processor.NewFeedProcessor(subscription)
+	feedProcessor.WithCrawler(crawler)
 	feedProcessor.Process()
 
 	subscription.Category = &model.Category{ID: categoryID}
@@ -72,6 +73,7 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed,
 	subscription.LastModifiedHeader = response.LastModified
 	subscription.FeedURL = response.EffectiveURL
 	subscription.UserID = userID
+	subscription.Crawler = crawler
 
 	err = h.store.CreateFeed(subscription)
 	if err != nil {
@@ -143,6 +145,7 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error {
 	feedProcessor := processor.NewFeedProcessor(subscription)
 	feedProcessor.WithScraperRules(originalFeed.ScraperRules)
 	feedProcessor.WithRewriteRules(originalFeed.RewriteRules)
+	feedProcessor.WithCrawler(originalFeed.Crawler)
 	feedProcessor.Process()
 
 	originalFeed.EtagHeader = response.ETag
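
The crawler flag enters through CreateFeed, is persisted on the subscription (subscription.Crawler = crawler), and is read back by RefreshFeed via originalFeed.Crawler, so refreshing needs no extra parameter. Below is a call-shape sketch of the updated signature; the reader/feed import path and an already-constructed Handler are assumptions, since neither the file paths nor the UI/API plumbing appear in the hunks above.

// Sketch only: the reader/feed import path and a Handler wired up elsewhere
// are assumed; the callers that pass the new flag are among the changed
// files not shown in this excerpt.
package example

import (
	"log"

	"github.com/miniflux/miniflux2/reader/feed"
)

func createFeedWithCrawler(h *feed.Handler, userID, categoryID int64, feedURL string) {
	// The fourth argument is new in this commit: enable the crawler for this feed.
	subscription, err := h.CreateFeed(userID, categoryID, feedURL, true)
	if err != nil {
		log.Println("[CreateFeed]", err)
		return
	}
	log.Printf("subscribed to %s with crawler enabled", subscription.FeedURL)
}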
@@ -5,9 +5,12 @@
 package processor
 
 import (
+	"log"
+
 	"github.com/miniflux/miniflux2/model"
 	"github.com/miniflux/miniflux2/reader/rewrite"
 	"github.com/miniflux/miniflux2/reader/sanitizer"
+	"github.com/miniflux/miniflux2/reader/scraper"
 )
 
 // FeedProcessor handles the processing of feed contents.
@@ -15,6 +18,12 @@ type FeedProcessor struct {
 	feed         *model.Feed
 	scraperRules string
 	rewriteRules string
+	crawler      bool
 }
 
+// WithCrawler enables the crawler.
+func (f *FeedProcessor) WithCrawler(value bool) {
+	f.crawler = value
+}
+
 // WithScraperRules adds scraper rules to the processing.
@@ -30,6 +39,15 @@ func (f *FeedProcessor) WithRewriteRules(rules string) {
 // Process applies rewrite and scraper rules.
 func (f *FeedProcessor) Process() {
 	for _, entry := range f.feed.Entries {
+		if f.crawler {
+			content, err := scraper.Fetch(entry.URL, f.scraperRules)
+			if err != nil {
+				log.Println("[FeedProcessor]", err)
+			} else {
+				entry.Content = content
+			}
+		}
+
 		entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
 		entry.Content = rewrite.Rewriter(entry.URL, entry.Content, f.rewriteRules)
 	}
@@ -37,5 +55,5 @@ func (f *FeedProcessor) Process() {
 
 // NewFeedProcessor returns a new FeedProcessor.
 func NewFeedProcessor(feed *model.Feed) *FeedProcessor {
-	return &FeedProcessor{feed: feed}
+	return &FeedProcessor{feed: feed, crawler: false}
 }
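
The new branch in Process is the core of the feature: with the crawler enabled, the entry's feed-provided content is replaced by the scraped page, a fetch error only logs and keeps the original content, and in every case the result still passes through sanitization and rewriting. Here is a minimal, self-contained sketch of that control flow, using stub fetch/sanitize/rewrite functions in place of the real miniflux packages:

package main

import (
	"errors"
	"fmt"
)

// Illustrative stand-ins for the entry and processor types; not the real miniflux model.
type entry struct {
	URL     string
	Content string
}

type feedProcessor struct {
	entries []*entry
	crawler bool
}

func (f *feedProcessor) withCrawler(value bool) { f.crawler = value }

// Stubs standing in for scraper.Fetch, sanitizer.Sanitize and rewrite.Rewriter.
func fetchFullContent(url string) (string, error) {
	if url == "" {
		return "", errors.New("empty URL")
	}
	return "<article>full page for " + url + "</article>", nil
}

func sanitize(url, content string) string       { return content } // no-op stub
func rewriteContent(url, content string) string { return content } // no-op stub

func (f *feedProcessor) process() {
	for _, e := range f.entries {
		if f.crawler {
			content, err := fetchFullContent(e.URL)
			if err != nil {
				fmt.Println("[FeedProcessor]", err) // keep the feed's own content on error
			} else {
				e.Content = content
			}
		}
		// Scraped or not, the content is always sanitized and rewritten afterwards.
		e.Content = sanitize(e.URL, e.Content)
		e.Content = rewriteContent(e.URL, e.Content)
	}
}

func main() {
	p := &feedProcessor{entries: []*entry{{URL: "https://example.org/post", Content: "summary only"}}}
	p.withCrawler(true) // mirrors feedProcessor.WithCrawler(crawler) in the handler
	p.process()
	fmt.Println(p.entries[0].Content)
}

Running the sketch replaces the short feed summary with the stub "full page" content before the (no-op) sanitize and rewrite steps, which is exactly the behavior the real Process gains here.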
@@ -13,7 +13,6 @@ import (
 	"github.com/PuerkitoBio/goquery"
 	"github.com/miniflux/miniflux2/http"
 	"github.com/miniflux/miniflux2/reader/readability"
-	"github.com/miniflux/miniflux2/reader/sanitizer"
 	"github.com/miniflux/miniflux2/url"
 )
 
@@ -34,11 +33,11 @@ func Fetch(websiteURL, rules string) (string, error) {
 		return "", err
 	}
 
-	var content string
 	if rules == "" {
 		rules = getPredefinedScraperRules(websiteURL)
 	}
 
+	var content string
 	if rules != "" {
 		log.Printf(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
 		content, err = scrapContent(page, rules)
@@ -51,7 +50,7 @@ func Fetch(websiteURL, rules string) (string, error) {
 		return "", err
 	}
 
-	return sanitizer.Sanitize(websiteURL, content), nil
+	return content, nil
 }
 
 func scrapContent(page io.Reader, rules string) (string, error) {
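
Fetch stops sanitizing its result because FeedProcessor.Process now runs sanitizer.Sanitize on whatever content it ends up with, scraped or not; keeping the call inside Fetch would sanitize the same content twice. A hypothetical direct caller of scraper.Fetch would therefore sanitize the output itself, roughly as sketched below (this snippet is not part of the commit; the package paths and signatures are taken from the hunks above):

package main

import (
	"fmt"
	"log"

	"github.com/miniflux/miniflux2/reader/sanitizer"
	"github.com/miniflux/miniflux2/reader/scraper"
)

func main() {
	websiteURL := "https://example.org/article"

	// Empty rules: Fetch falls back to the predefined per-site rules
	// (getPredefinedScraperRules) seen in the diff above.
	content, err := scraper.Fetch(websiteURL, "")
	if err != nil {
		log.Fatal(err)
	}

	// As of this change, Fetch returns unsanitized HTML; sanitize before use.
	fmt.Println(sanitizer.Sanitize(websiteURL, content))
}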