From 9c9a0c5a02787e8ebd70bb06fabb5009c9be2166 Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Mon, 9 Dec 2024 14:31:58 +0100
Subject: [PATCH] Find feeds via sitemap

---
Review notes (this section is ignored when the patch is applied):
- Sitemap parsing decodes every <loc> into a fresh variable and skips the
  element when decoding fails, so a broken entry can no longer re-emit the
  previous URL.
- XML tokenizer errors other than io.EOF are logged instead of discarded.
- <loc> values are trimmed, and "atom" is tested before the generic ".xml".
- The sitemap fixture in the test was rebuilt with its XML tags.
- The blob ids on the "index" lines are those of the original submission.

 internal/reader/subscription/finder.go      | 109 ++++++++++++++++++--
 internal/reader/subscription/finder_test.go |  40 +++++++
 2 files changed, 141 insertions(+), 8 deletions(-)

diff --git a/internal/reader/subscription/finder.go b/internal/reader/subscription/finder.go
index 945fd1c6..b87a3e24 100644
--- a/internal/reader/subscription/finder.go
+++ b/internal/reader/subscription/finder.go
@@ -5,6 +5,7 @@ package subscription // import "miniflux.app/v2/internal/reader/subscription"
 
 import (
 	"bytes"
+	"encoding/xml"
 	"fmt"
 	"io"
 	"log/slog"
@@ -125,6 +126,14 @@ func (f *SubscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string)
 		slog.Debug("Subscriptions found with well-known URLs", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions))
 		return subscriptions, nil
 	}
+	// Step 7) Check if the website has feeds in its sitemap.
+	slog.Debug("Try to detect feeds from sitemap", slog.String("website_url", websiteURL))
+	if subscriptions, localizedError := f.FindSubscriptionsFromSitemap(websiteURL); localizedError != nil {
+		return nil, localizedError
+	} else if len(subscriptions) > 0 {
+		slog.Debug("Subscriptions found with sitemap", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions))
+		return subscriptions, nil
+	}
 
 	return nil, nil
 }
@@ -190,14 +199,16 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL, contentTyp
 
 func (f *SubscriptionFinder) FindSubscriptionsFromWellKnownURLs(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) {
 	knownURLs := map[string]string{
-		"atom.xml":  parser.FormatAtom,
-		"feed.xml":  parser.FormatAtom,
-		"feed/":     parser.FormatAtom,
-		"rss.xml":   parser.FormatRSS,
-		"rss/":      parser.FormatRSS,
-		"index.rss": parser.FormatRSS,
-		"index.xml": parser.FormatRSS,
-		"feed.atom": parser.FormatAtom,
+		"atom.xml":   parser.FormatAtom,
+		"feed.xml":   parser.FormatAtom,
+		"feed":       parser.FormatAtom,
+		"rss.xml":    parser.FormatRSS,
+		"rss":        parser.FormatRSS,
+		"index.rss":  parser.FormatRSS,
+		"index.xml":  parser.FormatRSS,
+		"feed.atom":  parser.FormatAtom,
+		"atom":       parser.FormatAtom,
+		"index.atom": parser.FormatAtom,
 	}
 
 	websiteURLRoot := urllib.RootURL(websiteURL)
@@ -316,3 +327,85 @@ func (f *SubscriptionFinder) FindSubscriptionsFromYouTubePlaylistPage(websiteURL
 
 	return nil, nil
 }
+
+// FindSubscriptionsFromSitemap downloads "/sitemap.xml" relative to the root of
+// websiteURL and returns the URLs listed in it that look like feeds.
+func (f *SubscriptionFinder) FindSubscriptionsFromSitemap(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) {
+	websiteURLRoot := urllib.RootURL(websiteURL)
+
+	responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(websiteURLRoot + "/sitemap.xml"))
+	defer responseHandler.Close()
+
+	// NOTE(review): FindSubscriptions returns this error to its caller. If a
+	// missing sitemap (e.g. HTTP 404) is reported as an error here, feed
+	// discovery fails instead of yielding no subscriptions - TODO confirm.
+	if localizedError := responseHandler.LocalizedError(); localizedError != nil {
+		slog.Warn("Unable to find subscriptions", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
+		return nil, localizedError
+	}
+
+	responseBody, localizedError := responseHandler.ReadBody(config.Opts.HTTPClientMaxBodySize())
+	if localizedError != nil {
+		slog.Warn("Unable to find subscriptions", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
+		return nil, localizedError
+	}
+
+	return findSubscriptionsFromDownloadedSitemap(bytes.NewReader(responseBody))
+}
+
+// findSubscriptionsFromDownloadedSitemap scans an XML sitemap for <loc> elements
+// and returns one subscription for each URL that looks like a feed.
+//
+// The feed type is guessed from substrings of the URL. Parsing is best effort:
+// an XML error stops the scan and the subscriptions collected so far are
+// returned; the returned error is always nil.
+func findSubscriptionsFromDownloadedSitemap(body io.Reader) (Subscriptions, *locale.LocalizedErrorWrapper) {
+	var subscriptions Subscriptions
+
+	decoder := xml.NewDecoder(body)
+	for {
+		token, err := decoder.Token()
+		if err != nil {
+			if err != io.EOF {
+				slog.Debug("Unable to parse sitemap", slog.Any("error", err))
+			}
+			break
+		}
+
+		element, isStartElement := token.(xml.StartElement)
+		if !isStartElement || element.Name.Local != "loc" {
+			continue
+		}
+
+		// Decode into a fresh variable and skip the element on failure, so a
+		// broken <loc> never re-emits the URL of a previous one.
+		var loc string
+		if err := decoder.DecodeElement(&loc, &element); err != nil {
+			slog.Warn("Unable to decode loc", slog.Any("error", err))
+			continue
+		}
+
+		// Pretty-printed sitemaps may surround the URL with whitespace.
+		feedURL := strings.TrimSpace(loc)
+
+		// "atom" is tested before the generic ".xml" so that URLs such as
+		// /atom.xml are not labelled as RSS.
+		var subscription *Subscription
+		switch {
+		case strings.Contains(feedURL, "atom"):
+			subscription = &Subscription{Type: parser.FormatAtom}
+		case strings.Contains(feedURL, ".xml"), strings.Contains(feedURL, "rss"):
+			subscription = &Subscription{Type: parser.FormatRSS}
+		case strings.Contains(feedURL, "feed"):
+			subscription = &Subscription{Type: parser.FormatAtom}
+		default:
+			continue
+		}
+
+		subscription.Title = feedURL
+		subscription.URL = feedURL
+		subscriptions = append(subscriptions, subscription)
+	}
+
+	return subscriptions, nil
+}
diff --git a/internal/reader/subscription/finder_test.go b/internal/reader/subscription/finder_test.go
index c394a239..8448adc1 100644
--- a/internal/reader/subscription/finder_test.go
+++ b/internal/reader/subscription/finder_test.go
@@ -481,3 +481,43 @@ func TestParseWebPageWithNoHref(t *testing.T) {
 		t.Fatal(`Incorrect number of subscriptions returned`)
 	}
 }
+
+func TestParseSiteMap(t *testing.T) {
+	sitemap := `
+	<?xml version="1.0" encoding="UTF-8"?>
+	<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+		<url>
+			<loc>http://www.example.com/</loc>
+			<lastmod>2005-01-01</lastmod>
+			<changefreq>monthly</changefreq>
+			<priority>0.8</priority>
+		</url>
+		<url>
+			<loc>http://www.example.com/feed/myfeed</loc>
+			<lastmod>2005-01-01</lastmod>
+			<changefreq>monthly</changefreq>
+			<priority>0.8</priority>
+		</url>
+		<url>
+			<loc>http://www.example.com/myfeed.xml</loc>
+			<lastmod>2005-01-01</lastmod>
+			<changefreq>monthly</changefreq>
+			<priority>0.8</priority>
+		</url>
+		<url>
+			<loc>http://www.example.com/atom_feed.xml</loc>
+			<lastmod>2005-01-01</lastmod>
+			<changefreq>monthly</changefreq>
+			<priority>0.8</priority>
+		</url>
+	</urlset>`
+
+	subscriptions, err := findSubscriptionsFromDownloadedSitemap(strings.NewReader(sitemap))
+	if err != nil {
+		t.Fatalf(`Parsing a correctly formatted sitemap should not return any error: %v`, err)
+	}
+
+	if len(subscriptions) != 3 {
+		t.Fatal(`Incorrect number of subscriptions returned`)
+	}
+}