1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00
This commit is contained in:
Julien Voisin 2025-03-16 12:34:54 -03:00 committed by GitHub
commit e4215906a9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 122 additions and 8 deletions

View file

@ -5,6 +5,7 @@ package subscription // import "miniflux.app/v2/internal/reader/subscription"
import (
"bytes"
"encoding/xml"
"fmt"
"io"
"log/slog"
@ -124,6 +125,14 @@ func (f *SubscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string)
slog.Debug("Subscriptions found with well-known URLs", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions))
return subscriptions, nil
}
// Step 7) Check if the website has feeds in its sitemap.
slog.Debug("Try to detect feeds from sitemap", slog.String("website_url", websiteURL))
if subscriptions, localizedError := f.FindSubscriptionsFromSitemap(websiteURL); localizedError != nil {
return nil, localizedError
} else if len(subscriptions) > 0 {
slog.Debug("Subscriptions found with sitemap", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions))
return subscriptions, nil
}
return nil, nil
}
@ -189,14 +198,16 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL, contentTyp
func (f *SubscriptionFinder) FindSubscriptionsFromWellKnownURLs(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) {
knownURLs := map[string]string{
"atom.xml": parser.FormatAtom,
"feed.xml": parser.FormatAtom,
"feed/": parser.FormatAtom,
"rss.xml": parser.FormatRSS,
"rss/": parser.FormatRSS,
"index.rss": parser.FormatRSS,
"index.xml": parser.FormatRSS,
"feed.atom": parser.FormatAtom,
"atom.xml": parser.FormatAtom,
"feed.xml": parser.FormatAtom,
"feed": parser.FormatAtom,
"rss.xml": parser.FormatRSS,
"rss": parser.FormatRSS,
"index.rss": parser.FormatRSS,
"index.xml": parser.FormatRSS,
"feed.atom": parser.FormatAtom,
"atom": parser.FormatAtom,
"index.atom": parser.FormatAtom,
}
websiteURLRoot := urllib.RootURL(websiteURL)
@ -324,3 +335,66 @@ func (f *SubscriptionFinder) FindSubscriptionsFromYouTubePlaylistPage(websiteURL
return nil, nil
}
// FindSubscriptionsFromSitemap requests "/sitemap.xml" appended to the value
// of urllib.RootURL(websiteURL) and hands the downloaded document to
// findSubscriptionsFromDownloadedSitemap.
//
// A failed request or a failure while reading the response body is logged at
// warning level and returned to the caller as a localized error.
func (f *SubscriptionFinder) FindSubscriptionsFromSitemap(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) {
	sitemapURL := urllib.RootURL(websiteURL) + "/sitemap.xml"

	handler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(sitemapURL))
	defer handler.Close()

	if fetchErr := handler.LocalizedError(); fetchErr != nil {
		slog.Warn("Unable to find subscriptions",
			slog.String("website_url", websiteURL),
			slog.Any("error", fetchErr.Error()),
		)
		return nil, fetchErr
	}

	// The body size is capped by the configured HTTP client limit.
	sitemap, readErr := handler.ReadBody(config.Opts.HTTPClientMaxBodySize())
	if readErr != nil {
		slog.Warn("Unable to find subscriptions",
			slog.String("website_url", websiteURL),
			slog.Any("error", readErr.Error()),
		)
		return nil, readErr
	}

	return findSubscriptionsFromDownloadedSitemap(bytes.NewReader(sitemap))
}
// findSubscriptionsFromDownloadedSitemap scans an XML sitemap read from body
// and returns one subscription for every <loc> element whose text looks like
// a feed URL.
//
// The match is a plain substring heuristic, evaluated in this order:
//   - text containing ".xml" or "rss" is reported as parser.FormatRSS;
//   - otherwise, text containing "feed" or "atom" is reported as parser.FormatAtom.
//
// The URL is also used as the subscription title. Parsing is best effort: a
// malformed document is logged and whatever was collected before the error is
// returned, so the returned error is always nil.
func findSubscriptionsFromDownloadedSitemap(body io.Reader) (Subscriptions, *locale.LocalizedErrorWrapper) {
	var subscriptions Subscriptions

	decoder := xml.NewDecoder(body)
	for {
		token, err := decoder.Token()
		if err != nil && err != io.EOF {
			// Do not fail the whole discovery on a broken sitemap, but make
			// the truncated parse visible instead of swallowing it.
			slog.Warn("Unable to parse sitemap", slog.Any("error", err))
		}
		if err != nil || token == nil {
			break
		}

		element, isStart := token.(xml.StartElement)
		if !isStart || element.Name.Local != "loc" {
			continue
		}

		// Use a fresh value per element so that a failed decode can never
		// leak the URL of a previously decoded <loc>.
		var loc struct {
			Content string `xml:",chardata"`
		}
		if err := decoder.DecodeElement(&loc, &element); err != nil {
			slog.Warn("Unable to decode loc", slog.Any("error", err))
			continue
		}

		// Pretty-printed sitemaps may wrap the URL in whitespace or newlines.
		feedURL := strings.TrimSpace(loc.Content)

		switch {
		case strings.Contains(feedURL, ".xml"),
			strings.Contains(feedURL, "rss"):
			subscriptions = append(subscriptions, &Subscription{
				Type:  parser.FormatRSS,
				Title: feedURL,
				URL:   feedURL,
			})
		case strings.Contains(feedURL, "feed"),
			strings.Contains(feedURL, "atom"):
			subscriptions = append(subscriptions, &Subscription{
				Type:  parser.FormatAtom,
				Title: feedURL,
				URL:   feedURL,
			})
		}
	}

	return subscriptions, nil
}

View file

@ -481,3 +481,43 @@ func TestParseWebPageWithNoHref(t *testing.T) {
t.Fatal(`Incorrect number of subscriptions returned`)
}
}
// TestParseSiteMap feeds a small sitemap to
// findSubscriptionsFromDownloadedSitemap and checks that only the <loc>
// entries that look like feeds are returned, in document order, with the URL
// used as both the subscription URL and its title.
func TestParseSiteMap(t *testing.T) {
	sitemap := `
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://www.example.com/</loc>
<lastmod>2005-01-01</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>http://www.example.com/feed/myfeed</loc>
<lastmod>2005-01-01</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>http://www.example.com/myfeed.xml</loc>
<lastmod>2005-01-01</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>http://www.example.com/atom_feed.xml</loc>
<lastmod>2005-01-01</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
</urlset> `

	// The site root is not a feed candidate and must be skipped.
	expectedURLs := []string{
		"http://www.example.com/feed/myfeed",
		"http://www.example.com/myfeed.xml",
		"http://www.example.com/atom_feed.xml",
	}

	subscriptions, err := findSubscriptionsFromDownloadedSitemap(strings.NewReader(sitemap))
	if err != nil {
		t.Fatalf(`Parsing a correctly formatted sitemap should not return any error: %v`, err)
	}

	if len(subscriptions) != len(expectedURLs) {
		t.Fatalf(`Incorrect number of subscriptions returned: got %d, want %d`, len(subscriptions), len(expectedURLs))
	}

	for i, expectedURL := range expectedURLs {
		if subscriptions[i].URL != expectedURL {
			t.Errorf(`Unexpected URL for subscription #%d: got %q, want %q`, i, subscriptions[i].URL, expectedURL)
		}
		if subscriptions[i].Title != expectedURL {
			t.Errorf(`Unexpected title for subscription #%d: got %q, want %q`, i, subscriptions[i].Title, expectedURL)
		}
	}
}