diff --git a/internal/reader/processor/processor.go b/internal/reader/processor/processor.go
index 30679d11..7a5bdd33 100644
--- a/internal/reader/processor/processor.go
+++ b/internal/reader/processor/processor.go
@@ -61,6 +61,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
entry.URL = cleanedURL
}
+ pageBaseURL := ""
rewrittenURL := rewriteEntryURL(feed, entry)
entryIsNew := store.IsNewEntry(feed.ID, entry.Hash)
if feed.Crawler && (entryIsNew || forceRefresh) {
@@ -87,12 +88,16 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
requestBuilder.IgnoreTLSErrors(feed.AllowSelfSignedCertificates)
requestBuilder.DisableHTTP2(feed.DisableHTTP2)
- content, scraperErr := scraper.ScrapeWebsite(
+ scrapedPageBaseURL, extractedContent, scraperErr := scraper.ScrapeWebsite(
requestBuilder,
rewrittenURL,
feed.ScraperRules,
)
+ if scrapedPageBaseURL != "" {
+ pageBaseURL = scrapedPageBaseURL
+ }
+
if config.Opts.HasMetricsCollector() {
status := "success"
if scraperErr != nil {
@@ -109,16 +114,20 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
slog.String("feed_url", feed.FeedURL),
slog.Any("error", scraperErr),
)
- } else if content != "" {
+ } else if extractedContent != "" {
// We replace the entry content only if the scraper doesn't return any error.
- entry.Content = minifyEntryContent(content)
+ entry.Content = minifyEntryContent(extractedContent)
}
}
rewrite.Rewriter(rewrittenURL, entry, feed.RewriteRules)
+ if pageBaseURL == "" {
+ pageBaseURL = rewrittenURL
+ }
+
// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered out.
- entry.Content = sanitizer.Sanitize(rewrittenURL, entry.Content)
+ entry.Content = sanitizer.Sanitize(pageBaseURL, entry.Content)
updateEntryReadingTime(store, feed, entry, entryIsNew, user)
filteredEntries = append(filteredEntries, entry)
@@ -280,7 +289,7 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
requestBuilder.IgnoreTLSErrors(feed.AllowSelfSignedCertificates)
requestBuilder.DisableHTTP2(feed.DisableHTTP2)
- content, scraperErr := scraper.ScrapeWebsite(
+ pageBaseURL, extractedContent, scraperErr := scraper.ScrapeWebsite(
requestBuilder,
rewrittenEntryURL,
feed.ScraperRules,
@@ -298,15 +307,15 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
return scraperErr
}
- if content != "" {
- entry.Content = minifyEntryContent(content)
+ if extractedContent != "" {
+ entry.Content = minifyEntryContent(extractedContent)
if user.ShowReadingTime {
entry.ReadingTime = readingtime.EstimateReadingTime(entry.Content, user.DefaultReadingSpeed, user.CJKReadingSpeed)
}
}
rewrite.Rewriter(rewrittenEntryURL, entry, entry.Feed.RewriteRules)
- entry.Content = sanitizer.Sanitize(rewrittenEntryURL, entry.Content)
+ entry.Content = sanitizer.Sanitize(pageBaseURL, entry.Content)
return nil
}
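
The processor changes above thread a `pageBaseURL` value from the scraper through to `sanitizer.Sanitize`, so relative links and images in the extracted content can be resolved against the page's declared `<base href>` rather than the entry URL. On the non-crawler path the scraper never runs, so `pageBaseURL` stays empty and falls back to `rewrittenURL`. Below is a minimal, standard-library sketch of the resolution this enables; the `resolveRelative` helper is illustrative and not part of this patch:

```go
package main

import (
	"fmt"
	"net/url"
)

// resolveRelative resolves a possibly-relative reference against a base URL,
// which is what a sanitizer can do once it knows the page's real base.
// The helper name is illustrative; it is not part of this patch.
func resolveRelative(baseURL, ref string) (string, error) {
	base, err := url.Parse(baseURL)
	if err != nil {
		return "", err
	}
	relative, err := url.Parse(ref)
	if err != nil {
		return "", err
	}
	return base.ResolveReference(relative).String(), nil
}

func main() {
	// With <base href="https://cdn.example.org/articles/">, relative image
	// paths resolve against the declared base, not against the entry URL.
	resolved, err := resolveRelative("https://cdn.example.org/articles/", "images/photo.jpg")
	if err != nil {
		panic(err)
	}
	fmt.Println(resolved) // https://cdn.example.org/articles/images/photo.jpg
}
```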
diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go
index 867a4b21..cacaa604 100644
--- a/internal/reader/readability/readability.go
+++ b/internal/reader/readability/readability.go
@@ -12,6 +12,8 @@ import (
"regexp"
"strings"
+ "miniflux.app/v2/internal/urllib"
+
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html"
)
@@ -69,10 +71,17 @@ func (c candidateList) String() string {
}
// ExtractContent returns relevant content.
-func ExtractContent(page io.Reader) (string, error) {
+func ExtractContent(page io.Reader) (baseURL string, extractedContent string, err error) {
document, err := goquery.NewDocumentFromReader(page)
if err != nil {
- return "", err
+ return "", "", err
+ }
+
+ if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
+ hrefValue = strings.TrimSpace(hrefValue)
+ if urllib.IsAbsoluteURL(hrefValue) {
+ baseURL = hrefValue
+ }
}
document.Find("script,style").Each(func(i int, s *goquery.Selection) {
@@ -86,12 +95,13 @@ func ExtractContent(page io.Reader) (string, error) {
topCandidate := getTopCandidate(document, candidates)
slog.Debug("Readability parsing",
+ slog.String("base_url", baseURL),
slog.Any("candidates", candidates),
slog.Any("topCandidate", topCandidate),
)
- output := getArticle(topCandidate, candidates)
- return output, nil
+ extractedContent = getArticle(topCandidate, candidates)
+ return baseURL, extractedContent, nil
}
// Now that we have the top candidate, look through its siblings for content that might also be related.
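
HTML honors only the first `<base>` element in a document, which is why the lookup above uses `.First()`; the value is kept only when its trimmed `href` is absolute, so a relative or empty `href` leaves `baseURL` as the empty string and lets callers fall back to the fetched page URL. A self-contained sketch of the same lookup, substituting a standard-library absoluteness check for `urllib.IsAbsoluteURL`:

```go
package main

import (
	"fmt"
	"net/url"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// documentBaseURL returns the absolute base URL declared by the first
// <base> element in <head>, or "" when none is declared or it is not absolute.
func documentBaseURL(document *goquery.Document) string {
	if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
		hrefValue = strings.TrimSpace(hrefValue)
		// Stand-in for urllib.IsAbsoluteURL: require a scheme and a host.
		if u, err := url.Parse(hrefValue); err == nil && u.IsAbs() && u.Host != "" {
			return hrefValue
		}
	}
	return ""
}

func main() {
	document, err := goquery.NewDocumentFromReader(strings.NewReader(
		`<html><head><base href="https://example.org/"></head><body></body></html>`))
	if err != nil {
		panic(err)
	}
	fmt.Println(documentBaseURL(document)) // https://example.org/
}
```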
diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go
new file mode 100644
index 00000000..bd47d859
--- /dev/null
+++ b/internal/reader/readability/readability_test.go
@@ -0,0 +1,102 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package readability // import "miniflux.app/v2/internal/reader/readability"
+
+import (
+ "strings"
+ "testing"
+)
+
+func TestBaseURL(t *testing.T) {
+	html := `
+		<html>
+			<head>
+				<base href="https://example.org/">
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+ baseURL, _, err := ExtractContent(strings.NewReader(html))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if baseURL != "https://example.org/" {
+ t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
+ }
+}
+
+func TestMultipleBaseURL(t *testing.T) {
+	html := `
+		<html>
+			<head>
+				<base href="https://example.org/">
+				<base href="https://example.com/">
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+ baseURL, _, err := ExtractContent(strings.NewReader(html))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if baseURL != "https://example.org/" {
+ t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
+ }
+}
+
+func TestRelativeBaseURL(t *testing.T) {
+	html := `
+		<html>
+			<head>
+				<base href="/test/">
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+ baseURL, _, err := ExtractContent(strings.NewReader(html))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if baseURL != "" {
+ t.Errorf(`Unexpected base URL, got %q`, baseURL)
+ }
+}
+
+func TestWithoutBaseURL(t *testing.T) {
+	html := `
+		<html>
+			<head>
+				<title>Test</title>
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+ baseURL, _, err := ExtractContent(strings.NewReader(html))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if baseURL != "" {
+ t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
+ }
+}
diff --git a/internal/reader/scraper/scraper.go b/internal/reader/scraper/scraper.go
index a5013c3d..a200a587 100644
--- a/internal/reader/scraper/scraper.go
+++ b/internal/reader/scraper/scraper.go
@@ -18,72 +18,77 @@ import (
"golang.org/x/net/html/charset"
)
-func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, websiteURL, rules string) (string, error) {
- responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL))
+func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
+ responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(pageURL))
defer responseHandler.Close()
if localizedError := responseHandler.LocalizedError(); localizedError != nil {
- slog.Warn("Unable to scrape website", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
- return "", localizedError.Error()
+ slog.Warn("Unable to scrape website", slog.String("website_url", pageURL), slog.Any("error", localizedError.Error()))
+ return "", "", localizedError.Error()
}
if !isAllowedContentType(responseHandler.ContentType()) {
- return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
+ return "", "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
}
// The entry URL could redirect somewhere else.
- sameSite := urllib.Domain(websiteURL) == urllib.Domain(responseHandler.EffectiveURL())
- websiteURL = responseHandler.EffectiveURL()
+ sameSite := urllib.Domain(pageURL) == urllib.Domain(responseHandler.EffectiveURL())
+ pageURL = responseHandler.EffectiveURL()
if rules == "" {
- rules = getPredefinedScraperRules(websiteURL)
+ rules = getPredefinedScraperRules(pageURL)
}
- var content string
- var err error
-
htmlDocumentReader, err := charset.NewReader(
responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
responseHandler.ContentType(),
)
if err != nil {
- return "", fmt.Errorf("scraper: unable to read HTML document: %v", err)
+ return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
}
if sameSite && rules != "" {
slog.Debug("Extracting content with custom rules",
- "url", websiteURL,
+ "url", pageURL,
"rules", rules,
)
- content, err = findContentUsingCustomRules(htmlDocumentReader, rules)
+ baseURL, extractedContent, err = findContentUsingCustomRules(htmlDocumentReader, rules)
} else {
slog.Debug("Extracting content with readability",
- "url", websiteURL,
+ "url", pageURL,
)
- content, err = readability.ExtractContent(htmlDocumentReader)
+ baseURL, extractedContent, err = readability.ExtractContent(htmlDocumentReader)
}
if err != nil {
- return "", err
+ return "", "", err
+ }
+
+ if baseURL == "" {
+ baseURL = pageURL
+ } else {
+ slog.Debug("Using base URL from HTML document", "base_url", baseURL)
}

- return content, nil
+ return baseURL, extractedContent, nil
}
-func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
+func findContentUsingCustomRules(page io.Reader, rules string) (baseURL string, extractedContent string, err error) {
document, err := goquery.NewDocumentFromReader(page)
if err != nil {
- return "", err
+ return "", "", err
+ }
+
+ if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
+ hrefValue = strings.TrimSpace(hrefValue)
+ if urllib.IsAbsoluteURL(hrefValue) {
+ baseURL = hrefValue
+ }
}
- contents := ""
document.Find(rules).Each(func(i int, s *goquery.Selection) {
if content, err := goquery.OuterHtml(s); err == nil {
- contents += content
+ extractedContent += content
}
})
- return contents, nil
+ return baseURL, extractedContent, nil
}
func getPredefinedScraperRules(websiteURL string) string {
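
With the fallback at the end of `ScrapeWebsite`, callers always receive a usable base URL: the document's own absolute `<base href>` when one is declared, otherwise the effective page URL observed after redirects. A condensed sketch of that decision; the `effectiveBaseURL` helper is an illustrative name, since the patch inlines this logic:

```go
// effectiveBaseURL mirrors the fallback in ScrapeWebsite: prefer the base URL
// declared by the document, otherwise use the page URL seen after redirects.
func effectiveBaseURL(declaredBaseURL, pageURL string) string {
	if declaredBaseURL != "" {
		return declaredBaseURL
	}
	return pageURL
}
```

Because the fallback lives inside the scraper, `ProcessEntryWebPage` can pass the returned `pageBaseURL` straight to `sanitizer.Sanitize`, while `ProcessFeedEntries` still needs its own empty-string check for the non-crawler path where `ScrapeWebsite` is never called.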
diff --git a/internal/reader/scraper/scraper_test.go b/internal/reader/scraper/scraper_test.go
index bf786129..ad1e98ff 100644
--- a/internal/reader/scraper/scraper_test.go
+++ b/internal/reader/scraper/scraper_test.go
@@ -62,7 +62,7 @@ func TestSelectorRules(t *testing.T) {
t.Fatalf(`Unable to read file %q: %v`, filename, err)
}
- actualResult, err := findContentUsingCustomRules(bytes.NewReader(html), rule)
+ _, actualResult, err := findContentUsingCustomRules(bytes.NewReader(html), rule)
if err != nil {
t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err)
}
@@ -73,7 +73,67 @@ func TestSelectorRules(t *testing.T) {
}
if actualResult != strings.TrimSpace(string(expectedResult)) {
- t.Errorf(`Unexpected result for %q, got "%s" instead of "%s"`, rule, actualResult, expectedResult)
+ t.Errorf(`Unexpected result for %q, got %q instead of %q`, rule, actualResult, expectedResult)
}
}
}
+
+func TestParseBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href="https://example.com/"></head><body><img src="image.jpg" alt="Image"></body></html>`
+ baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+ if err != nil {
+ t.Fatalf(`Scraping error: %v`, err)
+ }
+
+ if baseURL != "https://example.com/" {
+ t.Errorf(`Unexpected base URL, got %q instead of "https://example.com/"`, baseURL)
+ }
+}
+
+func TestParseMultipleBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href="https://example.com/"><base href="https://example.org/"></head><body><img src="image.jpg" alt="Image"></body></html>`
+ baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+ if err != nil {
+ t.Fatalf(`Scraping error: %v`, err)
+ }
+
+ if baseURL != "https://example.com/" {
+ t.Errorf(`Unexpected base URL, got %q instead of "https://example.com/"`, baseURL)
+ }
+}
+
+func TestParseRelativeBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href="/test/"></head><body><img src="image.jpg" alt="Image"></body></html>`
+ baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+ if err != nil {
+ t.Fatalf(`Scraping error: %v`, err)
+ }
+
+ if baseURL != "" {
+ t.Errorf(`Unexpected base URL, got %q`, baseURL)
+ }
+}
+
+func TestParseEmptyBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href=""></head><body><img src="image.jpg" alt="Image"></body></html>`
+ baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+ if err != nil {
+ t.Fatalf(`Scraping error: %v`, err)
+ }
+
+ if baseURL != "" {
+ t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
+ }
+}
+
+func TestParseMissingBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><title>Test</title></head><body><img src="image.jpg" alt="Image"></body></html>`
+ baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+ if err != nil {
+ t.Fatalf(`Scraping error: %v`, err)
+ }
+
+ if baseURL != "" {
+ t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
+ }
+}