diff --git a/internal/reader/processor/processor.go b/internal/reader/processor/processor.go
index 30679d11..7a5bdd33 100644
--- a/internal/reader/processor/processor.go
+++ b/internal/reader/processor/processor.go
@@ -61,6 +61,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 			entry.URL = cleanedURL
 		}
 
+		pageBaseURL := ""
 		rewrittenURL := rewriteEntryURL(feed, entry)
 		entryIsNew := store.IsNewEntry(feed.ID, entry.Hash)
 		if feed.Crawler && (entryIsNew || forceRefresh) {
@@ -87,12 +88,16 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 			requestBuilder.IgnoreTLSErrors(feed.AllowSelfSignedCertificates)
 			requestBuilder.DisableHTTP2(feed.DisableHTTP2)
 
-			content, scraperErr := scraper.ScrapeWebsite(
+			scrapedPageBaseURL, extractedContent, scraperErr := scraper.ScrapeWebsite(
 				requestBuilder,
 				rewrittenURL,
 				feed.ScraperRules,
 			)
 
+			if scrapedPageBaseURL != "" {
+				pageBaseURL = scrapedPageBaseURL
+			}
+
 			if config.Opts.HasMetricsCollector() {
 				status := "success"
 				if scraperErr != nil {
@@ -109,16 +114,20 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 					slog.String("feed_url", feed.FeedURL),
 					slog.Any("error", scraperErr),
 				)
-			} else if content != "" {
+			} else if extractedContent != "" {
 				// We replace the entry content only if the scraper doesn't return any error.
-				entry.Content = minifyEntryContent(content)
+				entry.Content = minifyEntryContent(extractedContent)
 			}
 		}
 
 		rewrite.Rewriter(rewrittenURL, entry, feed.RewriteRules)
 
+		if pageBaseURL == "" {
+			pageBaseURL = rewrittenURL
+		}
+
 		// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered out.
-		entry.Content = sanitizer.Sanitize(rewrittenURL, entry.Content)
+		entry.Content = sanitizer.Sanitize(pageBaseURL, entry.Content)
 
 		updateEntryReadingTime(store, feed, entry, entryIsNew, user)
 		filteredEntries = append(filteredEntries, entry)
@@ -280,7 +289,7 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
 	requestBuilder.IgnoreTLSErrors(feed.AllowSelfSignedCertificates)
 	requestBuilder.DisableHTTP2(feed.DisableHTTP2)
 
-	content, scraperErr := scraper.ScrapeWebsite(
+	pageBaseURL, extractedContent, scraperErr := scraper.ScrapeWebsite(
 		requestBuilder,
 		rewrittenEntryURL,
 		feed.ScraperRules,
@@ -298,15 +307,15 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
 		return scraperErr
 	}
 
-	if content != "" {
-		entry.Content = minifyEntryContent(content)
+	if extractedContent != "" {
+		entry.Content = minifyEntryContent(extractedContent)
 		if user.ShowReadingTime {
 			entry.ReadingTime = readingtime.EstimateReadingTime(entry.Content, user.DefaultReadingSpeed, user.CJKReadingSpeed)
 		}
 	}
 
 	rewrite.Rewriter(rewrittenEntryURL, entry, entry.Feed.RewriteRules)
-	entry.Content = sanitizer.Sanitize(rewrittenEntryURL, entry.Content)
+	entry.Content = sanitizer.Sanitize(pageBaseURL, entry.Content)
 
 	return nil
 }
diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go
index 867a4b21..cacaa604 100644
--- a/internal/reader/readability/readability.go
+++ b/internal/reader/readability/readability.go
@@ -12,6 +12,8 @@ import (
 	"regexp"
 	"strings"
 
+	"miniflux.app/v2/internal/urllib"
+
 	"github.com/PuerkitoBio/goquery"
 	"golang.org/x/net/html"
 )
@@ -69,10 +71,17 @@ func (c candidateList) String() string {
 }
 
 // ExtractContent returns relevant content.
-func ExtractContent(page io.Reader) (string, error) {
+func ExtractContent(page io.Reader) (baseURL string, extractedContent string, err error) {
 	document, err := goquery.NewDocumentFromReader(page)
 	if err != nil {
-		return "", err
+		return "", "", err
+	}
+
+	if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
+		hrefValue = strings.TrimSpace(hrefValue)
+		if urllib.IsAbsoluteURL(hrefValue) {
+			baseURL = hrefValue
+		}
 	}
 
 	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
@@ -86,12 +95,13 @@ func ExtractContent(page io.Reader) (string, error) {
 	topCandidate := getTopCandidate(document, candidates)
 
 	slog.Debug("Readability parsing",
+		slog.String("base_url", baseURL),
 		slog.Any("candidates", candidates),
 		slog.Any("topCandidate", topCandidate),
 	)
 
-	output := getArticle(topCandidate, candidates)
-	return output, nil
+	extractedContent = getArticle(topCandidate, candidates)
+	return baseURL, extractedContent, nil
 }
 
 // Now that we have the top candidate, look through its siblings for content that might also be related.
diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go
new file mode 100644
index 00000000..bd47d859
--- /dev/null
+++ b/internal/reader/readability/readability_test.go
@@ -0,0 +1,102 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package readability // import "miniflux.app/v2/internal/reader/readability"
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestBaseURL(t *testing.T) {
+	html := `
+		<html>
+			<head>
+				<base href="https://example.org/">
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "https://example.org/" {
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
+	}
+}
+
+func TestMultipleBaseURL(t *testing.T) {
+	html := `
+		<html>
+			<head>
+				<base href="https://example.org/">
+				<base href="https://example.com/">
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "https://example.org/" {
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
+	}
+}
+
+func TestRelativeBaseURL(t *testing.T) {
+	html := `
+		<html>
+			<head>
+				<base href="/test/">
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "" {
+		t.Errorf(`Unexpected base URL, got %q`, baseURL)
+	}
+}
+
+func TestWithoutBaseURL(t *testing.T) {
+	html := `
+		<html>
+			<head>
+				<title>Test</title>
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "" {
+		t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
+	}
+}
diff --git a/internal/reader/scraper/scraper.go b/internal/reader/scraper/scraper.go
index a5013c3d..a200a587 100644
--- a/internal/reader/scraper/scraper.go
+++ b/internal/reader/scraper/scraper.go
@@ -18,72 +18,77 @@ import (
 	"golang.org/x/net/html/charset"
 )
 
-func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, websiteURL, rules string) (string, error) {
-	responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL))
+func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
+	responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(pageURL))
 	defer responseHandler.Close()
 
 	if localizedError := responseHandler.LocalizedError(); localizedError != nil {
-		slog.Warn("Unable to scrape website", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
-		return "", localizedError.Error()
+		slog.Warn("Unable to scrape website", slog.String("website_url", pageURL), slog.Any("error", localizedError.Error()))
+		return "", "", localizedError.Error()
 	}
 
 	if !isAllowedContentType(responseHandler.ContentType()) {
-		return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
+		return "", "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
 	}
 
 	// The entry URL could redirect somewhere else.
-	sameSite := urllib.Domain(websiteURL) == urllib.Domain(responseHandler.EffectiveURL())
-	websiteURL = responseHandler.EffectiveURL()
+	sameSite := urllib.Domain(pageURL) == urllib.Domain(responseHandler.EffectiveURL())
+	pageURL = responseHandler.EffectiveURL()
 
 	if rules == "" {
-		rules = getPredefinedScraperRules(websiteURL)
+		rules = getPredefinedScraperRules(pageURL)
 	}
 
-	var content string
-	var err error
-
 	htmlDocumentReader, err := charset.NewReader(
 		responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
 		responseHandler.ContentType(),
 	)
 	if err != nil {
-		return "", fmt.Errorf("scraper: unable to read HTML document: %v", err)
+		return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
 	}
 
 	if sameSite && rules != "" {
 		slog.Debug("Extracting content with custom rules",
-			"url", websiteURL,
+			"url", pageURL,
 			"rules", rules,
 		)
-		content, err = findContentUsingCustomRules(htmlDocumentReader, rules)
+		baseURL, extractedContent, err = findContentUsingCustomRules(htmlDocumentReader, rules)
 	} else {
 		slog.Debug("Extracting content with readability",
-			"url", websiteURL,
+			"url", pageURL,
 		)
-		content, err = readability.ExtractContent(htmlDocumentReader)
+		baseURL, extractedContent, err = readability.ExtractContent(htmlDocumentReader)
 	}
 
-	if err != nil {
-		return "", err
+	if baseURL == "" {
+		baseURL = pageURL
+	} else {
+		slog.Debug("Using base URL from HTML document", "base_url", baseURL)
 	}
 
-	return content, nil
+	return baseURL, extractedContent, nil
 }
 
-func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
+func findContentUsingCustomRules(page io.Reader, rules string) (baseURL string, extractedContent string, err error) {
 	document, err := goquery.NewDocumentFromReader(page)
 	if err != nil {
-		return "", err
+		return "", "", err
+	}
+
+	if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
+		hrefValue = strings.TrimSpace(hrefValue)
+		if urllib.IsAbsoluteURL(hrefValue) {
+			baseURL = hrefValue
+		}
 	}
 
-	contents := ""
 	document.Find(rules).Each(func(i int, s *goquery.Selection) {
 		if content, err := goquery.OuterHtml(s); err == nil {
-			contents += content
+			extractedContent += content
 		}
 	})
 
-	return contents, nil
+	return baseURL, extractedContent, nil
 }
 
 func getPredefinedScraperRules(websiteURL string) string {
diff --git a/internal/reader/scraper/scraper_test.go b/internal/reader/scraper/scraper_test.go
index bf786129..ad1e98ff 100644
--- a/internal/reader/scraper/scraper_test.go
+++ b/internal/reader/scraper/scraper_test.go
@@ -62,7 +62,7 @@ func TestSelectorRules(t *testing.T) {
 			t.Fatalf(`Unable to read file %q: %v`, filename, err)
 		}
 
-		actualResult, err := findContentUsingCustomRules(bytes.NewReader(html), rule)
+		_, actualResult, err := findContentUsingCustomRules(bytes.NewReader(html), rule)
 		if err != nil {
 			t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err)
 		}
@@ -73,7 +73,67 @@ func TestSelectorRules(t *testing.T) {
 		}
 
 		if actualResult != strings.TrimSpace(string(expectedResult)) {
-			t.Errorf(`Unexpected result for %q, got "%s" instead of "%s"`, rule, actualResult, expectedResult)
+			t.Errorf(`Unexpected result for %q, got %q instead of %q`, rule, actualResult, expectedResult)
 		}
 	}
 }
+
+func TestParseBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href="https://example.com/"></head><body><img src="image.jpg" alt="Image"></body></html>`
+	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+	if err != nil {
+		t.Fatalf(`Scraping error: %v`, err)
+	}
+
+	if baseURL != "https://example.com/" {
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.com/"`, baseURL)
+	}
+}
+
+func TestParseMultipleBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href="https://example.com/"><base href="https://example.org/"></head><body><img src="image.jpg" alt="Image"></body></html>`
+	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+	if err != nil {
+		t.Fatalf(`Scraping error: %v`, err)
+	}
+
+	if baseURL != "https://example.com/" {
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.com/"`, baseURL)
+	}
+}
+
+func TestParseRelativeBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href="/test/"></head><body><img src="image.jpg" alt="Image"></body></html>`
+	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+	if err != nil {
+		t.Fatalf(`Scraping error: %v`, err)
+	}
+
+	if baseURL != "" {
+		t.Errorf(`Unexpected base URL, got %q`, baseURL)
+	}
+}
+
+func TestParseEmptyBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href=""></head><body><img src="image.jpg" alt="Image"></body></html>`
+	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+	if err != nil {
+		t.Fatalf(`Scraping error: %v`, err)
+	}
+
+	if baseURL != "" {
+		t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
+	}
+}
+
+func TestParseMissingBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head></head><body><img src="image.jpg" alt="Image"></body></html>`
+	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+	if err != nil {
+		t.Fatalf(`Scraping error: %v`, err)
+	}
+
+	if baseURL != "" {
+		t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
+	}
+}