Mirror of https://github.com/miniflux/v2.git (synced 2025-06-27 16:36:00 +00:00)

feat: implement base element handling in content scraper

Parent: c0f6e32a99
Commit: 29387f2d60

5 changed files with 224 additions and 38 deletions
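When a page declares `<base href="...">` in its `<head>`, relative links and images in the scraped content must be resolved against that base rather than against the URL the page was fetched from, which is why the sanitizer now receives the page's base URL. The following standalone snippet (illustrative only, not part of this commit) shows the resolution rule using Go's standard net/url package:

package main

import (
	"fmt"
	"net/url"
)

func main() {
	// Base URL declared by a <base href="https://example.com/articles/"> element in <head>.
	base, _ := url.Parse("https://example.com/articles/")

	// A relative image reference found in the extracted article markup.
	img, _ := url.Parse("image.jpg")

	// Relative references resolve against the document base, not the request URL.
	fmt.Println(base.ResolveReference(img)) // prints https://example.com/articles/image.jpg
}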
@@ -61,6 +61,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 			entry.URL = cleanedURL
 		}
 
+		pageBaseURL := ""
 		rewrittenURL := rewriteEntryURL(feed, entry)
 		entryIsNew := store.IsNewEntry(feed.ID, entry.Hash)
 		if feed.Crawler && (entryIsNew || forceRefresh) {
@@ -87,12 +88,16 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 			requestBuilder.IgnoreTLSErrors(feed.AllowSelfSignedCertificates)
 			requestBuilder.DisableHTTP2(feed.DisableHTTP2)
 
-			content, scraperErr := scraper.ScrapeWebsite(
+			scrapedPageBaseURL, extractedContent, scraperErr := scraper.ScrapeWebsite(
 				requestBuilder,
 				rewrittenURL,
 				feed.ScraperRules,
 			)
 
+			if scrapedPageBaseURL != "" {
+				pageBaseURL = scrapedPageBaseURL
+			}
+
 			if config.Opts.HasMetricsCollector() {
 				status := "success"
 				if scraperErr != nil {
@@ -109,16 +114,20 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 					slog.String("feed_url", feed.FeedURL),
 					slog.Any("error", scraperErr),
 				)
-			} else if content != "" {
+			} else if extractedContent != "" {
 				// We replace the entry content only if the scraper doesn't return any error.
-				entry.Content = minifyEntryContent(content)
+				entry.Content = minifyEntryContent(extractedContent)
 			}
 		}
 
 		rewrite.Rewriter(rewrittenURL, entry, feed.RewriteRules)
 
+		if pageBaseURL == "" {
+			pageBaseURL = rewrittenURL
+		}
+
 		// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered out.
-		entry.Content = sanitizer.Sanitize(rewrittenURL, entry.Content)
+		entry.Content = sanitizer.Sanitize(pageBaseURL, entry.Content)
 
 		updateEntryReadingTime(store, feed, entry, entryIsNew, user)
 		filteredEntries = append(filteredEntries, entry)
@@ -280,7 +289,7 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
 	requestBuilder.IgnoreTLSErrors(feed.AllowSelfSignedCertificates)
 	requestBuilder.DisableHTTP2(feed.DisableHTTP2)
 
-	content, scraperErr := scraper.ScrapeWebsite(
+	pageBaseURL, extractedContent, scraperErr := scraper.ScrapeWebsite(
 		requestBuilder,
 		rewrittenEntryURL,
 		feed.ScraperRules,
@@ -298,15 +307,15 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
 		return scraperErr
 	}
 
-	if content != "" {
-		entry.Content = minifyEntryContent(content)
+	if extractedContent != "" {
+		entry.Content = minifyEntryContent(extractedContent)
 		if user.ShowReadingTime {
 			entry.ReadingTime = readingtime.EstimateReadingTime(entry.Content, user.DefaultReadingSpeed, user.CJKReadingSpeed)
 		}
 	}
 
 	rewrite.Rewriter(rewrittenEntryURL, entry, entry.Feed.RewriteRules)
-	entry.Content = sanitizer.Sanitize(rewrittenEntryURL, entry.Content)
+	entry.Content = sanitizer.Sanitize(pageBaseURL, entry.Content)
 
 	return nil
 }
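Taken together, these hunks select the base URL in two steps: prefer the value the scraper extracted from the page's <base> element, otherwise fall back to the rewritten entry URL, and hand the result to sanitizer.Sanitize. A condensed sketch of that selection logic follows; resolveBaseURL is a hypothetical helper, since the commit inlines the checks directly in ProcessFeedEntries:

// Hypothetical helper, not present in the commit, condensing the inlined checks above.
func resolveBaseURL(scrapedPageBaseURL, rewrittenURL string) string {
	if scrapedPageBaseURL != "" {
		// An absolute <base href> was found in the fetched document.
		return scrapedPageBaseURL
	}
	// Otherwise resolve relative references against the entry URL itself.
	return rewrittenURL
}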
@@ -12,6 +12,8 @@ import (
 	"regexp"
 	"strings"
 
+	"miniflux.app/v2/internal/urllib"
+
 	"github.com/PuerkitoBio/goquery"
 	"golang.org/x/net/html"
 )
@@ -69,10 +71,17 @@ func (c candidateList) String() string {
 }
 
 // ExtractContent returns relevant content.
-func ExtractContent(page io.Reader) (string, error) {
+func ExtractContent(page io.Reader) (baseURL string, extractedContent string, err error) {
 	document, err := goquery.NewDocumentFromReader(page)
 	if err != nil {
-		return "", err
+		return "", "", err
 	}
 
+	if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
+		hrefValue = strings.TrimSpace(hrefValue)
+		if urllib.IsAbsoluteURL(hrefValue) {
+			baseURL = hrefValue
+		}
+	}
+
 	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
@@ -86,12 +95,13 @@ func ExtractContent(page io.Reader) (string, error) {
 	topCandidate := getTopCandidate(document, candidates)
 
 	slog.Debug("Readability parsing",
+		slog.String("base_url", baseURL),
 		slog.Any("candidates", candidates),
 		slog.Any("topCandidate", topCandidate),
 	)
 
-	output := getArticle(topCandidate, candidates)
-	return output, nil
+	extractedContent = getArticle(topCandidate, candidates)
+	return baseURL, extractedContent, nil
 }
 
 // Now that we have the top candidate, look through its siblings for content that might also be related.
internal/reader/readability/readability_test.go (new file, 102 lines)
@@ -0,0 +1,102 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package readability // import "miniflux.app/v2/internal/reader/readability"
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestBaseURL(t *testing.T) {
+	html := `
+	<html>
+		<head>
+			<base href="https://example.org/ ">
+		</head>
+		<body>
+			<article>
+				Some content
+			</article>
+		</body>
+	</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "https://example.org/" {
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
+	}
+}
+
+func TestMultipleBaseURL(t *testing.T) {
+	html := `
+	<html>
+		<head>
+			<base href="https://example.org/ ">
+			<base href="https://example.com/ ">
+		</head>
+		<body>
+			<article>
+				Some content
+			</article>
+		</body>
+	</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "https://example.org/" {
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
+	}
+}
+
+func TestRelativeBaseURL(t *testing.T) {
+	html := `
+	<html>
+		<head>
+			<base href="/test/ ">
+		</head>
+		<body>
+			<article>
+				Some content
+			</article>
+		</body>
+	</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "" {
+		t.Errorf(`Unexpected base URL, got %q`, baseURL)
+	}
+}
+
+func TestWithoutBaseURL(t *testing.T) {
+	html := `
+	<html>
+		<head>
+			<title>Test</title>
+		</head>
+		<body>
+			<article>
+				Some content
+			</article>
+		</body>
+	</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "" {
+		t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
+	}
+}
@@ -18,72 +18,77 @@ import (
 	"golang.org/x/net/html/charset"
 )
 
-func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, websiteURL, rules string) (string, error) {
-	responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL))
+func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
+	responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(pageURL))
 	defer responseHandler.Close()
 
 	if localizedError := responseHandler.LocalizedError(); localizedError != nil {
-		slog.Warn("Unable to scrape website", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
-		return "", localizedError.Error()
+		slog.Warn("Unable to scrape website", slog.String("website_url", pageURL), slog.Any("error", localizedError.Error()))
+		return "", "", localizedError.Error()
 	}
 
 	if !isAllowedContentType(responseHandler.ContentType()) {
-		return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
+		return "", "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
 	}
 
 	// The entry URL could redirect somewhere else.
-	sameSite := urllib.Domain(websiteURL) == urllib.Domain(responseHandler.EffectiveURL())
-	websiteURL = responseHandler.EffectiveURL()
+	sameSite := urllib.Domain(pageURL) == urllib.Domain(responseHandler.EffectiveURL())
+	pageURL = responseHandler.EffectiveURL()
 
 	if rules == "" {
-		rules = getPredefinedScraperRules(websiteURL)
+		rules = getPredefinedScraperRules(pageURL)
 	}
 
-	var content string
-	var err error
-
 	htmlDocumentReader, err := charset.NewReader(
 		responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
 		responseHandler.ContentType(),
 	)
 	if err != nil {
-		return "", fmt.Errorf("scraper: unable to read HTML document: %v", err)
+		return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
 	}
 
 	if sameSite && rules != "" {
 		slog.Debug("Extracting content with custom rules",
-			"url", websiteURL,
+			"url", pageURL,
 			"rules", rules,
 		)
-		content, err = findContentUsingCustomRules(htmlDocumentReader, rules)
+		baseURL, extractedContent, err = findContentUsingCustomRules(htmlDocumentReader, rules)
 	} else {
 		slog.Debug("Extracting content with readability",
-			"url", websiteURL,
+			"url", pageURL,
 		)
-		content, err = readability.ExtractContent(htmlDocumentReader)
+		baseURL, extractedContent, err = readability.ExtractContent(htmlDocumentReader)
 	}
 
-	if err != nil {
-		return "", err
+	if baseURL == "" {
+		baseURL = pageURL
+	} else {
+		slog.Debug("Using base URL from HTML document", "base_url", baseURL)
 	}
 
-	return content, nil
+	return baseURL, extractedContent, nil
 }
 
-func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
+func findContentUsingCustomRules(page io.Reader, rules string) (baseURL string, extractedContent string, err error) {
 	document, err := goquery.NewDocumentFromReader(page)
 	if err != nil {
-		return "", err
+		return "", "", err
 	}
 
-	contents := ""
+	if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
+		hrefValue = strings.TrimSpace(hrefValue)
+		if urllib.IsAbsoluteURL(hrefValue) {
+			baseURL = hrefValue
+		}
+	}
+
 	document.Find(rules).Each(func(i int, s *goquery.Selection) {
 		if content, err := goquery.OuterHtml(s); err == nil {
-			contents += content
+			extractedContent += content
 		}
 	})
 
-	return contents, nil
+	return baseURL, extractedContent, nil
 }
 
 func getPredefinedScraperRules(websiteURL string) string {
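With the new signature, every caller receives the base URL alongside the extracted markup and is expected to pass it to the sanitizer, as the ProcessEntryWebPage hunk above does. A minimal, hypothetical wrapper showing the calling convention (the wrapper itself is not part of this commit):

// Hypothetical wrapper, not part of this commit, illustrating the new calling convention.
func scrapeAndSanitize(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (string, error) {
	baseURL, extractedContent, err := scraper.ScrapeWebsite(requestBuilder, pageURL, rules)
	if err != nil {
		return "", err
	}
	// baseURL is either the document's own absolute <base href> or the effective page URL,
	// so relative links in the extracted markup are resolved correctly during sanitization.
	return sanitizer.Sanitize(baseURL, extractedContent), nil
}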
@@ -62,7 +62,7 @@ func TestSelectorRules(t *testing.T) {
 			t.Fatalf(`Unable to read file %q: %v`, filename, err)
 		}
 
-		actualResult, err := findContentUsingCustomRules(bytes.NewReader(html), rule)
+		_, actualResult, err := findContentUsingCustomRules(bytes.NewReader(html), rule)
 		if err != nil {
 			t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err)
 		}
@@ -73,7 +73,67 @@ func TestSelectorRules(t *testing.T) {
 		}
 
 		if actualResult != strings.TrimSpace(string(expectedResult)) {
-			t.Errorf(`Unexpected result for %q, got "%s" instead of "%s"`, rule, actualResult, expectedResult)
+			t.Errorf(`Unexpected result for %q, got %q instead of %q`, rule, actualResult, expectedResult)
 		}
 	}
 }
+
+func TestParseBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href="https://example.com/"></head><body><img src="image.jpg"></body></html>`
+	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+	if err != nil {
+		t.Fatalf(`Scraping error: %v`, err)
+	}
+
+	if baseURL != "https://example.com/" {
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.com/"`, baseURL)
+	}
+}
+
+func TestParseMultipleBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href="https://example.com/"><base href="https://example.org/"/></head><body><img src="image.jpg"></body></html>`
+	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+	if err != nil {
+		t.Fatalf(`Scraping error: %v`, err)
+	}
+
+	if baseURL != "https://example.com/" {
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.com/"`, baseURL)
+	}
+}
+
+func TestParseRelativeBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href="/test"></head><body><img src="image.jpg"></body></html>`
+	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+	if err != nil {
+		t.Fatalf(`Scraping error: %v`, err)
+	}
+
+	if baseURL != "" {
+		t.Errorf(`Unexpected base URL, got %q`, baseURL)
+	}
+}
+
+func TestParseEmptyBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href=" "></head><body><img src="image.jpg"></body></html>`
+	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+	if err != nil {
+		t.Fatalf(`Scraping error: %v`, err)
+	}
+
+	if baseURL != "" {
+		t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
+	}
+}
+
+func TestParseMissingBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head></head><body><img src="image.jpg"></body></html>`
+	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+	if err != nil {
+		t.Fatalf(`Scraping error: %v`, err)
+	}
+
+	if baseURL != "" {
+		t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
+	}
+}