Mirror of https://github.com/miniflux/v2.git (synced 2025-06-27 16:36:00 +00:00)
feat: implement base element handling in content scraper
Commit: 29387f2d60
Parent: c0f6e32a99
5 changed files with 224 additions and 38 deletions
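The HTML <base> element overrides the URL against which relative links and image sources in a document are resolved, so a scraper that later rewrites and sanitizes that content needs to honour it instead of always resolving against the entry URL. The snippet below is not part of this commit; it is a minimal standard-library sketch, with made-up URLs, of the resolution step that only works correctly when the sanitizer is handed the page's declared base URL:

package main

import (
	"fmt"
	"net/url"
)

func main() {
	// A page declares a base that differs from the URL it was fetched from.
	base, _ := url.Parse("https://static.example.org/assets/")
	img, _ := url.Parse("cover.jpg") // relative src found in the scraped content

	// Resolved against the declared <base>: https://static.example.org/assets/cover.jpg
	fmt.Println(base.ResolveReference(img))

	// Resolved against the entry URL instead, the reference points elsewhere:
	// https://example.org/articles/cover.jpg
	entryURL, _ := url.Parse("https://example.org/articles/42")
	fmt.Println(entryURL.ResolveReference(img))
}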
@@ -61,6 +61,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 			entry.URL = cleanedURL
 		}

+		pageBaseURL := ""
 		rewrittenURL := rewriteEntryURL(feed, entry)
 		entryIsNew := store.IsNewEntry(feed.ID, entry.Hash)
 		if feed.Crawler && (entryIsNew || forceRefresh) {
@@ -87,12 +88,16 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 			requestBuilder.IgnoreTLSErrors(feed.AllowSelfSignedCertificates)
 			requestBuilder.DisableHTTP2(feed.DisableHTTP2)

-			content, scraperErr := scraper.ScrapeWebsite(
+			scrapedPageBaseURL, extractedContent, scraperErr := scraper.ScrapeWebsite(
 				requestBuilder,
 				rewrittenURL,
 				feed.ScraperRules,
 			)
+
+			if scrapedPageBaseURL != "" {
+				pageBaseURL = scrapedPageBaseURL
+			}

 			if config.Opts.HasMetricsCollector() {
 				status := "success"
 				if scraperErr != nil {
@@ -109,16 +114,20 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 					slog.String("feed_url", feed.FeedURL),
 					slog.Any("error", scraperErr),
 				)
-			} else if content != "" {
+			} else if extractedContent != "" {
 				// We replace the entry content only if the scraper doesn't return any error.
-				entry.Content = minifyEntryContent(content)
+				entry.Content = minifyEntryContent(extractedContent)
 			}
 		}

 		rewrite.Rewriter(rewrittenURL, entry, feed.RewriteRules)

+		if pageBaseURL == "" {
+			pageBaseURL = rewrittenURL
+		}
+
 		// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered out.
-		entry.Content = sanitizer.Sanitize(rewrittenURL, entry.Content)
+		entry.Content = sanitizer.Sanitize(pageBaseURL, entry.Content)

 		updateEntryReadingTime(store, feed, entry, entryIsNew, user)
 		filteredEntries = append(filteredEntries, entry)
@@ -280,7 +289,7 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
 	requestBuilder.IgnoreTLSErrors(feed.AllowSelfSignedCertificates)
 	requestBuilder.DisableHTTP2(feed.DisableHTTP2)

-	content, scraperErr := scraper.ScrapeWebsite(
+	pageBaseURL, extractedContent, scraperErr := scraper.ScrapeWebsite(
 		requestBuilder,
 		rewrittenEntryURL,
 		feed.ScraperRules,
@@ -298,15 +307,15 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
 		return scraperErr
 	}

-	if content != "" {
-		entry.Content = minifyEntryContent(content)
+	if extractedContent != "" {
+		entry.Content = minifyEntryContent(extractedContent)
 		if user.ShowReadingTime {
 			entry.ReadingTime = readingtime.EstimateReadingTime(entry.Content, user.DefaultReadingSpeed, user.CJKReadingSpeed)
 		}
 	}

 	rewrite.Rewriter(rewrittenEntryURL, entry, entry.Feed.RewriteRules)
-	entry.Content = sanitizer.Sanitize(rewrittenEntryURL, entry.Content)
+	entry.Content = sanitizer.Sanitize(pageBaseURL, entry.Content)

 	return nil
 }
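In plain terms, the processor changes above give sanitizer.Sanitize its base URL by a simple precedence: the <base href> reported by the scraper when there is one, otherwise the rewritten entry URL. The helper below is hypothetical (selectBaseURL does not exist in the project); it only restates that decision as a standalone function:

// selectBaseURL mirrors the precedence applied in the diff above:
// prefer the <base href> discovered by the scraper, fall back to the entry URL.
func selectBaseURL(scrapedPageBaseURL, rewrittenURL string) string {
	if scrapedPageBaseURL != "" {
		return scrapedPageBaseURL
	}
	return rewrittenURL
}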
@@ -12,6 +12,8 @@ import (
 	"regexp"
 	"strings"

+	"miniflux.app/v2/internal/urllib"
+
 	"github.com/PuerkitoBio/goquery"
 	"golang.org/x/net/html"
 )
@@ -69,10 +71,17 @@ func (c candidateList) String() string {
 }

 // ExtractContent returns relevant content.
-func ExtractContent(page io.Reader) (string, error) {
+func ExtractContent(page io.Reader) (baseURL string, extractedContent string, err error) {
 	document, err := goquery.NewDocumentFromReader(page)
 	if err != nil {
-		return "", err
+		return "", "", err
 	}

+	if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
+		hrefValue = strings.TrimSpace(hrefValue)
+		if urllib.IsAbsoluteURL(hrefValue) {
+			baseURL = hrefValue
+		}
+	}
+
 	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
@@ -86,12 +95,13 @@ func ExtractContent(page io.Reader) (string, error) {
 	topCandidate := getTopCandidate(document, candidates)

 	slog.Debug("Readability parsing",
+		slog.String("base_url", baseURL),
 		slog.Any("candidates", candidates),
 		slog.Any("topCandidate", topCandidate),
 	)

-	output := getArticle(topCandidate, candidates)
-	return output, nil
+	extractedContent = getArticle(topCandidate, candidates)
+	return baseURL, extractedContent, nil
 }

 // Now that we have the top candidate, look through its siblings for content that might also be related.
internal/reader/readability/readability_test.go (new file, 102 lines)
@@ -0,0 +1,102 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package readability // import "miniflux.app/v2/internal/reader/readability"
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestBaseURL(t *testing.T) {
+	html := `
+	<html>
+		<head>
+			<base href="https://example.org/ ">
+		</head>
+		<body>
+			<article>
+				Some content
+			</article>
+		</body>
+	</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "https://example.org/" {
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
+	}
+}
+
+func TestMultipleBaseURL(t *testing.T) {
+	html := `
+	<html>
+		<head>
+			<base href="https://example.org/ ">
+			<base href="https://example.com/ ">
+		</head>
+		<body>
+			<article>
+				Some content
+			</article>
+		</body>
+	</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "https://example.org/" {
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
+	}
+}
+
+func TestRelativeBaseURL(t *testing.T) {
+	html := `
+	<html>
+		<head>
+			<base href="/test/ ">
+		</head>
+		<body>
+			<article>
+				Some content
+			</article>
+		</body>
+	</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "" {
+		t.Errorf(`Unexpected base URL, got %q`, baseURL)
+	}
+}
+
+func TestWithoutBaseURL(t *testing.T) {
+	html := `
+	<html>
+		<head>
+			<title>Test</title>
+		</head>
+		<body>
+			<article>
+				Some content
+			</article>
+		</body>
+	</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "" {
+		t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
+	}
+}
@@ -18,72 +18,77 @@ import (
 	"golang.org/x/net/html/charset"
 )

-func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, websiteURL, rules string) (string, error) {
-	responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL))
+func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
+	responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(pageURL))
 	defer responseHandler.Close()

 	if localizedError := responseHandler.LocalizedError(); localizedError != nil {
-		slog.Warn("Unable to scrape website", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
-		return "", localizedError.Error()
+		slog.Warn("Unable to scrape website", slog.String("website_url", pageURL), slog.Any("error", localizedError.Error()))
+		return "", "", localizedError.Error()
 	}

 	if !isAllowedContentType(responseHandler.ContentType()) {
-		return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
+		return "", "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
 	}

 	// The entry URL could redirect somewhere else.
-	sameSite := urllib.Domain(websiteURL) == urllib.Domain(responseHandler.EffectiveURL())
-	websiteURL = responseHandler.EffectiveURL()
+	sameSite := urllib.Domain(pageURL) == urllib.Domain(responseHandler.EffectiveURL())
+	pageURL = responseHandler.EffectiveURL()

 	if rules == "" {
-		rules = getPredefinedScraperRules(websiteURL)
+		rules = getPredefinedScraperRules(pageURL)
 	}

-	var content string
-	var err error
-
 	htmlDocumentReader, err := charset.NewReader(
 		responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
 		responseHandler.ContentType(),
 	)
 	if err != nil {
-		return "", fmt.Errorf("scraper: unable to read HTML document: %v", err)
+		return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
 	}

 	if sameSite && rules != "" {
 		slog.Debug("Extracting content with custom rules",
-			"url", websiteURL,
+			"url", pageURL,
 			"rules", rules,
 		)
-		content, err = findContentUsingCustomRules(htmlDocumentReader, rules)
+		baseURL, extractedContent, err = findContentUsingCustomRules(htmlDocumentReader, rules)
 	} else {
 		slog.Debug("Extracting content with readability",
-			"url", websiteURL,
+			"url", pageURL,
 		)
-		content, err = readability.ExtractContent(htmlDocumentReader)
+		baseURL, extractedContent, err = readability.ExtractContent(htmlDocumentReader)
 	}

-	if err != nil {
-		return "", err
+	if baseURL == "" {
+		baseURL = pageURL
+	} else {
+		slog.Debug("Using base URL from HTML document", "base_url", baseURL)
 	}

-	return content, nil
+	return baseURL, extractedContent, nil
 }

-func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
+func findContentUsingCustomRules(page io.Reader, rules string) (baseURL string, extractedContent string, err error) {
 	document, err := goquery.NewDocumentFromReader(page)
 	if err != nil {
-		return "", err
+		return "", "", err
 	}

-	contents := ""
+	if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
+		hrefValue = strings.TrimSpace(hrefValue)
+		if urllib.IsAbsoluteURL(hrefValue) {
+			baseURL = hrefValue
+		}
+	}
+
 	document.Find(rules).Each(func(i int, s *goquery.Selection) {
 		if content, err := goquery.OuterHtml(s); err == nil {
-			contents += content
+			extractedContent += content
 		}
 	})

-	return contents, nil
+	return baseURL, extractedContent, nil
 }

 func getPredefinedScraperRules(websiteURL string) string {
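For reference, the standalone sketch below combines the two behaviours this hunk wires together: it finds the first <base href> in <head> with the same goquery calls used above (trimmed, absolute URLs only, first element wins), then resolves a relative image source against it. It is illustration only, not project code; url.Parse with IsAbs stands in for the project's urllib.IsAbsoluteURL, and the URLs are invented:

package main

import (
	"fmt"
	"net/url"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	page := `<html><head><base href="https://cdn.example.com/media/ "></head>
	<body><img src="photo.jpg"></body></html>`

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(page))
	if err != nil {
		panic(err)
	}

	// Same lookup as the scraper: first <base> in <head>, whitespace trimmed,
	// only kept when the value is an absolute URL.
	baseURL := ""
	if href, exists := doc.Find("head base").First().Attr("href"); exists {
		href = strings.TrimSpace(href)
		if u, err := url.Parse(href); err == nil && u.IsAbs() {
			baseURL = href
		}
	}

	// Resolve a relative image source against the declared base:
	// prints https://cdn.example.com/media/photo.jpg
	base, _ := url.Parse(baseURL)
	img, _ := url.Parse("photo.jpg")
	fmt.Println(base.ResolveReference(img))
}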
@@ -62,7 +62,7 @@ func TestSelectorRules(t *testing.T) {
 			t.Fatalf(`Unable to read file %q: %v`, filename, err)
 		}

-		actualResult, err := findContentUsingCustomRules(bytes.NewReader(html), rule)
+		_, actualResult, err := findContentUsingCustomRules(bytes.NewReader(html), rule)
 		if err != nil {
 			t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err)
 		}
@@ -73,7 +73,67 @@ func TestSelectorRules(t *testing.T) {
 		}

 		if actualResult != strings.TrimSpace(string(expectedResult)) {
-			t.Errorf(`Unexpected result for %q, got "%s" instead of "%s"`, rule, actualResult, expectedResult)
+			t.Errorf(`Unexpected result for %q, got %q instead of %q`, rule, actualResult, expectedResult)
 		}
 	}
 }
+
+func TestParseBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href="https://example.com/"></head><body><img src="image.jpg"></body></html>`
+	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+	if err != nil {
+		t.Fatalf(`Scraping error: %v`, err)
+	}
+
+	if baseURL != "https://example.com/" {
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.com/"`, baseURL)
+	}
+}
+
+func TestParseMultipleBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href="https://example.com/"><base href="https://example.org/"/></head><body><img src="image.jpg"></body></html>`
+	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+	if err != nil {
+		t.Fatalf(`Scraping error: %v`, err)
+	}
+
+	if baseURL != "https://example.com/" {
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.com/"`, baseURL)
+	}
+}
+
+func TestParseRelativeBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href="/test"></head><body><img src="image.jpg"></body></html>`
+	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+	if err != nil {
+		t.Fatalf(`Scraping error: %v`, err)
+	}
+
+	if baseURL != "" {
+		t.Errorf(`Unexpected base URL, got %q`, baseURL)
+	}
+}
+
+func TestParseEmptyBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head><base href=" "></head><body><img src="image.jpg"></body></html>`
+	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+	if err != nil {
+		t.Fatalf(`Scraping error: %v`, err)
+	}
+
+	if baseURL != "" {
+		t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
+	}
+}
+
+func TestParseMissingBaseURLWithCustomRules(t *testing.T) {
+	html := `<html><head></head><body><img src="image.jpg"></body></html>`
+	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
+	if err != nil {
+		t.Fatalf(`Scraping error: %v`, err)
+	}
+
+	if baseURL != "" {
+		t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
+	}
+}