miniflux-v2/internal/reader/scraper/scraper.go

// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package scraper // import "miniflux.app/v2/internal/reader/scraper"

import (
	"fmt"
	"io"
	"log/slog"
	"strings"

	"miniflux.app/v2/internal/config"
	"miniflux.app/v2/internal/reader/encoding"
	"miniflux.app/v2/internal/reader/fetcher"
	"miniflux.app/v2/internal/reader/readability"
	"miniflux.app/v2/internal/urllib"

	"github.com/PuerkitoBio/goquery"
)

// ScrapeWebsite downloads pageURL with requestBuilder and extracts the content
// of the returned HTML document.
//
// When rules is empty, the predefined scraper rules for the effective URL are
// looked up. The rules are applied only when the effective URL (after any
// redirect) is on the same domain as the requested URL; in every other case
// the readability extractor is used instead.
//
// The returned baseURL is the one reported by the extractor, or the effective
// page URL when the extractor reports none. An error is returned when the
// request fails, the resource is not an HTML document, the charset reader
// cannot be created, or the content extraction itself fails.
func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
	responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(pageURL))
	defer responseHandler.Close()

	if localizedError := responseHandler.LocalizedError(); localizedError != nil {
		slog.Warn("Unable to scrape website", slog.String("website_url", pageURL), slog.Any("error", localizedError.Error()))
		return "", "", localizedError.Error()
	}

	if !isAllowedContentType(responseHandler.ContentType()) {
		return "", "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
	}

	// The entry URL could redirect somewhere else. Remember whether we stayed
	// on the same domain before overwriting pageURL with the final location.
	sameSite := urllib.Domain(pageURL) == urllib.Domain(responseHandler.EffectiveURL())
	pageURL = responseHandler.EffectiveURL()

	if rules == "" {
		rules = getPredefinedScraperRules(pageURL)
	}

	htmlDocumentReader, err := encoding.NewCharsetReader(
		responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
		responseHandler.ContentType(),
	)
	if err != nil {
		// %w keeps the underlying cause inspectable; the message is unchanged.
		return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %w", err)
	}

	if sameSite && rules != "" {
		slog.Debug("Extracting content with custom rules",
			"url", pageURL,
			"rules", rules,
		)
		baseURL, extractedContent, err = findContentUsingCustomRules(htmlDocumentReader, rules)
	} else {
		slog.Debug("Extracting content with readability",
			"url", pageURL,
		)
		baseURL, extractedContent, err = readability.ExtractContent(htmlDocumentReader)
	}

	// Do not report success when the extraction step failed.
	if err != nil {
		return "", "", err
	}

	if baseURL == "" {
		baseURL = pageURL
	} else {
		slog.Debug("Using base URL from HTML document", "base_url", baseURL)
	}

	return baseURL, extractedContent, nil
}

// findContentUsingCustomRules parses page as an HTML document and returns the
// concatenated outer HTML of every element matched by the rules selector.
//
// baseURL is the href attribute of the first "head base" element, provided
// that the whitespace-trimmed value is an absolute URL; otherwise it is empty.
// An error is returned only when the document cannot be parsed.
func findContentUsingCustomRules(page io.Reader, rules string) (baseURL string, extractedContent string, err error) {
	document, err := goquery.NewDocumentFromReader(page)
	if err != nil {
		return "", "", err
	}

	if hrefValue, exists := document.FindMatcher(goquery.Single("head base")).Attr("href"); exists {
		hrefValue = strings.TrimSpace(hrefValue)
		if urllib.IsAbsoluteURL(hrefValue) {
			baseURL = hrefValue
		}
	}

	// Accumulate into a builder: appending with += would copy the whole
	// result again for every matched element.
	var content strings.Builder
	document.Find(rules).Each(func(_ int, s *goquery.Selection) {
		// Best effort: a selection that cannot be rendered is skipped.
		if html, err := goquery.OuterHtml(s); err == nil {
			content.WriteString(html)
		}
	})

	return baseURL, content.String(), nil
}

// getPredefinedScraperRules returns the predefined scraper rules registered
// for the domain of websiteURL, ignoring a leading "www." prefix. It returns
// an empty string when no rules are registered for that domain.
func getPredefinedScraperRules(websiteURL string) string {
	domain := strings.TrimPrefix(urllib.Domain(websiteURL), "www.")

	// Indexing a missing key yields the zero value, i.e. the empty string.
	return predefinedRules[domain]
}

// isAllowedContentType reports whether contentType, compared
// case-insensitively, starts with one of the media types the scraper accepts:
// "text/html" or "application/xhtml+xml".
func isAllowedContentType(contentType string) bool {
	normalized := strings.ToLower(contentType)
	for _, prefix := range [...]string{"text/html", "application/xhtml+xml"} {
		if strings.HasPrefix(normalized, prefix) {
			return true
		}
	}
	return false
}
Replace copyright header with SPDX identifier 2023-06-19 14:42:47 -07:00			`// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.`
			`// SPDX-License-Identifier: Apache-2.0`
Add readability package to fetch original content 2017-12-10 19:01:38 -08:00
Move internal packages to an internal folder For reference: https://go.dev/doc/go1.4#internalpackages 2023-08-10 19:46:45 -07:00			`package scraper // import "miniflux.app/v2/internal/reader/scraper"`
Add readability package to fetch original content 2017-12-10 19:01:38 -08:00
			`import (`
Make sure the scraper parse only HTML documents 2018-01-02 18:32:01 -08:00			`"fmt"`
Add scraper rules 2017-12-10 20:51:04 -08:00			`"io"`
Implement structured logging using log/slog package 2023-09-24 16:32:09 -07:00			`"log/slog"`
Add scraper rules 2017-12-10 20:51:04 -08:00			`"strings"`
Add readability package to fetch original content 2017-12-10 19:01:38 -08:00
Move internal packages to an internal folder For reference: https://go.dev/doc/go1.4#internalpackages 2023-08-10 19:46:45 -07:00			`"miniflux.app/v2/internal/config"`
fix(scraper): avoid encoding issue if charset meta tag is after 1024 bytes 2025-02-15 16:58:06 -08:00			`"miniflux.app/v2/internal/reader/encoding"`
Refactor HTTP Client and LocalizedError packages 2023-10-21 19:50:29 -07:00			`"miniflux.app/v2/internal/reader/fetcher"`
Move internal packages to an internal folder For reference: https://go.dev/doc/go1.4#internalpackages 2023-08-10 19:46:45 -07:00			`"miniflux.app/v2/internal/reader/readability"`
Rename internal url package to avoid overlap with net/url 2023-08-13 19:09:01 -07:00			`"miniflux.app/v2/internal/urllib"`
Use canonical imports 2018-08-24 21:51:50 -07:00
Add scraper rules 2017-12-10 20:51:04 -08:00			`"github.com/PuerkitoBio/goquery"`
Add readability package to fetch original content 2017-12-10 19:01:38 -08:00			`)`

feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {`
			`responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(pageURL))`
Refactor HTTP Client and LocalizedError packages 2023-10-21 19:50:29 -07:00			`defer responseHandler.Close()`
Add readability package to fetch original content 2017-12-10 19:01:38 -08:00
Refactor HTTP Client and LocalizedError packages 2023-10-21 19:50:29 -07:00			`if localizedError := responseHandler.LocalizedError(); localizedError != nil {`
feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`slog.Warn("Unable to scrape website", slog.String("website_url", pageURL), slog.Any("error", localizedError.Error()))`
			`return "", "", localizedError.Error()`
Make sure the scraper parse only HTML documents 2018-01-02 18:32:01 -08:00			`}`

Refactor HTTP Client and LocalizedError packages 2023-10-21 19:50:29 -07:00			`if !isAllowedContentType(responseHandler.ContentType()) {`
feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`return "", "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())`
Add readability package to fetch original content 2017-12-10 19:01:38 -08:00			`}`

Make sure the scraper parse only HTML documents 2018-01-02 18:32:01 -08:00			`// The entry URL could redirect somewhere else.`
feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`sameSite := urllib.Domain(pageURL) == urllib.Domain(responseHandler.EffectiveURL())`
			`pageURL = responseHandler.EffectiveURL()`
Improve content scraper 2017-12-13 21:30:40 -08:00
Add scraper rules 2017-12-10 20:51:04 -08:00			`if rules == "" {`
feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`rules = getPredefinedScraperRules(pageURL)`
Add scraper rules 2017-12-10 20:51:04 -08:00			`}`

fix(scraper): avoid encoding issue if charset meta tag is after 1024 bytes 2025-02-15 16:58:06 -08:00			`htmlDocumentReader, err := encoding.NewCharsetReader(`
Regression: ensure all HTML documents are encoded in UTF-8 Fixes #2196 2023-12-01 16:27:18 -08:00			`responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),`
Inline a one-liner function No need to expose a symbol for this. 2024-03-20 20:44:41 +01:00			`responseHandler.ContentType(),`
Regression: ensure all HTML documents are encoded in UTF-8 Fixes #2196 2023-12-01 16:27:18 -08:00			`)`
fix(scraper): avoid encoding issue if charset meta tag is after 1024 bytes 2025-02-15 16:58:06 -08:00
Regression: ensure all HTML documents are encoded in UTF-8 Fixes #2196 2023-12-01 16:27:18 -08:00			`if err != nil {`
feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)`
Regression: ensure all HTML documents are encoded in UTF-8 Fixes #2196 2023-12-01 16:27:18 -08:00			`}`

scraper follow the only link * in some cases, what the scraper got is only a landing page, user can use scraper rules to extract the link of the landing page and follow it * it also fix the wrong scrape rule apply when the server redirects it to another host 2021-12-08 16:46:33 +08:00			`if sameSite && rules != "" {`
Implement structured logging using log/slog package 2023-09-24 16:32:09 -07:00			`slog.Debug("Extracting content with custom rules",`
feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`"url", pageURL,`
Implement structured logging using log/slog package 2023-09-24 16:32:09 -07:00			`"rules", rules,`
			`)`
feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`baseURL, extractedContent, err = findContentUsingCustomRules(htmlDocumentReader, rules)`
Add scraper rules 2017-12-10 20:51:04 -08:00			`} else {`
Implement structured logging using log/slog package 2023-09-24 16:32:09 -07:00			`slog.Debug("Extracting content with readability",`
feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`"url", pageURL,`
Implement structured logging using log/slog package 2023-09-24 16:32:09 -07:00			`)`
feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`baseURL, extractedContent, err = readability.ExtractContent(htmlDocumentReader)`
Add scraper rules 2017-12-10 20:51:04 -08:00			`}`

feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`if baseURL == "" {`
			`baseURL = pageURL`
			`} else {`
			`slog.Debug("Using base URL from HTML document", "base_url", baseURL)`
Add readability package to fetch original content 2017-12-10 19:01:38 -08:00			`}`

feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`return baseURL, extractedContent, nil`
Add readability package to fetch original content 2017-12-10 19:01:38 -08:00			`}`
Add scraper rules 2017-12-10 20:51:04 -08:00
feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`func findContentUsingCustomRules(page io.Reader, rules string) (baseURL string, extractedContent string, err error) {`
Add scraper rules 2017-12-10 20:51:04 -08:00			`document, err := goquery.NewDocumentFromReader(page)`
			`if err != nil {`
feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`return "", "", err`
			`}`

refactor: use a better construct than `doc.Find(…).First()` As mentioned in goquery's documentation (https://pkg.go.dev/github.com/PuerkitoBio/goquery#Single): > By default, Selection.Find and other functions that accept a selector string to select nodes will use all matches corresponding to that selector. By using the Matcher returned by Single, at most the first match will be selected. > > The one using Single is optimized to be potentially much faster on large documents. 2024-12-12 03:40:55 +00:00			`if hrefValue, exists := document.FindMatcher(goquery.Single("head base")).Attr("href"); exists {`
feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`hrefValue = strings.TrimSpace(hrefValue)`
			`if urllib.IsAbsoluteURL(hrefValue) {`
			`baseURL = hrefValue`
			`}`
Add scraper rules 2017-12-10 20:51:04 -08:00			`}`

			`document.Find(rules).Each(func(i int, s *goquery.Selection) {`
Improve a bit internal/reader/scraper/scraper.go - make findContentUsingCustomRules' more idiomatic, since in golang a function returning an error might return garbage in other parameter. Moreover, ignoring errors is bad practise. - getPredefinedScraperRules is now running in constant-time, instead of iterating on a list with around 50 items in it. 2024-02-26 17:37:49 +01:00			`if content, err := goquery.OuterHtml(s); err == nil {`
feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`extractedContent += content`
Improve a bit internal/reader/scraper/scraper.go - make findContentUsingCustomRules' more idiomatic, since in golang a function returning an error might return garbage in other parameter. Moreover, ignoring errors is bad practise. - getPredefinedScraperRules is now running in constant-time, instead of iterating on a list with around 50 items in it. 2024-02-26 17:37:49 +01:00			`}`
Add scraper rules 2017-12-10 20:51:04 -08:00			`})`

feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`return baseURL, extractedContent, nil`
Add scraper rules 2017-12-10 20:51:04 -08:00			`}`

			`func getPredefinedScraperRules(websiteURL string) string {`
Rename internal url package to avoid overlap with net/url 2023-08-13 19:09:01 -07:00			`urlDomain := urllib.Domain(websiteURL)`
Improve a bit internal/reader/scraper/scraper.go - make findContentUsingCustomRules' more idiomatic, since in golang a function returning an error might return garbage in other parameter. Moreover, ignoring errors is bad practise. - getPredefinedScraperRules is now running in constant-time, instead of iterating on a list with around 50 items in it. 2024-02-26 17:37:49 +01:00			`urlDomain = strings.TrimPrefix(urlDomain, "www.")`
Add scraper rules 2017-12-10 20:51:04 -08:00
Improve a bit internal/reader/scraper/scraper.go - make findContentUsingCustomRules' more idiomatic, since in golang a function returning an error might return garbage in other parameter. Moreover, ignoring errors is bad practise. - getPredefinedScraperRules is now running in constant-time, instead of iterating on a list with around 50 items in it. 2024-02-26 17:37:49 +01:00			`if rules, ok := predefinedRules[urlDomain]; ok {`
			`return rules`
Add scraper rules 2017-12-10 20:51:04 -08:00			`}`
			`return ""`
			`}`
Allow the scraper to parse XHTML documents Only "text/html" was authorized before. 2018-11-03 13:44:13 -07:00
Add Prometheus exporter 2020-09-27 16:01:06 -07:00			`func isAllowedContentType(contentType string) bool {`
Allow the scraper to parse XHTML documents Only "text/html" was authorized before. 2018-11-03 13:44:13 -07:00			`contentType = strings.ToLower(contentType)`
			`return strings.HasPrefix(contentType, "text/html") \|\|`
			`strings.HasPrefix(contentType, "application/xhtml+xml")`
			`}`