1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-11 17:51:01 +00:00

Refactor HTTP Client and LocalizedError packages

This commit is contained in:
Frédéric Guillot 2023-10-21 19:50:29 -07:00
parent 120aabfbce
commit 14e25ab9fe
104 changed files with 1277 additions and 10672 deletions

View file

@@ -4,67 +4,54 @@
package scraper // import "miniflux.app/v2/internal/reader/scraper"
import (
"errors"
"fmt"
"io"
"log/slog"
"strings"
"miniflux.app/v2/internal/config"
"miniflux.app/v2/internal/http/client"
"miniflux.app/v2/internal/reader/fetcher"
"miniflux.app/v2/internal/reader/readability"
"miniflux.app/v2/internal/urllib"
"github.com/PuerkitoBio/goquery"
)
// Fetch downloads a web page and returns relevant contents.
func Fetch(websiteURL, rules, userAgent string, cookie string, allowSelfSignedCertificates, useProxy bool) (string, error) {
clt := client.NewClientWithConfig(websiteURL, config.Opts)
clt.WithUserAgent(userAgent)
clt.WithCookie(cookie)
if useProxy {
clt.WithProxy()
}
clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, websiteURL, rules string) (string, error) {
responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL))
defer responseHandler.Close()
response, err := clt.Get()
if err != nil {
return "", err
if localizedError := responseHandler.LocalizedError(); localizedError != nil {
slog.Warn("Unable to scrape website", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
return "", localizedError.Error()
}
if response.HasServerFailure() {
return "", errors.New("scraper: unable to download web page")
}
if !isAllowedContentType(response.ContentType) {
return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)
}
if err = response.EnsureUnicodeBody(); err != nil {
return "", err
if !isAllowedContentType(responseHandler.ContentType()) {
return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
}
// The entry URL could redirect somewhere else.
sameSite := urllib.Domain(websiteURL) == urllib.Domain(response.EffectiveURL)
websiteURL = response.EffectiveURL
sameSite := urllib.Domain(websiteURL) == urllib.Domain(responseHandler.EffectiveURL())
websiteURL = responseHandler.EffectiveURL()
if rules == "" {
rules = getPredefinedScraperRules(websiteURL)
}
var content string
var err error
if sameSite && rules != "" {
slog.Debug("Extracting content with custom rules",
"url", websiteURL,
"rules", rules,
)
content, err = scrapContent(response.Body, rules)
content, err = findContentUsingCustomRules(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()), rules)
} else {
slog.Debug("Extracting content with readability",
"url", websiteURL,
)
content, err = readability.ExtractContent(response.Body)
content, err = readability.ExtractContent(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()))
}
if err != nil {
@@ -74,7 +61,7 @@ func Fetch(websiteURL, rules, userAgent string, cookie string, allowSelfSignedCe
return content, nil
}
func scrapContent(page io.Reader, rules string) (string, error) {
func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
document, err := goquery.NewDocumentFromReader(page)
if err != nil {
return "", err