1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-11 17:51:01 +00:00

fix(scraper): avoid encoding issue if charset meta tag is after 1024 bytes

This commit is contained in:
Frédéric Guillot 2025-02-15 16:58:06 -08:00
parent af1f966250
commit 6eedf4111f
12 changed files with 352 additions and 10 deletions

View file

@@ -10,12 +10,12 @@ import (
"strings"
"miniflux.app/v2/internal/config"
"miniflux.app/v2/internal/reader/encoding"
"miniflux.app/v2/internal/reader/fetcher"
"miniflux.app/v2/internal/reader/readability"
"miniflux.app/v2/internal/urllib"
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html/charset"
)
func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
@@ -39,10 +39,11 @@ func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string
rules = getPredefinedScraperRules(pageURL)
}
- htmlDocumentReader, err := charset.NewReader(
+ htmlDocumentReader, err := encoding.NewCharsetReader(
responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
responseHandler.ContentType(),
)
if err != nil {
return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
}