1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00

Regression: ensure all HTML documents are encoded in UTF-8

Fixes #2196
This commit is contained in:
Frédéric Guillot 2023-12-01 16:27:18 -08:00
parent f8b40085cd
commit d0f99cee1a
12 changed files with 68 additions and 36 deletions

View file

@ -29,7 +29,7 @@ func Parse(baseURL string, r io.Reader) (*model.Feed, error) {
rawFeed = new(atom10Feed) rawFeed = new(atom10Feed)
} }
if err := xml_decoder.NewDecoder(&buf).Decode(rawFeed); err != nil { if err := xml_decoder.NewXMLDecoder(&buf).Decode(rawFeed); err != nil {
return nil, fmt.Errorf("atom: unable to parse feed: %w", err) return nil, fmt.Errorf("atom: unable to parse feed: %w", err)
} }
@ -37,7 +37,7 @@ func Parse(baseURL string, r io.Reader) (*model.Feed, error) {
} }
func getAtomFeedVersion(data io.Reader) string { func getAtomFeedVersion(data io.Reader) string {
decoder := xml_decoder.NewDecoder(data) decoder := xml_decoder.NewXMLDecoder(data)
for { for {
token, _ := decoder.Token() token, _ := decoder.Token()
if token == nil { if token == nil {

View file

@ -22,7 +22,7 @@ import (
// - Feeds with encoding specified in both places // - Feeds with encoding specified in both places
// - Feeds with encoding specified only in XML document and not in HTTP header // - Feeds with encoding specified only in XML document and not in HTTP header
// - Feeds with wrong encoding defined and already in UTF-8 // - Feeds with wrong encoding defined and already in UTF-8
func CharsetReader(label string, input io.Reader) (io.Reader, error) { func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
buffer, _ := io.ReadAll(input) buffer, _ := io.ReadAll(input)
r := bytes.NewReader(buffer) r := bytes.NewReader(buffer)
@ -33,5 +33,10 @@ func CharsetReader(label string, input io.Reader) (io.Reader, error) {
} }
// Transform document to UTF-8 from the specified encoding in XML prolog. // Transform document to UTF-8 from the specified encoding in XML prolog.
return charset.NewReaderLabel(label, r) return charset.NewReaderLabel(charsetLabel, r)
}
// CharsetReaderFromContentType is used when the encoding is not specified for the input document.
func CharsetReaderFromContentType(contentType string, input io.Reader) (io.Reader, error) {
return charset.NewReader(input, contentType)
} }

View file

@ -14,6 +14,7 @@ import (
"miniflux.app/v2/internal/config" "miniflux.app/v2/internal/config"
"miniflux.app/v2/internal/crypto" "miniflux.app/v2/internal/crypto"
"miniflux.app/v2/internal/model" "miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/reader/encoding"
"miniflux.app/v2/internal/reader/fetcher" "miniflux.app/v2/internal/reader/fetcher"
"miniflux.app/v2/internal/urllib" "miniflux.app/v2/internal/urllib"
@ -110,7 +111,10 @@ func (f *IconFinder) FetchIconsFromHTMLDocument() (*model.Icon, error) {
return nil, fmt.Errorf("icon: unable to download website index page: %w", localizedError.Error()) return nil, fmt.Errorf("icon: unable to download website index page: %w", localizedError.Error())
} }
iconURLs, err := findIconURLsFromHTMLDocument(responseHandler.Body(config.Opts.HTTPClientMaxBodySize())) iconURLs, err := findIconURLsFromHTMLDocument(
responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
responseHandler.ContentType(),
)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -178,7 +182,7 @@ func (f *IconFinder) DownloadIcon(iconURL string) (*model.Icon, error) {
return icon, nil return icon, nil
} }
func findIconURLsFromHTMLDocument(body io.Reader) ([]string, error) { func findIconURLsFromHTMLDocument(body io.Reader, contentType string) ([]string, error) {
queries := []string{ queries := []string{
"link[rel='shortcut icon']", "link[rel='shortcut icon']",
"link[rel='Shortcut Icon']", "link[rel='Shortcut Icon']",
@ -186,7 +190,12 @@ func findIconURLsFromHTMLDocument(body io.Reader) ([]string, error) {
"link[rel='icon']", "link[rel='icon']",
} }
doc, err := goquery.NewDocumentFromReader(body) htmlDocumentReader, err := encoding.CharsetReaderFromContentType(contentType, body)
if err != nil {
return nil, fmt.Errorf("icon: unable to create charset reader: %w", err)
}
doc, err := goquery.NewDocumentFromReader(htmlDocumentReader)
if err != nil { if err != nil {
return nil, fmt.Errorf("icon: unable to read document: %v", err) return nil, fmt.Errorf("icon: unable to read document: %v", err)
} }

View file

@ -112,7 +112,7 @@ func TestParseDocumentWithWhitespaceIconURL(t *testing.T) {
/static/img/favicon.ico /static/img/favicon.ico
">` ">`
iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html)) iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html), "text/html")
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }

View file

@ -30,7 +30,7 @@ func DetectFeedFormat(r io.ReadSeeker) string {
} }
r.Seek(0, io.SeekStart) r.Seek(0, io.SeekStart)
decoder := rxml.NewDecoder(r) decoder := rxml.NewXMLDecoder(r)
for { for {
token, _ := decoder.Token() token, _ := decoder.Token()

View file

@ -14,7 +14,7 @@ import (
// Parse returns a normalized feed struct from a RDF feed. // Parse returns a normalized feed struct from a RDF feed.
func Parse(baseURL string, data io.Reader) (*model.Feed, error) { func Parse(baseURL string, data io.Reader) (*model.Feed, error) {
feed := new(rdfFeed) feed := new(rdfFeed)
if err := xml.NewDecoder(data).Decode(feed); err != nil { if err := xml.NewXMLDecoder(data).Decode(feed); err != nil {
return nil, fmt.Errorf("rdf: unable to parse feed: %w", err) return nil, fmt.Errorf("rdf: unable to parse feed: %w", err)
} }

View file

@ -14,7 +14,7 @@ import (
// Parse returns a normalized feed struct from a RSS feed. // Parse returns a normalized feed struct from a RSS feed.
func Parse(baseURL string, data io.Reader) (*model.Feed, error) { func Parse(baseURL string, data io.Reader) (*model.Feed, error) {
feed := new(rssFeed) feed := new(rssFeed)
if err := xml.NewDecoder(data).Decode(feed); err != nil { if err := xml.NewXMLDecoder(data).Decode(feed); err != nil {
return nil, fmt.Errorf("rss: unable to parse feed: %w", err) return nil, fmt.Errorf("rss: unable to parse feed: %w", err)
} }
return feed.Transform(baseURL), nil return feed.Transform(baseURL), nil

View file

@ -10,6 +10,7 @@ import (
"strings" "strings"
"miniflux.app/v2/internal/config" "miniflux.app/v2/internal/config"
"miniflux.app/v2/internal/reader/encoding"
"miniflux.app/v2/internal/reader/fetcher" "miniflux.app/v2/internal/reader/fetcher"
"miniflux.app/v2/internal/reader/readability" "miniflux.app/v2/internal/reader/readability"
"miniflux.app/v2/internal/urllib" "miniflux.app/v2/internal/urllib"
@ -41,17 +42,25 @@ func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, websiteURL, rules str
var content string var content string
var err error var err error
htmlDocumentReader, err := encoding.CharsetReaderFromContentType(
responseHandler.ContentType(),
responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
)
if err != nil {
return "", fmt.Errorf("scraper: unable to read HTML document: %v", err)
}
if sameSite && rules != "" { if sameSite && rules != "" {
slog.Debug("Extracting content with custom rules", slog.Debug("Extracting content with custom rules",
"url", websiteURL, "url", websiteURL,
"rules", rules, "rules", rules,
) )
content, err = findContentUsingCustomRules(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()), rules) content, err = findContentUsingCustomRules(htmlDocumentReader, rules)
} else { } else {
slog.Debug("Extracting content with readability", slog.Debug("Extracting content with readability",
"url", websiteURL, "url", websiteURL,
) )
content, err = readability.ExtractContent(responseHandler.Body(config.Opts.HTTPClientMaxBodySize())) content, err = readability.ExtractContent(htmlDocumentReader)
} }
if err != nil { if err != nil {

View file

@ -14,6 +14,7 @@ import (
"miniflux.app/v2/internal/integration/rssbridge" "miniflux.app/v2/internal/integration/rssbridge"
"miniflux.app/v2/internal/locale" "miniflux.app/v2/internal/locale"
"miniflux.app/v2/internal/model" "miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/reader/encoding"
"miniflux.app/v2/internal/reader/fetcher" "miniflux.app/v2/internal/reader/fetcher"
"miniflux.app/v2/internal/reader/parser" "miniflux.app/v2/internal/reader/parser"
"miniflux.app/v2/internal/urllib" "miniflux.app/v2/internal/urllib"
@ -98,8 +99,11 @@ func (f *SubscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string)
} }
// Step 4) Parse web page to find feeds from HTML meta tags. // Step 4) Parse web page to find feeds from HTML meta tags.
slog.Debug("Try to detect feeds from HTML meta tags", slog.String("website_url", websiteURL)) slog.Debug("Try to detect feeds from HTML meta tags",
subscriptions, localizedError = f.FindSubscriptionsFromWebPage(websiteURL, bytes.NewReader(responseBody)) slog.String("website_url", websiteURL),
slog.String("content_type", responseHandler.ContentType()),
)
subscriptions, localizedError = f.FindSubscriptionsFromWebPage(websiteURL, responseHandler.ContentType(), bytes.NewReader(responseBody))
if localizedError != nil { if localizedError != nil {
return nil, localizedError return nil, localizedError
} }
@ -138,7 +142,7 @@ func (f *SubscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string)
return nil, nil return nil, nil
} }
func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL string, body io.Reader) (Subscriptions, *locale.LocalizedErrorWrapper) { func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL, contentType string, body io.Reader) (Subscriptions, *locale.LocalizedErrorWrapper) {
queries := map[string]string{ queries := map[string]string{
"link[type='application/rss+xml']": parser.FormatRSS, "link[type='application/rss+xml']": parser.FormatRSS,
"link[type='application/atom+xml']": parser.FormatAtom, "link[type='application/atom+xml']": parser.FormatAtom,
@ -146,7 +150,12 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL string, bod
"link[type='application/feed+json']": parser.FormatJSON, "link[type='application/feed+json']": parser.FormatJSON,
} }
doc, err := goquery.NewDocumentFromReader(body) htmlDocumentReader, err := encoding.CharsetReaderFromContentType(contentType, body)
if err != nil {
return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err)
}
doc, err := goquery.NewDocumentFromReader(htmlDocumentReader)
if err != nil { if err != nil {
return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err) return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err)
} }

View file

@ -40,7 +40,7 @@ func TestParseWebPageWithRssFeed(t *testing.T) {
</body> </body>
</html>` </html>`
subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage)) subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
if err != nil { if err != nil {
t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err) t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
} }
@ -73,7 +73,7 @@ func TestParseWebPageWithAtomFeed(t *testing.T) {
</body> </body>
</html>` </html>`
subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage)) subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
if err != nil { if err != nil {
t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err) t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
} }
@ -106,7 +106,7 @@ func TestParseWebPageWithJSONFeed(t *testing.T) {
</body> </body>
</html>` </html>`
subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage)) subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
if err != nil { if err != nil {
t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err) t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
} }
@ -139,7 +139,7 @@ func TestParseWebPageWithOldJSONFeedMimeType(t *testing.T) {
</body> </body>
</html>` </html>`
subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage)) subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
if err != nil { if err != nil {
t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err) t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
} }
@ -172,7 +172,7 @@ func TestParseWebPageWithRelativeFeedURL(t *testing.T) {
</body> </body>
</html>` </html>`
subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage)) subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
if err != nil { if err != nil {
t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err) t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
} }
@ -205,7 +205,7 @@ func TestParseWebPageWithEmptyTitle(t *testing.T) {
</body> </body>
</html>` </html>`
subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage)) subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
if err != nil { if err != nil {
t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err) t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
} }
@ -239,7 +239,7 @@ func TestParseWebPageWithMultipleFeeds(t *testing.T) {
</body> </body>
</html>` </html>`
subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage)) subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
if err != nil { if err != nil {
t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err) t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
} }
@ -261,7 +261,7 @@ func TestParseWebPageWithDuplicatedFeeds(t *testing.T) {
</body> </body>
</html>` </html>`
subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage)) subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
if err != nil { if err != nil {
t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err) t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
} }
@ -294,7 +294,7 @@ func TestParseWebPageWithEmptyFeedURL(t *testing.T) {
</body> </body>
</html>` </html>`
subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage)) subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
if err != nil { if err != nil {
t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err) t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
} }
@ -315,7 +315,7 @@ func TestParseWebPageWithNoHref(t *testing.T) {
</body> </body>
</html>` </html>`
subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage)) subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
if err != nil { if err != nil {
t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err) t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
} }

View file

@ -13,8 +13,8 @@ import (
"miniflux.app/v2/internal/reader/encoding" "miniflux.app/v2/internal/reader/encoding"
) )
// NewDecoder returns an XML decoder that filters illegal characters. // NewXMLDecoder returns an XML decoder that filters illegal characters.
func NewDecoder(data io.Reader) *xml.Decoder { func NewXMLDecoder(data io.Reader) *xml.Decoder {
var decoder *xml.Decoder var decoder *xml.Decoder
buffer, _ := io.ReadAll(data) buffer, _ := io.ReadAll(data)
enc := procInst("encoding", string(buffer)) enc := procInst("encoding", string(buffer))
@ -36,7 +36,7 @@ func NewDecoder(data io.Reader) *xml.Decoder {
} }
rawData, err := io.ReadAll(utf8Reader) rawData, err := io.ReadAll(utf8Reader)
if err != nil { if err != nil {
return nil, fmt.Errorf("Unable to read data: %q", err) return nil, fmt.Errorf("encoding: unable to read data: %w", err)
} }
filteredBytes := bytes.Map(filterValidXMLChar, rawData) filteredBytes := bytes.Map(filterValidXMLChar, rawData)
return bytes.NewReader(filteredBytes), nil return bytes.NewReader(filteredBytes), nil

View file

@ -10,7 +10,7 @@ import (
"testing" "testing"
) )
func TestUTF8WithIllegalCharacters(t *testing.T) { func TestXMLDocumentWithIllegalUnicodeCharacters(t *testing.T) {
type myxml struct { type myxml struct {
XMLName xml.Name `xml:"rss"` XMLName xml.Name `xml:"rss"`
Version string `xml:"version,attr"` Version string `xml:"version,attr"`
@ -23,7 +23,7 @@ func TestUTF8WithIllegalCharacters(t *testing.T) {
var x myxml var x myxml
decoder := NewDecoder(reader) decoder := NewXMLDecoder(reader)
err := decoder.Decode(&x) err := decoder.Decode(&x)
if err != nil { if err != nil {
t.Error(err) t.Error(err)
@ -34,7 +34,7 @@ func TestUTF8WithIllegalCharacters(t *testing.T) {
} }
} }
func TestWindows251WithIllegalCharacters(t *testing.T) { func TestXMLDocumentWindows251EncodedWithIllegalCharacters(t *testing.T) {
type myxml struct { type myxml struct {
XMLName xml.Name `xml:"rss"` XMLName xml.Name `xml:"rss"`
Version string `xml:"version,attr"` Version string `xml:"version,attr"`
@ -47,7 +47,7 @@ func TestWindows251WithIllegalCharacters(t *testing.T) {
var x myxml var x myxml
decoder := NewDecoder(reader) decoder := NewXMLDecoder(reader)
err := decoder.Decode(&x) err := decoder.Decode(&x)
if err != nil { if err != nil {
t.Error(err) t.Error(err)
@ -58,7 +58,7 @@ func TestWindows251WithIllegalCharacters(t *testing.T) {
} }
} }
func TestIllegalEncodingField(t *testing.T) { func TestXMLDocumentWithIncorrectEncodingField(t *testing.T) {
type myxml struct { type myxml struct {
XMLName xml.Name `xml:"rss"` XMLName xml.Name `xml:"rss"`
Version string `xml:"version,attr"` Version string `xml:"version,attr"`
@ -71,7 +71,7 @@ func TestIllegalEncodingField(t *testing.T) {
var x myxml var x myxml
decoder := NewDecoder(reader) decoder := NewXMLDecoder(reader)
err := decoder.Decode(&x) err := decoder.Decode(&x)
if err != nil { if err != nil {
t.Error(err) t.Error(err)