1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00

fix(scraper): avoid encoding issue if charset meta tag is after 1024 bytes

This commit is contained in:
Frédéric Guillot 2025-02-15 16:58:06 -08:00
parent af1f966250
commit 6eedf4111f
12 changed files with 352 additions and 10 deletions

View file

@ -5,6 +5,7 @@ package encoding // import "miniflux.app/v2/internal/reader/encoding"
import (
"bytes"
"fmt"
"io"
"unicode/utf8"
@ -23,7 +24,11 @@ import (
// - Feeds with encoding specified only in XML document and not in HTTP header
// - Feeds with wrong encoding defined and already in UTF-8
func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
buffer, _ := io.ReadAll(input)
buffer, err := io.ReadAll(input)
if err != nil {
return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
}
r := bytes.NewReader(buffer)
// The document is already UTF-8, do not do anything (avoid double-encoding).
@ -35,3 +40,24 @@ func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
// Transform document to UTF-8 from the specified encoding in XML prolog.
return charset.NewReaderLabel(charsetLabel, r)
}
// NewCharsetReader returns an io.Reader that yields the content of r as UTF-8.
//
// The whole input is read into memory first. When those bytes are already
// valid UTF-8 they are handed back unchanged; otherwise the decoding is
// delegated to charset.NewReader together with the Content-Type header value.
//
// An error is returned when the input cannot be read, or when
// charset.NewReader reports one.
func NewCharsetReader(r io.Reader, contentType string) (io.Reader, error) {
	content, err := io.ReadAll(r)
	if err != nil {
		return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
	}

	if !utf8.Valid(content) {
		// The bytes are not UTF-8: convert them using the encoding derived from
		// the Content-Type header.
		// Only the first 1024 bytes are inspected to detect the encoding. When
		// the <meta charset> tag is not located within that window,
		// charset.DetermineEncoding falls back to "windows-1252", which leads to
		// encoding issues.
		// See https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
		return charset.NewReader(bytes.NewReader(content), contentType)
	}

	// Already UTF-8: nothing to convert.
	return bytes.NewReader(content), nil
}

View file

@ -4,6 +4,7 @@
package encoding // import "miniflux.app/v2/internal/reader/encoding"
import (
"bytes"
"io"
"os"
"testing"
@ -31,6 +32,11 @@ func TestCharsetReaderWithUTF8(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
expectedUnicodeString := "Café"
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
}
}
func TestCharsetReaderWithISO88591(t *testing.T) {
@ -54,6 +60,11 @@ func TestCharsetReaderWithISO88591(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
expectedUnicodeString := "Café"
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
}
}
func TestCharsetReaderWithWindows1252(t *testing.T) {
@ -77,6 +88,11 @@ func TestCharsetReaderWithWindows1252(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
expectedUnicodeString := "Euro €"
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
}
}
func TestCharsetReaderWithInvalidProlog(t *testing.T) {
@ -100,6 +116,11 @@ func TestCharsetReaderWithInvalidProlog(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
expectedUnicodeString := "Café"
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
}
}
func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
@ -123,6 +144,11 @@ func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
expectedUnicodeString := "Café"
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
}
}
func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
@ -146,4 +172,177 @@ func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
expectedUnicodeString := "Euro €"
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
}
}
// TestNewReaderWithUTF8Document feeds the testdata/utf8.html fixture to
// NewCharsetReader with a Content-Type that declares charset=UTF-8 and checks
// that the output is valid UTF-8 containing the expected accented string.
func TestNewReaderWithUTF8Document(t *testing.T) {
	file := "testdata/utf8.html"

	f, err := os.Open(file)
	if err != nil {
		t.Fatalf("Unable to open file: %v", err)
	}
	// Release the file descriptor when the test returns.
	defer f.Close()

	reader, err := NewCharsetReader(f, "text/html; charset=UTF-8")
	if err != nil {
		t.Fatalf("Unable to create reader: %v", err)
	}

	data, err := io.ReadAll(reader)
	if err != nil {
		t.Fatalf("Unable to read data: %v", err)
	}

	if !utf8.Valid(data) {
		t.Fatalf("Data is not valid UTF-8")
	}

	expectedUnicodeString := "Café"
	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
	}
}
// TestNewReaderWithUTF8DocumentAndNoContentEncoding feeds the
// testdata/utf8.html fixture to NewCharsetReader with a Content-Type that
// carries no charset parameter and checks that the output is valid UTF-8
// containing the expected accented string.
func TestNewReaderWithUTF8DocumentAndNoContentEncoding(t *testing.T) {
	file := "testdata/utf8.html"

	f, err := os.Open(file)
	if err != nil {
		t.Fatalf("Unable to open file: %v", err)
	}
	// Release the file descriptor when the test returns.
	defer f.Close()

	reader, err := NewCharsetReader(f, "text/html")
	if err != nil {
		t.Fatalf("Unable to create reader: %v", err)
	}

	data, err := io.ReadAll(reader)
	if err != nil {
		t.Fatalf("Unable to read data: %v", err)
	}

	if !utf8.Valid(data) {
		t.Fatalf("Data is not valid UTF-8")
	}

	expectedUnicodeString := "Café"
	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
	}
}
// TestNewReaderWithISO88591Document feeds the testdata/iso-8859-1.xml fixture
// to NewCharsetReader with a Content-Type that declares charset=ISO-8859-1 and
// checks that the output is valid UTF-8 containing the expected accented
// string.
func TestNewReaderWithISO88591Document(t *testing.T) {
	file := "testdata/iso-8859-1.xml"

	f, err := os.Open(file)
	if err != nil {
		t.Fatalf("Unable to open file: %v", err)
	}
	// Release the file descriptor when the test returns.
	defer f.Close()

	reader, err := NewCharsetReader(f, "text/html; charset=ISO-8859-1")
	if err != nil {
		t.Fatalf("Unable to create reader: %v", err)
	}

	data, err := io.ReadAll(reader)
	if err != nil {
		t.Fatalf("Unable to read data: %v", err)
	}

	if !utf8.Valid(data) {
		t.Fatalf("Data is not valid UTF-8")
	}

	expectedUnicodeString := "Café"
	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
	}
}
// TestNewReaderWithISO88591DocumentAndNoContentType feeds the
// testdata/iso-8859-1.xml fixture to NewCharsetReader with an empty
// Content-Type and checks that the output is valid UTF-8 containing the
// expected accented string.
func TestNewReaderWithISO88591DocumentAndNoContentType(t *testing.T) {
	file := "testdata/iso-8859-1.xml"

	f, err := os.Open(file)
	if err != nil {
		t.Fatalf("Unable to open file: %v", err)
	}
	// Release the file descriptor when the test returns.
	defer f.Close()

	reader, err := NewCharsetReader(f, "")
	if err != nil {
		t.Fatalf("Unable to create reader: %v", err)
	}

	data, err := io.ReadAll(reader)
	if err != nil {
		t.Fatalf("Unable to read data: %v", err)
	}

	if !utf8.Valid(data) {
		t.Fatalf("Data is not valid UTF-8")
	}

	expectedUnicodeString := "Café"
	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
	}
}
// TestNewReaderWithISO88591DocumentWithMetaAfter1024Bytes feeds the
// testdata/iso-8859-1-meta-after-1024.html fixture to NewCharsetReader with a
// Content-Type that carries no charset parameter and checks that the output is
// valid UTF-8 containing the expected accented string.
//
// NOTE(review): per its name, the fixture is expected to place its
// <meta charset> tag beyond the first 1024 bytes — confirm against the file.
func TestNewReaderWithISO88591DocumentWithMetaAfter1024Bytes(t *testing.T) {
	file := "testdata/iso-8859-1-meta-after-1024.html"

	f, err := os.Open(file)
	if err != nil {
		t.Fatalf("Unable to open file: %v", err)
	}
	// Release the file descriptor when the test returns.
	defer f.Close()

	reader, err := NewCharsetReader(f, "text/html")
	if err != nil {
		t.Fatalf("Unable to create reader: %v", err)
	}

	data, err := io.ReadAll(reader)
	if err != nil {
		t.Fatalf("Unable to read data: %v", err)
	}

	if !utf8.Valid(data) {
		t.Fatalf("Data is not valid UTF-8")
	}

	expectedUnicodeString := "Café"
	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
	}
}
// TestNewReaderWithUTF8DocumentWithMetaAfter1024Bytes feeds the
// testdata/utf8-meta-after-1024.html fixture to NewCharsetReader with a
// Content-Type that carries no charset parameter and checks that the output is
// valid UTF-8 containing the expected accented string.
//
// NOTE(review): per its name, the fixture is expected to place its
// <meta charset> tag beyond the first 1024 bytes — confirm against the file.
func TestNewReaderWithUTF8DocumentWithMetaAfter1024Bytes(t *testing.T) {
	file := "testdata/utf8-meta-after-1024.html"

	f, err := os.Open(file)
	if err != nil {
		t.Fatalf("Unable to open file: %v", err)
	}
	// Release the file descriptor when the test returns.
	defer f.Close()

	reader, err := NewCharsetReader(f, "text/html")
	if err != nil {
		t.Fatalf("Unable to create reader: %v", err)
	}

	data, err := io.ReadAll(reader)
	if err != nil {
		t.Fatalf("Unable to read data: %v", err)
	}

	if !utf8.Valid(data) {
		t.Fatalf("Data is not valid UTF-8")
	}

	expectedUnicodeString := "Café"
	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
	}
}

View file

@ -2,6 +2,6 @@
<feed>
<title>테스트 피드</title>
<entry>
<title>こんにちは世界</title>
<title>Café</title>
</entry>
</feed>

View file

@ -0,0 +1,48 @@
<!DOCTYPE html>
<html>
<!---
This text is greater than 1024 bytes which are used by the charset.NewReader to determine the encoding of the file.
This comment is used to pad the file to 1024 bytes.
The <meta> tag must be after 1024 bytes to ensure that the encoding is detected correctly.
---
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
-->
<head>
<meta charset="iso-8859-1">
<title>Frédéric</title>
</head>
<body>
<p>Café</p>
</body>
</html>

View file

@ -0,0 +1,10 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="iso-8859-1">
<title>Frédéric</title>
</head>
<body>
<p>Café</p>
</body>
</html>

View file

@ -2,6 +2,6 @@
<feed>
<title>테스트 피드</title>
<entry>
<title>こんにちは世界</title>
<title>Café</title>
</entry>
</feed>

View file

@ -0,0 +1,48 @@
<!DOCTYPE html>
<html>
<!---
This text is greater than 1024 bytes which are used by the charset.NewReader to determine the encoding of the file.
This comment is used to pad the file to 1024 bytes.
The <meta> tag must be after 1024 bytes to ensure that the encoding is detected correctly.
---
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
More text to pad the file to 1024 bytes.
-->
<head>
<meta charset="utf-8">
<title>Frédéric</title>
</head>
<body>
<p>Café</p>
</body>
</html>

View file

@ -0,0 +1,10 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Café</title>
</head>
<body>
<p>Café</p>
</body>
</html>

View file

@ -2,6 +2,6 @@
<feed>
<title>테스트 피드</title>
<entry>
<title>こんにちは世界</title>
<title>Café</title>
</entry>
</feed>

View file

@ -21,12 +21,12 @@ import (
"miniflux.app/v2/internal/config"
"miniflux.app/v2/internal/crypto"
"miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/reader/encoding"
"miniflux.app/v2/internal/reader/fetcher"
"miniflux.app/v2/internal/urllib"
"github.com/PuerkitoBio/goquery"
"golang.org/x/image/draw"
"golang.org/x/net/html/charset"
)
type IconFinder struct {
@ -248,7 +248,7 @@ func findIconURLsFromHTMLDocument(body io.Reader, contentType string) ([]string,
"link[rel='apple-touch-icon-precomposed.png']",
}
htmlDocumentReader, err := charset.NewReader(body, contentType)
htmlDocumentReader, err := encoding.NewCharsetReader(body, contentType)
if err != nil {
return nil, fmt.Errorf("icon: unable to create charset reader: %w", err)
}

View file

@ -10,12 +10,12 @@ import (
"strings"
"miniflux.app/v2/internal/config"
"miniflux.app/v2/internal/reader/encoding"
"miniflux.app/v2/internal/reader/fetcher"
"miniflux.app/v2/internal/reader/readability"
"miniflux.app/v2/internal/urllib"
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html/charset"
)
func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
@ -39,10 +39,11 @@ func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string
rules = getPredefinedScraperRules(pageURL)
}
htmlDocumentReader, err := charset.NewReader(
htmlDocumentReader, err := encoding.NewCharsetReader(
responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
responseHandler.ContentType(),
)
if err != nil {
return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
}

View file

@ -16,12 +16,12 @@ import (
"miniflux.app/v2/internal/integration/rssbridge"
"miniflux.app/v2/internal/locale"
"miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/reader/encoding"
"miniflux.app/v2/internal/reader/fetcher"
"miniflux.app/v2/internal/reader/parser"
"miniflux.app/v2/internal/urllib"
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html/charset"
)
var (
@ -136,7 +136,7 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL, contentTyp
"link[type='application/feed+json']": parser.FormatJSON,
}
htmlDocumentReader, err := charset.NewReader(body, contentType)
htmlDocumentReader, err := encoding.NewCharsetReader(body, contentType)
if err != nil {
return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err)
}