mirror of
https://github.com/miniflux/v2.git
synced 2025-06-27 16:36:00 +00:00
fix(scraper): avoid encoding issue if charset meta tag is after 1024 bytes
This commit is contained in:
parent
af1f966250
commit
6eedf4111f
12 changed files with 352 additions and 10 deletions
|
@ -5,6 +5,7 @@ package encoding // import "miniflux.app/v2/internal/reader/encoding"
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"unicode/utf8"
|
||||
|
||||
|
@ -23,7 +24,11 @@ import (
|
|||
// - Feeds with encoding specified only in XML document and not in HTTP header
|
||||
// - Feeds with wrong encoding defined and already in UTF-8
|
||||
func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
|
||||
buffer, _ := io.ReadAll(input)
|
||||
buffer, err := io.ReadAll(input)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
|
||||
}
|
||||
|
||||
r := bytes.NewReader(buffer)
|
||||
|
||||
// The document is already UTF-8, do not do anything (avoid double-encoding).
|
||||
|
@ -35,3 +40,24 @@ func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
|
|||
// Transform document to UTF-8 from the specified encoding in XML prolog.
|
||||
return charset.NewReaderLabel(charsetLabel, r)
|
||||
}
|
||||
|
||||
// NewCharsetReader returns an io.Reader that converts the content of r to UTF-8.
|
||||
func NewCharsetReader(r io.Reader, contentType string) (io.Reader, error) {
|
||||
buffer, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
|
||||
}
|
||||
|
||||
internalReader := bytes.NewReader(buffer)
|
||||
|
||||
// The document is already UTF-8, do not do anything.
|
||||
if utf8.Valid(buffer) {
|
||||
return internalReader, nil
|
||||
}
|
||||
|
||||
// Transform document to UTF-8 from the specified encoding in Content-Type header.
|
||||
// Note that only the first 1024 bytes are used to detect the encoding.
|
||||
// If the <meta charset> tag is not found in the first 1024 bytes, charset.DetermineEncoding returns "windows-1252" resulting in encoding issues.
|
||||
// See https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
|
||||
return charset.NewReader(internalReader, contentType)
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
package encoding // import "miniflux.app/v2/internal/reader/encoding"
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"os"
|
||||
"testing"
|
||||
|
@ -31,6 +32,11 @@ func TestCharsetReaderWithUTF8(t *testing.T) {
|
|||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCharsetReaderWithISO88591(t *testing.T) {
|
||||
|
@ -54,6 +60,11 @@ func TestCharsetReaderWithISO88591(t *testing.T) {
|
|||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCharsetReaderWithWindows1252(t *testing.T) {
|
||||
|
@ -77,6 +88,11 @@ func TestCharsetReaderWithWindows1252(t *testing.T) {
|
|||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Euro €"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCharsetReaderWithInvalidProlog(t *testing.T) {
|
||||
|
@ -100,6 +116,11 @@ func TestCharsetReaderWithInvalidProlog(t *testing.T) {
|
|||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
|
||||
|
@ -123,6 +144,11 @@ func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
|
|||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
|
||||
|
@ -146,4 +172,177 @@ func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
|
|||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Euro €"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewReaderWithUTF8Document(t *testing.T) {
|
||||
file := "testdata/utf8.html"
|
||||
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to open file: %v", err)
|
||||
}
|
||||
|
||||
reader, err := NewCharsetReader(f, "text/html; charset=UTF-8")
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to create reader: %v", err)
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to read data: %v", err)
|
||||
}
|
||||
|
||||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewReaderWithUTF8DocumentAndNoContentEncoding(t *testing.T) {
|
||||
file := "testdata/utf8.html"
|
||||
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to open file: %v", err)
|
||||
}
|
||||
|
||||
reader, err := NewCharsetReader(f, "text/html")
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to create reader: %v", err)
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to read data: %v", err)
|
||||
}
|
||||
|
||||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewReaderWithISO88591Document(t *testing.T) {
|
||||
file := "testdata/iso-8859-1.xml"
|
||||
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to open file: %v", err)
|
||||
}
|
||||
|
||||
reader, err := NewCharsetReader(f, "text/html; charset=ISO-8859-1")
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to create reader: %v", err)
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to read data: %v", err)
|
||||
}
|
||||
|
||||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewReaderWithISO88591DocumentAndNoContentType(t *testing.T) {
|
||||
file := "testdata/iso-8859-1.xml"
|
||||
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to open file: %v", err)
|
||||
}
|
||||
|
||||
reader, err := NewCharsetReader(f, "")
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to create reader: %v", err)
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to read data: %v", err)
|
||||
}
|
||||
|
||||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewReaderWithISO88591DocumentWithMetaAfter1024Bytes(t *testing.T) {
|
||||
file := "testdata/iso-8859-1-meta-after-1024.html"
|
||||
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to open file: %v", err)
|
||||
}
|
||||
|
||||
reader, err := NewCharsetReader(f, "text/html")
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to create reader: %v", err)
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to read data: %v", err)
|
||||
}
|
||||
|
||||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewReaderWithUTF8DocumentWithMetaAfter1024Bytes(t *testing.T) {
|
||||
file := "testdata/utf8-meta-after-1024.html"
|
||||
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to open file: %v", err)
|
||||
}
|
||||
|
||||
reader, err := NewCharsetReader(f, "text/html")
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to create reader: %v", err)
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to read data: %v", err)
|
||||
}
|
||||
|
||||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,6 +2,6 @@
|
|||
<feed>
|
||||
<title>테스트 피드</title>
|
||||
<entry>
|
||||
<title>こんにちは世界</title>
|
||||
<title>Café</title>
|
||||
</entry>
|
||||
</feed>
|
48
internal/reader/encoding/testdata/iso-8859-1-meta-after-1024.html
vendored
Normal file
48
internal/reader/encoding/testdata/iso-8859-1-meta-after-1024.html
vendored
Normal file
|
@ -0,0 +1,48 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<!---
|
||||
|
||||
This text is greater than 1024 bytes which are used by the charset.NewReader to determine the encoding of the file.
|
||||
|
||||
This comment is used to pad the file to 1024 bytes.
|
||||
|
||||
The <meta> tag must be after 1024 bytes to ensure that the encoding is detected correctly.
|
||||
|
||||
---
|
||||
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
|
||||
-->
|
||||
<head>
|
||||
<meta charset="iso-8859-1">
|
||||
<title>Frédéric</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>Café</p>
|
||||
</body>
|
||||
</html>
|
10
internal/reader/encoding/testdata/iso-8859-1.html
vendored
Normal file
10
internal/reader/encoding/testdata/iso-8859-1.html
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="iso-8859-1">
|
||||
<title>Frédéric</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>Café</p>
|
||||
</body>
|
||||
</html>
|
|
@ -2,6 +2,6 @@
|
|||
<feed>
|
||||
<title>테스트 피드</title>
|
||||
<entry>
|
||||
<title>こんにちは世界</title>
|
||||
<title>Café</title>
|
||||
</entry>
|
||||
</feed>
|
48
internal/reader/encoding/testdata/utf8-meta-after-1024.html
vendored
Normal file
48
internal/reader/encoding/testdata/utf8-meta-after-1024.html
vendored
Normal file
|
@ -0,0 +1,48 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<!---
|
||||
|
||||
This text is greater than 1024 bytes which are used by the charset.NewReader to determine the encoding of the file.
|
||||
|
||||
This comment is used to pad the file to 1024 bytes.
|
||||
|
||||
The <meta> tag must be after 1024 bytes to ensure that the encoding is detected correctly.
|
||||
|
||||
---
|
||||
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
|
||||
-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Frédéric</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>Café</p>
|
||||
</body>
|
||||
</html>
|
10
internal/reader/encoding/testdata/utf8.html
vendored
Normal file
10
internal/reader/encoding/testdata/utf8.html
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Café</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>Café</p>
|
||||
</body>
|
||||
</html>
|
2
internal/reader/encoding/testdata/utf8.xml
vendored
2
internal/reader/encoding/testdata/utf8.xml
vendored
|
@ -2,6 +2,6 @@
|
|||
<feed>
|
||||
<title>테스트 피드</title>
|
||||
<entry>
|
||||
<title>こんにちは世界</title>
|
||||
<title>Café</title>
|
||||
</entry>
|
||||
</feed>
|
|
@ -21,12 +21,12 @@ import (
|
|||
"miniflux.app/v2/internal/config"
|
||||
"miniflux.app/v2/internal/crypto"
|
||||
"miniflux.app/v2/internal/model"
|
||||
"miniflux.app/v2/internal/reader/encoding"
|
||||
"miniflux.app/v2/internal/reader/fetcher"
|
||||
"miniflux.app/v2/internal/urllib"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"golang.org/x/image/draw"
|
||||
"golang.org/x/net/html/charset"
|
||||
)
|
||||
|
||||
type IconFinder struct {
|
||||
|
@ -248,7 +248,7 @@ func findIconURLsFromHTMLDocument(body io.Reader, contentType string) ([]string,
|
|||
"link[rel='apple-touch-icon-precomposed.png']",
|
||||
}
|
||||
|
||||
htmlDocumentReader, err := charset.NewReader(body, contentType)
|
||||
htmlDocumentReader, err := encoding.NewCharsetReader(body, contentType)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("icon: unable to create charset reader: %w", err)
|
||||
}
|
||||
|
|
|
@ -10,12 +10,12 @@ import (
|
|||
"strings"
|
||||
|
||||
"miniflux.app/v2/internal/config"
|
||||
"miniflux.app/v2/internal/reader/encoding"
|
||||
"miniflux.app/v2/internal/reader/fetcher"
|
||||
"miniflux.app/v2/internal/reader/readability"
|
||||
"miniflux.app/v2/internal/urllib"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"golang.org/x/net/html/charset"
|
||||
)
|
||||
|
||||
func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
|
||||
|
@ -39,10 +39,11 @@ func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string
|
|||
rules = getPredefinedScraperRules(pageURL)
|
||||
}
|
||||
|
||||
htmlDocumentReader, err := charset.NewReader(
|
||||
htmlDocumentReader, err := encoding.NewCharsetReader(
|
||||
responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
|
||||
responseHandler.ContentType(),
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
|
||||
}
|
||||
|
|
|
@ -16,12 +16,12 @@ import (
|
|||
"miniflux.app/v2/internal/integration/rssbridge"
|
||||
"miniflux.app/v2/internal/locale"
|
||||
"miniflux.app/v2/internal/model"
|
||||
"miniflux.app/v2/internal/reader/encoding"
|
||||
"miniflux.app/v2/internal/reader/fetcher"
|
||||
"miniflux.app/v2/internal/reader/parser"
|
||||
"miniflux.app/v2/internal/urllib"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"golang.org/x/net/html/charset"
|
||||
)
|
||||
|
||||
var (
|
||||
|
@ -136,7 +136,7 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL, contentTyp
|
|||
"link[type='application/feed+json']": parser.FormatJSON,
|
||||
}
|
||||
|
||||
htmlDocumentReader, err := charset.NewReader(body, contentType)
|
||||
htmlDocumentReader, err := encoding.NewCharsetReader(body, contentType)
|
||||
if err != nil {
|
||||
return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err)
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue