mirror of
https://github.com/miniflux/v2.git
synced 2025-08-26 18:21:01 +00:00
fix(scraper): avoid encoding issue if charset meta tag is after 1024 bytes
This commit is contained in:
parent
af1f966250
commit
6eedf4111f
12 changed files with 352 additions and 10 deletions
|
@ -5,6 +5,7 @@ package encoding // import "miniflux.app/v2/internal/reader/encoding"
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"unicode/utf8"
|
||||
|
||||
|
@ -23,7 +24,11 @@ import (
|
|||
// - Feeds with encoding specified only in XML document and not in HTTP header
|
||||
// - Feeds with wrong encoding defined and already in UTF-8
|
||||
func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
|
||||
buffer, _ := io.ReadAll(input)
|
||||
buffer, err := io.ReadAll(input)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
|
||||
}
|
||||
|
||||
r := bytes.NewReader(buffer)
|
||||
|
||||
// The document is already UTF-8, do not do anything (avoid double-encoding).
|
||||
|
@ -35,3 +40,24 @@ func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
|
|||
// Transform document to UTF-8 from the specified encoding in XML prolog.
|
||||
return charset.NewReaderLabel(charsetLabel, r)
|
||||
}
|
||||
|
||||
// NewCharsetReader returns an io.Reader that converts the content of r to UTF-8.
|
||||
func NewCharsetReader(r io.Reader, contentType string) (io.Reader, error) {
|
||||
buffer, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
|
||||
}
|
||||
|
||||
internalReader := bytes.NewReader(buffer)
|
||||
|
||||
// The document is already UTF-8, do not do anything.
|
||||
if utf8.Valid(buffer) {
|
||||
return internalReader, nil
|
||||
}
|
||||
|
||||
// Transform document to UTF-8 from the specified encoding in Content-Type header.
|
||||
// Note that only the first 1024 bytes are used to detect the encoding.
|
||||
// If the <meta charset> tag is not found in the first 1024 bytes, charset.DetermineEncoding returns "windows-1252" resulting in encoding issues.
|
||||
// See https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
|
||||
return charset.NewReader(internalReader, contentType)
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
package encoding // import "miniflux.app/v2/internal/reader/encoding"
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"os"
|
||||
"testing"
|
||||
|
@ -31,6 +32,11 @@ func TestCharsetReaderWithUTF8(t *testing.T) {
|
|||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCharsetReaderWithISO88591(t *testing.T) {
|
||||
|
@ -54,6 +60,11 @@ func TestCharsetReaderWithISO88591(t *testing.T) {
|
|||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCharsetReaderWithWindows1252(t *testing.T) {
|
||||
|
@ -77,6 +88,11 @@ func TestCharsetReaderWithWindows1252(t *testing.T) {
|
|||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Euro €"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCharsetReaderWithInvalidProlog(t *testing.T) {
|
||||
|
@ -100,6 +116,11 @@ func TestCharsetReaderWithInvalidProlog(t *testing.T) {
|
|||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
|
||||
|
@ -123,6 +144,11 @@ func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
|
|||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
|
||||
|
@ -146,4 +172,177 @@ func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
|
|||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Euro €"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewReaderWithUTF8Document(t *testing.T) {
|
||||
file := "testdata/utf8.html"
|
||||
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to open file: %v", err)
|
||||
}
|
||||
|
||||
reader, err := NewCharsetReader(f, "text/html; charset=UTF-8")
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to create reader: %v", err)
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to read data: %v", err)
|
||||
}
|
||||
|
||||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewReaderWithUTF8DocumentAndNoContentEncoding(t *testing.T) {
|
||||
file := "testdata/utf8.html"
|
||||
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to open file: %v", err)
|
||||
}
|
||||
|
||||
reader, err := NewCharsetReader(f, "text/html")
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to create reader: %v", err)
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to read data: %v", err)
|
||||
}
|
||||
|
||||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewReaderWithISO88591Document(t *testing.T) {
|
||||
file := "testdata/iso-8859-1.xml"
|
||||
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to open file: %v", err)
|
||||
}
|
||||
|
||||
reader, err := NewCharsetReader(f, "text/html; charset=ISO-8859-1")
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to create reader: %v", err)
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to read data: %v", err)
|
||||
}
|
||||
|
||||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewReaderWithISO88591DocumentAndNoContentType(t *testing.T) {
|
||||
file := "testdata/iso-8859-1.xml"
|
||||
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to open file: %v", err)
|
||||
}
|
||||
|
||||
reader, err := NewCharsetReader(f, "")
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to create reader: %v", err)
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to read data: %v", err)
|
||||
}
|
||||
|
||||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewReaderWithISO88591DocumentWithMetaAfter1024Bytes(t *testing.T) {
|
||||
file := "testdata/iso-8859-1-meta-after-1024.html"
|
||||
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to open file: %v", err)
|
||||
}
|
||||
|
||||
reader, err := NewCharsetReader(f, "text/html")
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to create reader: %v", err)
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to read data: %v", err)
|
||||
}
|
||||
|
||||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewReaderWithUTF8DocumentWithMetaAfter1024Bytes(t *testing.T) {
|
||||
file := "testdata/utf8-meta-after-1024.html"
|
||||
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to open file: %v", err)
|
||||
}
|
||||
|
||||
reader, err := NewCharsetReader(f, "text/html")
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to create reader: %v", err)
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to read data: %v", err)
|
||||
}
|
||||
|
||||
if !utf8.Valid(data) {
|
||||
t.Fatalf("Data is not valid UTF-8")
|
||||
}
|
||||
|
||||
expectedUnicodeString := "Café"
|
||||
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,6 +2,6 @@
|
|||
<feed>
|
||||
<title>테스트 피드</title>
|
||||
<entry>
|
||||
<title>こんにちは世界</title>
|
||||
<title>Café</title>
|
||||
</entry>
|
||||
</feed>
|
48
internal/reader/encoding/testdata/iso-8859-1-meta-after-1024.html
vendored
Normal file
48
internal/reader/encoding/testdata/iso-8859-1-meta-after-1024.html
vendored
Normal file
|
@ -0,0 +1,48 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<!---
|
||||
|
||||
This text is greater than 1024 bytes which are used by the charset.NewReader to determine the encoding of the file.
|
||||
|
||||
This comment is used to pad the file to 1024 bytes.
|
||||
|
||||
The <meta> tag must be after 1024 bytes to ensure that the encoding is detected correctly.
|
||||
|
||||
---
|
||||
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
|
||||
-->
|
||||
<head>
|
||||
<meta charset="iso-8859-1">
|
||||
<title>Frédéric</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>Café</p>
|
||||
</body>
|
||||
</html>
|
10
internal/reader/encoding/testdata/iso-8859-1.html
vendored
Normal file
10
internal/reader/encoding/testdata/iso-8859-1.html
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="iso-8859-1">
|
||||
<title>Frédéric</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>Café</p>
|
||||
</body>
|
||||
</html>
|
|
@ -2,6 +2,6 @@
|
|||
<feed>
|
||||
<title>테스트 피드</title>
|
||||
<entry>
|
||||
<title>こんにちは世界</title>
|
||||
<title>Café</title>
|
||||
</entry>
|
||||
</feed>
|
48
internal/reader/encoding/testdata/utf8-meta-after-1024.html
vendored
Normal file
48
internal/reader/encoding/testdata/utf8-meta-after-1024.html
vendored
Normal file
|
@ -0,0 +1,48 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<!---
|
||||
|
||||
This text is greater than 1024 bytes which are used by the charset.NewReader to determine the encoding of the file.
|
||||
|
||||
This comment is used to pad the file to 1024 bytes.
|
||||
|
||||
The <meta> tag must be after 1024 bytes to ensure that the encoding is detected correctly.
|
||||
|
||||
---
|
||||
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
|
||||
-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Frédéric</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>Café</p>
|
||||
</body>
|
||||
</html>
|
10
internal/reader/encoding/testdata/utf8.html
vendored
Normal file
10
internal/reader/encoding/testdata/utf8.html
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Café</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>Café</p>
|
||||
</body>
|
||||
</html>
|
2
internal/reader/encoding/testdata/utf8.xml
vendored
2
internal/reader/encoding/testdata/utf8.xml
vendored
|
@ -2,6 +2,6 @@
|
|||
<feed>
|
||||
<title>테스트 피드</title>
|
||||
<entry>
|
||||
<title>こんにちは世界</title>
|
||||
<title>Café</title>
|
||||
</entry>
|
||||
</feed>
|
Loading…
Add table
Add a link
Reference in a new issue