fix(scraper): avoid encoding issue if charset meta tag is after 1024 bytes

2025-09-15 18:57:04 +00:00 · 2025-02-15 16:58:06 -08:00 · 2025-02-15 16:58:06 -08:00 · 6eedf4111f
commit 6eedf4111f
parent af1f966250
12 changed files with 352 additions and 10 deletions
--- a/internal/reader/encoding/encoding.go
+++ b/internal/reader/encoding/encoding.go
@ -5,6 +5,7 @@ package encoding // import "miniflux.app/v2/internal/reader/encoding"

 import (
 	"bytes"
+	"fmt"
 	"io"
 	"unicode/utf8"

@ -23,7 +24,11 @@ import (
 // - Feeds with encoding specified only in XML document and not in HTTP header
 // - Feeds with wrong encoding defined and already in UTF-8
 func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
-	buffer, _ := io.ReadAll(input)
+	buffer, err := io.ReadAll(input)
+	if err != nil {
+		return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
+	}
+
 	r := bytes.NewReader(buffer)

 	// The document is already UTF-8, do not do anything (avoid double-encoding).
@ -35,3 +40,24 @@ func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
 	// Transform document to UTF-8 from the specified encoding in XML prolog.
 	return charset.NewReaderLabel(charsetLabel, r)
 }
+
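+// NewCharsetReader returns an io.Reader that yields the content of r as UTF-8; contentType is the Content-Type header value used to pick the source encoding when the content is not already valid UTF-8.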
+func NewCharsetReader(r io.Reader, contentType string) (io.Reader, error) {
+	buffer, err := io.ReadAll(r)
+	if err != nil {
+		return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
+	}
+
+	internalReader := bytes.NewReader(buffer)
+
+	// The document is already UTF-8, do not do anything.
+	if utf8.Valid(buffer) {
+		return internalReader, nil
+	}
+
+	// Transform document to UTF-8 from the specified encoding in Content-Type header.
+	// Note that charset.NewReader only uses the first 1024 bytes to detect the encoding.
+	// If the <meta charset> tag is not found in the first 1024 bytes, charset.DetermineEncoding returns "windows-1252", resulting in encoding issues; this is why documents that are already valid UTF-8 are returned unchanged above.
+	// See https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+	return charset.NewReader(internalReader, contentType)
+}
--- a/internal/reader/encoding/encoding_test.go
+++ b/internal/reader/encoding/encoding_test.go
@ -4,6 +4,7 @@
 package encoding // import "miniflux.app/v2/internal/reader/encoding"

 import (
+	"bytes"
 	"io"
 	"os"
 	"testing"
@ -31,6 +32,11 @@ func TestCharsetReaderWithUTF8(t *testing.T) {
 	if !utf8.Valid(data) {
 		t.Fatalf("Data is not valid UTF-8")
 	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
 }

 func TestCharsetReaderWithISO88591(t *testing.T) {
@ -54,6 +60,11 @@ func TestCharsetReaderWithISO88591(t *testing.T) {
 	if !utf8.Valid(data) {
 		t.Fatalf("Data is not valid UTF-8")
 	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
 }

 func TestCharsetReaderWithWindows1252(t *testing.T) {
@ -77,6 +88,11 @@ func TestCharsetReaderWithWindows1252(t *testing.T) {
 	if !utf8.Valid(data) {
 		t.Fatalf("Data is not valid UTF-8")
 	}
+
+	expectedUnicodeString := "Euro €"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
 }

 func TestCharsetReaderWithInvalidProlog(t *testing.T) {
@ -100,6 +116,11 @@ func TestCharsetReaderWithInvalidProlog(t *testing.T) {
 	if !utf8.Valid(data) {
 		t.Fatalf("Data is not valid UTF-8")
 	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
 }

 func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
@ -123,6 +144,11 @@ func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
 	if !utf8.Valid(data) {
 		t.Fatalf("Data is not valid UTF-8")
 	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
 }

 func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
@ -146,4 +172,177 @@ func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
 	if !utf8.Valid(data) {
 		t.Fatalf("Data is not valid UTF-8")
 	}
+
+	expectedUnicodeString := "Euro €"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
+}
+
+func TestNewReaderWithUTF8Document(t *testing.T) {
+	file := "testdata/utf8.html"
+
+	f, err := os.Open(file)
+	if err != nil {
+		t.Fatalf("Unable to open file: %v", err)
+	}
+
+	reader, err := NewCharsetReader(f, "text/html; charset=UTF-8")
+	if err != nil {
+		t.Fatalf("Unable to create reader: %v", err)
+	}
+
+	data, err := io.ReadAll(reader)
+	if err != nil {
+		t.Fatalf("Unable to read data: %v", err)
+	}
+
+	if !utf8.Valid(data) {
+		t.Fatalf("Data is not valid UTF-8")
+	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
+}
+
+func TestNewReaderWithUTF8DocumentAndNoContentEncoding(t *testing.T) {
+	file := "testdata/utf8.html"
+
+	f, err := os.Open(file)
+	if err != nil {
+		t.Fatalf("Unable to open file: %v", err)
+	}
+
+	reader, err := NewCharsetReader(f, "text/html")
+	if err != nil {
+		t.Fatalf("Unable to create reader: %v", err)
+	}
+
+	data, err := io.ReadAll(reader)
+	if err != nil {
+		t.Fatalf("Unable to read data: %v", err)
+	}
+
+	if !utf8.Valid(data) {
+		t.Fatalf("Data is not valid UTF-8")
+	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
+}
+
+func TestNewReaderWithISO88591Document(t *testing.T) {
+	file := "testdata/iso-8859-1.xml"
+
+	f, err := os.Open(file)
+	if err != nil {
+		t.Fatalf("Unable to open file: %v", err)
+	}
+
+	reader, err := NewCharsetReader(f, "text/html; charset=ISO-8859-1")
+	if err != nil {
+		t.Fatalf("Unable to create reader: %v", err)
+	}
+
+	data, err := io.ReadAll(reader)
+	if err != nil {
+		t.Fatalf("Unable to read data: %v", err)
+	}
+
+	if !utf8.Valid(data) {
+		t.Fatalf("Data is not valid UTF-8")
+	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
+}
+
+func TestNewReaderWithISO88591DocumentAndNoContentType(t *testing.T) {
+	file := "testdata/iso-8859-1.xml"
+
+	f, err := os.Open(file)
+	if err != nil {
+		t.Fatalf("Unable to open file: %v", err)
+	}
+
+	reader, err := NewCharsetReader(f, "")
+	if err != nil {
+		t.Fatalf("Unable to create reader: %v", err)
+	}
+
+	data, err := io.ReadAll(reader)
+	if err != nil {
+		t.Fatalf("Unable to read data: %v", err)
+	}
+
+	if !utf8.Valid(data) {
+		t.Fatalf("Data is not valid UTF-8")
+	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
+}
+
+func TestNewReaderWithISO88591DocumentWithMetaAfter1024Bytes(t *testing.T) {
+	file := "testdata/iso-8859-1-meta-after-1024.html"
+
+	f, err := os.Open(file)
+	if err != nil {
+		t.Fatalf("Unable to open file: %v", err)
+	}
+
+	reader, err := NewCharsetReader(f, "text/html")
+	if err != nil {
+		t.Fatalf("Unable to create reader: %v", err)
+	}
+
+	data, err := io.ReadAll(reader)
+	if err != nil {
+		t.Fatalf("Unable to read data: %v", err)
+	}
+
+	if !utf8.Valid(data) {
+		t.Fatalf("Data is not valid UTF-8")
+	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
+}
+
+func TestNewReaderWithUTF8DocumentWithMetaAfter1024Bytes(t *testing.T) {
+	file := "testdata/utf8-meta-after-1024.html"
+
+	f, err := os.Open(file)
+	if err != nil {
+		t.Fatalf("Unable to open file: %v", err)
+	}
+
+	reader, err := NewCharsetReader(f, "text/html")
+	if err != nil {
+		t.Fatalf("Unable to create reader: %v", err)
+	}
+
+	data, err := io.ReadAll(reader)
+	if err != nil {
+		t.Fatalf("Unable to read data: %v", err)
+	}
+
+	if !utf8.Valid(data) {
+		t.Fatalf("Data is not valid UTF-8")
+	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
 }
--- a/internal/reader/encoding/testdata/invalid-prolog.xml
+++ b/internal/reader/encoding/testdata/invalid-prolog.xml
@ -2,6 +2,6 @@
 <feed>
    <title>테스트 피드</title>
    <entry>
-        <title>こんにちは世界</title>
+        <title>Café</title>
    </entry>
 </feed>
--- a/internal/reader/encoding/testdata/iso-8859-1-meta-after-1024.html
+++ b/internal/reader/encoding/testdata/iso-8859-1-meta-after-1024.html
@ -0,0 +1,48 @@
+<!DOCTYPE html>
+<html>
+  <!---
+
+  This text is greater than 1024 bytes which are used by the charset.NewReader to determine the encoding of the file.
+
+  This comment is used to pad the file to 1024 bytes.
+
+  The <meta> tag must be after 1024 bytes to ensure that the encoding is detected correctly.
+
+  ---
+
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+
+  -->
+  <head>
+    <meta charset="iso-8859-1">
+    <title>Frédéric</title>
+  </head>
+  <body>
+    <p>Café</p>
+  </body>
+</html>
--- a/internal/reader/encoding/testdata/iso-8859-1.html
+++ b/internal/reader/encoding/testdata/iso-8859-1.html
@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+  <head>
+    <meta charset="iso-8859-1">
+    <title>Frédéric</title>
+  </head>
+  <body>
+    <p>Café</p>
+  </body>
+</html>
--- a/internal/reader/encoding/testdata/utf8-incorrect-prolog.xml
+++ b/internal/reader/encoding/testdata/utf8-incorrect-prolog.xml
@ -2,6 +2,6 @@
 <feed>
    <title>테스트 피드</title>
    <entry>
-        <title>こんにちは世界</title>
+        <title>Café</title>
    </entry>
 </feed>
--- a/internal/reader/encoding/testdata/utf8-meta-after-1024.html
+++ b/internal/reader/encoding/testdata/utf8-meta-after-1024.html
@ -0,0 +1,48 @@
+<!DOCTYPE html>
+<html>
+  <!---
+
+  This text is greater than 1024 bytes which are used by the charset.NewReader to determine the encoding of the file.
+
+  This comment is used to pad the file to 1024 bytes.
+
+  The <meta> tag must be after 1024 bytes to ensure that the encoding is detected correctly.
+
+  ---
+
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+  More text to pad the file to 1024 bytes.
+
+  -->
+  <head>
+    <meta charset="utf-8">
+    <title>Frédéric</title>
+  </head>
+  <body>
+    <p>Café</p>
+  </body>
+</html>
--- a/internal/reader/encoding/testdata/utf8.html
+++ b/internal/reader/encoding/testdata/utf8.html
@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+  <head>
+    <meta charset="utf-8">
+    <title>Café</title>
+  </head>
+  <body>
+    <p>Café</p>
+  </body>
+</html>
--- a/internal/reader/encoding/testdata/utf8.xml
+++ b/internal/reader/encoding/testdata/utf8.xml
@ -2,6 +2,6 @@
 <feed>
    <title>테스트 피드</title>
    <entry>
-        <title>こんにちは世界</title>
+        <title>Café</title>
    </entry>
 </feed>