diff --git a/internal/reader/encoding/encoding.go b/internal/reader/encoding/encoding.go
index 3987a885..ea33bd1f 100644
--- a/internal/reader/encoding/encoding.go
+++ b/internal/reader/encoding/encoding.go
@@ -5,6 +5,7 @@ package encoding // import "miniflux.app/v2/internal/reader/encoding"
import (
"bytes"
+ "fmt"
"io"
"unicode/utf8"
@@ -23,7 +24,11 @@ import (
// - Feeds with encoding specified only in XML document and not in HTTP header
// - Feeds with wrong encoding defined and already in UTF-8
func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
- buffer, _ := io.ReadAll(input)
+ buffer, err := io.ReadAll(input)
+ if err != nil {
+ return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
+ }
+
r := bytes.NewReader(buffer)
// The document is already UTF-8, do not do anything (avoid double-encoding).
@@ -35,3 +40,24 @@ func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
// Transform document to UTF-8 from the specified encoding in XML prolog.
return charset.NewReaderLabel(charsetLabel, r)
}
+
+// NewCharsetReader returns an io.Reader that converts the content of r to UTF-8, using the charset from the given Content-Type value when the input is not already valid UTF-8.
+func NewCharsetReader(r io.Reader, contentType string) (io.Reader, error) {
+ buffer, err := io.ReadAll(r)
+ if err != nil {
+ return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
+ }
+
+ internalReader := bytes.NewReader(buffer)
+
+ // The document is already UTF-8, do not do anything.
+ if utf8.Valid(buffer) {
+ return internalReader, nil
+ }
+
+ // Transform the document to UTF-8 using the encoding specified in the Content-Type header.
+ // Note that charset.NewReader inspects only the first 1024 bytes when it has to detect the encoding itself.
+ // If no charset declaration is found within those bytes, charset.DetermineEncoding falls back to "windows-1252", which can cause encoding issues.
+ // See https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+ return charset.NewReader(internalReader, contentType)
+}
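For context, here is a minimal sketch of how callers inside the repository might wire these two helpers up. The decodeFeed and readHTMLPage functions below are hypothetical, not part of this change: CharsetReader matches the signature of encoding/xml's Decoder.CharsetReader hook, while NewCharsetReader targets documents fetched over HTTP, where the charset comes from the Content-Type header.

```go
// Illustrative sketch only; decodeFeed and readHTMLPage are hypothetical callers.
package example

import (
	"encoding/xml"
	"io"
	"net/http"

	"miniflux.app/v2/internal/reader/encoding"
)

// decodeFeed parses an XML document, delegating charset conversion to
// CharsetReader so feeds with an encoding declared only in the XML prolog
// (or declared incorrectly) still decode to UTF-8.
func decodeFeed(body io.Reader, v any) error {
	decoder := xml.NewDecoder(body)
	decoder.CharsetReader = encoding.CharsetReader
	return decoder.Decode(v)
}

// readHTMLPage converts an HTML response body to UTF-8, using the charset
// from the Content-Type header or, failing that, the document itself.
func readHTMLPage(response *http.Response) ([]byte, error) {
	utf8Reader, err := encoding.NewCharsetReader(response.Body, response.Header.Get("Content-Type"))
	if err != nil {
		return nil, err
	}
	return io.ReadAll(utf8Reader)
}
```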
diff --git a/internal/reader/encoding/encoding_test.go b/internal/reader/encoding/encoding_test.go
index 10d326ba..708a5e71 100644
--- a/internal/reader/encoding/encoding_test.go
+++ b/internal/reader/encoding/encoding_test.go
@@ -4,6 +4,7 @@
package encoding // import "miniflux.app/v2/internal/reader/encoding"
import (
+ "bytes"
"io"
"os"
"testing"
@@ -31,6 +32,11 @@ func TestCharsetReaderWithUTF8(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
+
+ expectedUnicodeString := "Café"
+ if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+ t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+ }
}
func TestCharsetReaderWithISO88591(t *testing.T) {
@@ -54,6 +60,11 @@ func TestCharsetReaderWithISO88591(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
+
+ expectedUnicodeString := "Café"
+ if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+ t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+ }
}
func TestCharsetReaderWithWindows1252(t *testing.T) {
@@ -77,6 +88,11 @@ func TestCharsetReaderWithWindows1252(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
+
+ expectedUnicodeString := "Euro €"
+ if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+ t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+ }
}
func TestCharsetReaderWithInvalidProlog(t *testing.T) {
@@ -100,6 +116,11 @@ func TestCharsetReaderWithInvalidProlog(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
+
+ expectedUnicodeString := "Café"
+ if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+ t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+ }
}
func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
@@ -123,6 +144,11 @@ func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
+
+ expectedUnicodeString := "Café"
+ if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+ t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+ }
}
func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
@@ -146,4 +172,177 @@ func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
+
+ expectedUnicodeString := "Euro €"
+ if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+ t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+ }
+}
+
+func TestNewReaderWithUTF8Document(t *testing.T) {
+ file := "testdata/utf8.html"
+
+ f, err := os.Open(file)
+ if err != nil {
+ t.Fatalf("Unable to open file: %v", err)
+ }
+
+ reader, err := NewCharsetReader(f, "text/html; charset=UTF-8")
+ if err != nil {
+ t.Fatalf("Unable to create reader: %v", err)
+ }
+
+ data, err := io.ReadAll(reader)
+ if err != nil {
+ t.Fatalf("Unable to read data: %v", err)
+ }
+
+ if !utf8.Valid(data) {
+ t.Fatalf("Data is not valid UTF-8")
+ }
+
+ expectedUnicodeString := "Café"
+ if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+ t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+ }
+}
+
+func TestNewReaderWithUTF8DocumentAndNoContentEncoding(t *testing.T) {
+ file := "testdata/utf8.html"
+
+ f, err := os.Open(file)
+ if err != nil {
+ t.Fatalf("Unable to open file: %v", err)
+ }
+
+ reader, err := NewCharsetReader(f, "text/html")
+ if err != nil {
+ t.Fatalf("Unable to create reader: %v", err)
+ }
+
+ data, err := io.ReadAll(reader)
+ if err != nil {
+ t.Fatalf("Unable to read data: %v", err)
+ }
+
+ if !utf8.Valid(data) {
+ t.Fatalf("Data is not valid UTF-8")
+ }
+
+ expectedUnicodeString := "Café"
+ if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+ t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+ }
+}
+
+func TestNewReaderWithISO88591Document(t *testing.T) {
+ file := "testdata/iso-8859-1.xml"
+
+ f, err := os.Open(file)
+ if err != nil {
+ t.Fatalf("Unable to open file: %v", err)
+ }
+
+ reader, err := NewCharsetReader(f, "text/html; charset=ISO-8859-1")
+ if err != nil {
+ t.Fatalf("Unable to create reader: %v", err)
+ }
+
+ data, err := io.ReadAll(reader)
+ if err != nil {
+ t.Fatalf("Unable to read data: %v", err)
+ }
+
+ if !utf8.Valid(data) {
+ t.Fatalf("Data is not valid UTF-8")
+ }
+
+ expectedUnicodeString := "Café"
+ if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+ t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+ }
+}
+
+func TestNewReaderWithISO88591DocumentAndNoContentType(t *testing.T) {
+ file := "testdata/iso-8859-1.xml"
+
+ f, err := os.Open(file)
+ if err != nil {
+ t.Fatalf("Unable to open file: %v", err)
+ }
+
+ reader, err := NewCharsetReader(f, "")
+ if err != nil {
+ t.Fatalf("Unable to create reader: %v", err)
+ }
+
+ data, err := io.ReadAll(reader)
+ if err != nil {
+ t.Fatalf("Unable to read data: %v", err)
+ }
+
+ if !utf8.Valid(data) {
+ t.Fatalf("Data is not valid UTF-8")
+ }
+
+ expectedUnicodeString := "Café"
+ if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+ t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+ }
+}
+
+func TestNewReaderWithISO88591DocumentWithMetaAfter1024Bytes(t *testing.T) {
+ file := "testdata/iso-8859-1-meta-after-1024.html"
+
+ f, err := os.Open(file)
+ if err != nil {
+ t.Fatalf("Unable to open file: %v", err)
+ }
+
+ reader, err := NewCharsetReader(f, "text/html")
+ if err != nil {
+ t.Fatalf("Unable to create reader: %v", err)
+ }
+
+ data, err := io.ReadAll(reader)
+ if err != nil {
+ t.Fatalf("Unable to read data: %v", err)
+ }
+
+ if !utf8.Valid(data) {
+ t.Fatalf("Data is not valid UTF-8")
+ }
+
+ expectedUnicodeString := "Café"
+ if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+ t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+ }
+}
+
+func TestNewReaderWithUTF8DocumentWithMetaAfter1024Bytes(t *testing.T) {
+ file := "testdata/utf8-meta-after-1024.html"
+
+ f, err := os.Open(file)
+ if err != nil {
+ t.Fatalf("Unable to open file: %v", err)
+ }
+
+ reader, err := NewCharsetReader(f, "text/html")
+ if err != nil {
+ t.Fatalf("Unable to create reader: %v", err)
+ }
+
+ data, err := io.ReadAll(reader)
+ if err != nil {
+ t.Fatalf("Unable to read data: %v", err)
+ }
+
+ if !utf8.Valid(data) {
+ t.Fatalf("Data is not valid UTF-8")
+ }
+
+ expectedUnicodeString := "Café"
+ if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+ t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+ }
}
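The bytes.Contains assertions added throughout these tests guard against double-encoding, which utf8.Valid alone cannot catch: re-encoding UTF-8 bytes as if they were a single-byte charset still yields valid UTF-8, just with mojibake. A standalone sketch of that failure mode, using golang.org/x/text/encoding/charmap (an illustration, not code from this change):

```go
// Standalone illustration: "double encoding" happens when UTF-8 bytes are
// re-interpreted as a single-byte charset and converted to UTF-8 again.
// The result is still valid UTF-8, so only a content check such as
// bytes.Contains(data, []byte("Café")) catches it.
package main

import (
	"fmt"

	"golang.org/x/text/encoding/charmap"
)

func main() {
	original := "Café" // already UTF-8: 0x43 0x61 0x66 0xC3 0xA9

	// Decode the UTF-8 bytes as if they were ISO-8859-1: each byte becomes a rune.
	mangled, err := charmap.ISO8859_1.NewDecoder().String(original)
	if err != nil {
		panic(err)
	}

	fmt.Println(mangled) // "CafÃ©" — valid UTF-8, but not the expected text
}
```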
diff --git a/internal/reader/encoding/testdata/invalid-prolog.xml b/internal/reader/encoding/testdata/invalid-prolog.xml
index 7c8b1283..70fea0d4 100644
--- a/internal/reader/encoding/testdata/invalid-prolog.xml
+++ b/internal/reader/encoding/testdata/invalid-prolog.xml
@@ -2,6 +2,6 @@
[hunk body garbled in extraction: invalid-prolog.xml markup containing "Café"]
\ No newline at end of file
diff --git a/internal/reader/encoding/testdata/iso-8859-1.html b/internal/reader/encoding/testdata/iso-8859-1.html
new file mode 100644
index 00000000..a7275a48
--- /dev/null
+++ b/internal/reader/encoding/testdata/iso-8859-1.html
@@ -0,0 +1,10 @@
[new 10-line ISO-8859-1 HTML fixture containing "Café"; markup garbled in extraction]
\ No newline at end of file
diff --git a/internal/reader/encoding/testdata/utf8-incorrect-prolog.xml b/internal/reader/encoding/testdata/utf8-incorrect-prolog.xml
index 96590162..aa955203 100644
--- a/internal/reader/encoding/testdata/utf8-incorrect-prolog.xml
+++ b/internal/reader/encoding/testdata/utf8-incorrect-prolog.xml
@@ -2,6 +2,6 @@
[hunk body garbled in extraction: utf8-incorrect-prolog.xml markup containing "Café"]
\ No newline at end of file
diff --git a/internal/reader/encoding/testdata/utf8.html b/internal/reader/encoding/testdata/utf8.html
new file mode 100644
index 00000000..bb7ba7e2
--- /dev/null
+++ b/internal/reader/encoding/testdata/utf8.html
@@ -0,0 +1,10 @@
[new 10-line UTF-8 HTML fixture containing "Café"; markup garbled in extraction]
\ No newline at end of file
diff --git a/internal/reader/encoding/testdata/utf8.xml b/internal/reader/encoding/testdata/utf8.xml
index 11f10a5a..96bf7329 100644
--- a/internal/reader/encoding/testdata/utf8.xml
+++ b/internal/reader/encoding/testdata/utf8.xml
@@ -2,6 +2,6 @@
[hunk body garbled in extraction: utf8.xml markup containing "Café"]
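The new HTML fixtures mirror the existing XML ones. As a rough sketch, a fixture such as testdata/iso-8859-1.html or the *-meta-after-1024.html variants could be regenerated along the lines below; the exact markup and file name are assumptions, not the committed testdata.

```go
// Rough fixture generator (assumed layout, not the committed testdata):
// pad the <head> so the <meta charset> declaration falls outside the
// 1024-byte sniffing window used by charset.NewReader, then write the
// file encoded as ISO-8859-1.
package main

import (
	"os"
	"strings"

	"golang.org/x/text/encoding/charmap"
)

func main() {
	padding := strings.Repeat("<!-- padding -->", 80) // ~1.3 KB before the meta tag
	doc := "<!DOCTYPE html>\n<html>\n<head>\n" + padding +
		"\n<meta charset=\"ISO-8859-1\">\n</head>\n<body>\n<p>Café</p>\n</body>\n</html>\n"

	// Convert the UTF-8 source string to ISO-8859-1 bytes before writing.
	latin1, err := charmap.ISO8859_1.NewEncoder().String(doc)
	if err != nil {
		panic(err)
	}
	if err := os.WriteFile("iso-8859-1-meta-after-1024.html", []byte(latin1), 0o644); err != nil {
		panic(err)
	}
}
```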