diff --git a/internal/reader/xml/decoder.go b/internal/reader/xml/decoder.go
index c9cb5346..f58f43ea 100644
--- a/internal/reader/xml/decoder.go
+++ b/internal/reader/xml/decoder.go
@@ -21,18 +21,23 @@ func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
buffer := &bytes.Buffer{}
io.Copy(buffer, data)
- enc := getEncoding(buffer.Bytes())
- if enc == nil || bytes.EqualFold(enc, []byte("utf-8")) {
- // filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content
+ if hasUTF8XMLDeclaration(buffer.Bytes()) {
+ // TODO: detect actual encoding from bytes if not UTF-8 and convert to UTF-8 if needed.
+ // For now we just expect the invalid characters to be stripped out.
+
+ // Filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content
filteredBytes := filterValidXMLChars(buffer.Bytes())
+
decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
} else {
data.Seek(0, io.SeekStart)
- // invalid characters will be filtered later via decoder.CharsetReader
decoder = xml.NewDecoder(data)
+
+ // The XML document will be converted to UTF-8 by encoding.CharsetReader
+ // Invalid characters will be filtered later via decoder.CharsetReader
+ decoder.CharsetReader = charsetReaderFilterInvalidUtf8
}
- decoder.CharsetReader = charsetReaderFilterInvalidUtf8
decoder.Entity = xml.HTMLEntity
decoder.Strict = false
@@ -46,7 +51,7 @@ func charsetReaderFilterInvalidUtf8(charset string, input io.Reader) (io.Reader,
}
rawData, err := io.ReadAll(utf8Reader)
if err != nil {
- return nil, fmt.Errorf("encoding: unable to read data: %w", err)
+ return nil, fmt.Errorf("xml: unable to read data: %w", err)
}
filteredBytes := filterValidXMLChars(rawData)
return bytes.NewReader(filteredBytes), nil
@@ -110,3 +115,8 @@ func getEncoding(b []byte) []byte {
}
return v[1 : idx+1]
}
+
+func hasUTF8XMLDeclaration(data []byte) bool { // Reports whether data is to be treated as UTF-8: true when getEncoding yields nil or "utf-8".
+ enc := getEncoding(data) // getEncoding is defined elsewhere in this file; only its tail is visible in this hunk
+ return enc == nil || bytes.EqualFold(enc, []byte("utf-8")) // nil also counts as UTF-8 (presumably "no encoding declared" - confirm in getEncoding); comparison ignores case
+}
diff --git a/internal/reader/xml/decoder_test.go b/internal/reader/xml/decoder_test.go
index 879363a7..d33a204e 100644
--- a/internal/reader/xml/decoder_test.go
+++ b/internal/reader/xml/decoder_test.go
@@ -6,11 +6,78 @@ package xml // import "miniflux.app/v2/internal/reader/xml"
import (
"encoding/xml"
"fmt"
+ "os"
"strings"
"testing"
"unicode/utf8"
)
+func TestXMLDocumentWithISO88591Encoding(t *testing.T) { // Decodes testdata/iso88591.xml via NewXMLDecoder and checks that both accented names come out intact.
+ fp, err := os.Open("testdata/iso88591.xml") // NOTE(review): assumes the fixture is ISO-8859-1 with a matching prolog - confirm against the testdata file
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer fp.Close()
+
+ type myXMLDocument struct { // minimal target type: a <note> root with <to> and <from> children
+ XMLName xml.Name `xml:"note"`
+ To string `xml:"to"`
+ From string `xml:"from"`
+ }
+
+ var doc myXMLDocument
+
+ decoder := NewXMLDecoder(fp) // *os.File is passed as the io.ReadSeeker that NewXMLDecoder expects
+ err = decoder.Decode(&doc)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ expectedTo := "Anaïs" // the non-ASCII "ï" must survive decoding
+ expectedFrom := "Jürgen" // likewise for "ü"
+
+ if doc.To != expectedTo {
+ t.Errorf(`Incorrect "to" field, expected: %q, got: %q`, expectedTo, doc.To)
+ }
+ if doc.From != expectedFrom {
+ t.Errorf(`Incorrect "from" field, expected: %q, got: %q`, expectedFrom, doc.From)
+ }
+}
+
+func TestXMLDocumentWithISO88591FileEncodingButUTF8Prolog(t *testing.T) { // Decodes testdata/iso88591_utf8_mismatch.xml and pins the current lossy result for the two names.
+ fp, err := os.Open("testdata/iso88591_utf8_mismatch.xml") // NOTE(review): assumes ISO-8859-1 bytes behind a UTF-8 prolog, per the test name - confirm against the testdata file
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer fp.Close()
+
+ type myXMLDocument struct { // minimal target type: a <note> root with <to> and <from> children
+ XMLName xml.Name `xml:"note"`
+ To string `xml:"to"`
+ From string `xml:"from"`
+ }
+
+ var doc myXMLDocument
+
+ decoder := NewXMLDecoder(fp) // *os.File is passed as the io.ReadSeeker that NewXMLDecoder expects
+ err = decoder.Decode(&doc)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // TODO: detect actual encoding from bytes if not UTF-8 and convert to UTF-8 if needed.
+ // For now we just expect the invalid characters to be stripped out.
+ expectedTo := "Anas" // "Anaïs" with the "ï" dropped
+ expectedFrom := "Jrgen" // "Jürgen" with the "ü" dropped
+
+ if doc.To != expectedTo {
+ t.Errorf(`Incorrect "to" field, expected: %q, got: %q`, expectedTo, doc.To)
+ }
+ if doc.From != expectedFrom {
+ t.Errorf(`Incorrect "from" field, expected: %q, got: %q`, expectedFrom, doc.From)
+ }
+}
+
func TestXMLDocumentWithIllegalUnicodeCharacters(t *testing.T) {
type myxml struct {
XMLName xml.Name `xml:"rss"`
diff --git a/internal/reader/xml/testdata/iso88591.xml b/internal/reader/xml/testdata/iso88591.xml
new file mode 100644
index 00000000..df506038
--- /dev/null
+++ b/internal/reader/xml/testdata/iso88591.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<note>
+ <to>Anaïs</to>
+ <from>Jürgen</from>
+</note>
diff --git a/internal/reader/xml/testdata/iso88591_utf8_mismatch.xml b/internal/reader/xml/testdata/iso88591_utf8_mismatch.xml
new file mode 100644
index 00000000..2fa1a61d
--- /dev/null
+++ b/internal/reader/xml/testdata/iso88591_utf8_mismatch.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<note>
+ <to>Anaïs</to>
+ <from>Jürgen</from>
+</note>