diff --git a/internal/reader/xml/decoder.go b/internal/reader/xml/decoder.go index c9cb5346..f58f43ea 100644 --- a/internal/reader/xml/decoder.go +++ b/internal/reader/xml/decoder.go @@ -21,18 +21,23 @@ func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder { buffer := &bytes.Buffer{} io.Copy(buffer, data) - enc := getEncoding(buffer.Bytes()) - if enc == nil || bytes.EqualFold(enc, []byte("utf-8")) { - // filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content + if hasUTF8XMLDeclaration(buffer.Bytes()) { + // TODO: detect actual encoding from bytes if not UTF-8 and convert to UTF-8 if needed. + // For now we just expect the invalid characters to be stripped out. + + // Filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content filteredBytes := filterValidXMLChars(buffer.Bytes()) + decoder = xml.NewDecoder(bytes.NewReader(filteredBytes)) } else { data.Seek(0, io.SeekStart) - // invalid characters will be filtered later via decoder.CharsetReader decoder = xml.NewDecoder(data) + + // The XML document will be converted to UTF-8 by encoding.CharsetReader + // Invalid characters will be filtered later via decoder.CharsetReader + decoder.CharsetReader = charsetReaderFilterInvalidUtf8 } - decoder.CharsetReader = charsetReaderFilterInvalidUtf8 decoder.Entity = xml.HTMLEntity decoder.Strict = false @@ -46,7 +51,7 @@ func charsetReaderFilterInvalidUtf8(charset string, input io.Reader) (io.Reader, } rawData, err := io.ReadAll(utf8Reader) if err != nil { - return nil, fmt.Errorf("encoding: unable to read data: %w", err) + return nil, fmt.Errorf("xml: unable to read data: %w", err) } filteredBytes := filterValidXMLChars(rawData) return bytes.NewReader(filteredBytes), nil @@ -110,3 +115,8 @@ func getEncoding(b []byte) []byte { } return v[1 : idx+1] } + +func hasUTF8XMLDeclaration(data []byte) bool { + enc := getEncoding(data) + return enc == nil || bytes.EqualFold(enc, []byte("utf-8")) +} diff --git 
a/internal/reader/xml/decoder_test.go b/internal/reader/xml/decoder_test.go index 879363a7..d33a204e 100644 --- a/internal/reader/xml/decoder_test.go +++ b/internal/reader/xml/decoder_test.go @@ -6,11 +6,78 @@ package xml // import "miniflux.app/v2/internal/reader/xml" import ( "encoding/xml" "fmt" + "os" "strings" "testing" "unicode/utf8" ) +func TestXMLDocumentWithISO88591Encoding(t *testing.T) { + fp, err := os.Open("testdata/iso88591.xml") + if err != nil { + t.Fatal(err) + } + defer fp.Close() + + type myXMLDocument struct { + XMLName xml.Name `xml:"note"` + To string `xml:"to"` + From string `xml:"from"` + } + + var doc myXMLDocument + + decoder := NewXMLDecoder(fp) + err = decoder.Decode(&doc) + if err != nil { + t.Fatal(err) + } + + expectedTo := "Anaïs" + expectedFrom := "Jürgen" + + if doc.To != expectedTo { + t.Errorf(`Incorrect "to" field, expected: %q, got: %q`, expectedTo, doc.To) + } + if doc.From != expectedFrom { + t.Errorf(`Incorrect "from" field, expected: %q, got: %q`, expectedFrom, doc.From) + } +} + +func TestXMLDocumentWithISO88591FileEncodingButUTF8Prolog(t *testing.T) { + fp, err := os.Open("testdata/iso88591_utf8_mismatch.xml") + if err != nil { + t.Fatal(err) + } + defer fp.Close() + + type myXMLDocument struct { + XMLName xml.Name `xml:"note"` + To string `xml:"to"` + From string `xml:"from"` + } + + var doc myXMLDocument + + decoder := NewXMLDecoder(fp) + err = decoder.Decode(&doc) + if err != nil { + t.Fatal(err) + } + + // TODO: detect actual encoding from bytes if not UTF-8 and convert to UTF-8 if needed. + // For now we just expect the invalid characters to be stripped out. 
+ expectedTo := "Anas" + expectedFrom := "Jrgen" + + if doc.To != expectedTo { + t.Errorf(`Incorrect "to" field, expected: %q, got: %q`, expectedTo, doc.To) + } + if doc.From != expectedFrom { + t.Errorf(`Incorrect "from" field, expected: %q, got: %q`, expectedFrom, doc.From) + } +} + func TestXMLDocumentWithIllegalUnicodeCharacters(t *testing.T) { type myxml struct { XMLName xml.Name `xml:"rss"` diff --git a/internal/reader/xml/testdata/iso88591.xml b/internal/reader/xml/testdata/iso88591.xml new file mode 100644 index 00000000..df506038 --- /dev/null +++ b/internal/reader/xml/testdata/iso88591.xml @@ -0,0 +1,5 @@ +<?xml version="1.0" encoding="ISO-8859-1"?> +<note> + <to>Anaïs</to> + <from>Jürgen</from> +</note> diff --git a/internal/reader/xml/testdata/iso88591_utf8_mismatch.xml b/internal/reader/xml/testdata/iso88591_utf8_mismatch.xml new file mode 100644 index 00000000..2fa1a61d --- /dev/null +++ b/internal/reader/xml/testdata/iso88591_utf8_mismatch.xml @@ -0,0 +1,5 @@ +<?xml version="1.0" encoding="UTF-8"?> +<note> + <to>Anaïs</to> + <from>Jürgen</from> +</note>