1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-10-05 19:31:01 +00:00

test(xml): add test cases regarding XML encoding

This commit is contained in:
Frédéric Guillot 2025-09-30 20:36:14 -07:00
parent fac18d5c57
commit 04a360a536
4 changed files with 93 additions and 6 deletions

View file

@ -21,18 +21,23 @@ func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
buffer := &bytes.Buffer{}
io.Copy(buffer, data)
enc := getEncoding(buffer.Bytes())
if enc == nil || bytes.EqualFold(enc, []byte("utf-8")) {
// filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content
if hasUTF8XMLDeclaration(buffer.Bytes()) {
// TODO: detect actual encoding from bytes if not UTF-8 and convert to UTF-8 if needed.
// For now we just expect the invalid characters to be stripped out.
// Filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content
filteredBytes := filterValidXMLChars(buffer.Bytes())
decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
} else {
data.Seek(0, io.SeekStart)
// invalid characters will be filtered later via decoder.CharsetReader
decoder = xml.NewDecoder(data)
// The XML document will be converted to UTF-8 by encoding.CharsetReader
// Invalid characters will be filtered later via decoder.CharsetReader
decoder.CharsetReader = charsetReaderFilterInvalidUtf8
}
decoder.CharsetReader = charsetReaderFilterInvalidUtf8
decoder.Entity = xml.HTMLEntity
decoder.Strict = false
@ -46,7 +51,7 @@ func charsetReaderFilterInvalidUtf8(charset string, input io.Reader) (io.Reader,
}
rawData, err := io.ReadAll(utf8Reader)
if err != nil {
return nil, fmt.Errorf("encoding: unable to read data: %w", err)
return nil, fmt.Errorf("xml: unable to read data: %w", err)
}
filteredBytes := filterValidXMLChars(rawData)
return bytes.NewReader(filteredBytes), nil
@ -110,3 +115,8 @@ func getEncoding(b []byte) []byte {
}
return v[1 : idx+1]
}
// hasUTF8XMLDeclaration reports whether the document in data should be
// handled as UTF-8 content.
//
// It returns true when getEncoding yields nil (presumably meaning that no
// encoding was found in the XML prolog; confirm against getEncoding), or
// when the returned encoding name equals "utf-8" under case-insensitive
// comparison. Any other encoding name yields false.
func hasUTF8XMLDeclaration(data []byte) bool {
	declared := getEncoding(data)
	if declared == nil {
		return true
	}
	return bytes.EqualFold(declared, []byte("utf-8"))
}

View file

@ -6,11 +6,78 @@ package xml // import "miniflux.app/v2/internal/reader/xml"
import (
"encoding/xml"
"fmt"
"os"
"strings"
"testing"
"unicode/utf8"
)
// TestXMLDocumentWithISO88591Encoding decodes the testdata/iso88591.xml
// fixture through NewXMLDecoder and checks that the accented characters in
// the "to" and "from" elements come out as the expected UTF-8 strings.
func TestXMLDocumentWithISO88591Encoding(t *testing.T) {
	type note struct {
		XMLName xml.Name `xml:"note"`
		To      string   `xml:"to"`
		From    string   `xml:"from"`
	}

	file, err := os.Open("testdata/iso88591.xml")
	if err != nil {
		t.Fatal(err)
	}
	defer file.Close()

	var got note
	if err := NewXMLDecoder(file).Decode(&got); err != nil {
		t.Fatal(err)
	}

	if want := "Anaïs"; got.To != want {
		t.Errorf(`Incorrect "to" field, expected: %q, got: %q`, want, got.To)
	}
	if want := "Jürgen"; got.From != want {
		t.Errorf(`Incorrect "from" field, expected: %q, got: %q`, want, got.From)
	}
}
// TestXMLDocumentWithISO88591FileEncodingButUTF8Prolog decodes the
// testdata/iso88591_utf8_mismatch.xml fixture through NewXMLDecoder. As the
// test name indicates, the fixture's prolog declares UTF-8 while the file
// itself is encoded differently, so the accented characters are expected to
// be dropped from the decoded values rather than converted.
func TestXMLDocumentWithISO88591FileEncodingButUTF8Prolog(t *testing.T) {
	type note struct {
		XMLName xml.Name `xml:"note"`
		To      string   `xml:"to"`
		From    string   `xml:"from"`
	}

	file, err := os.Open("testdata/iso88591_utf8_mismatch.xml")
	if err != nil {
		t.Fatal(err)
	}
	defer file.Close()

	var got note
	if err := NewXMLDecoder(file).Decode(&got); err != nil {
		t.Fatal(err)
	}

	// TODO: detect the actual encoding from the bytes when it is not UTF-8
	// and convert to UTF-8 if needed. Until then, the invalid characters are
	// simply expected to be stripped out.
	if want := "Anas"; got.To != want {
		t.Errorf(`Incorrect "to" field, expected: %q, got: %q`, want, got.To)
	}
	if want := "Jrgen"; got.From != want {
		t.Errorf(`Incorrect "from" field, expected: %q, got: %q`, want, got.From)
	}
}
func TestXMLDocumentWithIllegalUnicodeCharacters(t *testing.T) {
type myxml struct {
XMLName xml.Name `xml:"rss"`

View file

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="iso8859-1"?>
<note>
<to>Anaïs</to>
<from>Jürgen</from>
</note>

View file

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<note>
<to>Anaïs</to>
<from>Jürgen</from>
</note>