mirror of
https://github.com/miniflux/v2.git
synced 2025-10-05 19:31:01 +00:00
test(xml): add test cases regarding XML encoding
This commit is contained in:
parent
fac18d5c57
commit
04a360a536
4 changed files with 93 additions and 6 deletions
|
@ -21,18 +21,23 @@ func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
|
|||
buffer := &bytes.Buffer{}
|
||||
io.Copy(buffer, data)
|
||||
|
||||
enc := getEncoding(buffer.Bytes())
|
||||
if enc == nil || bytes.EqualFold(enc, []byte("utf-8")) {
|
||||
// filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content
|
||||
if hasUTF8XMLDeclaration(buffer.Bytes()) {
|
||||
// TODO: detect actual encoding from bytes if not UTF-8 and convert to UTF-8 if needed.
|
||||
// For now we just expect the invalid characters to be stripped out.
|
||||
|
||||
// Filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content
|
||||
filteredBytes := filterValidXMLChars(buffer.Bytes())
|
||||
|
||||
decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
|
||||
} else {
|
||||
data.Seek(0, io.SeekStart)
|
||||
// invalid characters will be filtered later via decoder.CharsetReader
|
||||
decoder = xml.NewDecoder(data)
|
||||
|
||||
// The XML document will be converted to UTF-8 by encoding.CharsetReader
|
||||
// Invalid characters will be filtered later via decoder.CharsetReader
|
||||
decoder.CharsetReader = charsetReaderFilterInvalidUtf8
|
||||
}
|
||||
|
||||
decoder.CharsetReader = charsetReaderFilterInvalidUtf8
|
||||
decoder.Entity = xml.HTMLEntity
|
||||
decoder.Strict = false
|
||||
|
||||
|
@ -46,7 +51,7 @@ func charsetReaderFilterInvalidUtf8(charset string, input io.Reader) (io.Reader,
|
|||
}
|
||||
rawData, err := io.ReadAll(utf8Reader)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("encoding: unable to read data: %w", err)
|
||||
return nil, fmt.Errorf("xml: unable to read data: %w", err)
|
||||
}
|
||||
filteredBytes := filterValidXMLChars(rawData)
|
||||
return bytes.NewReader(filteredBytes), nil
|
||||
|
@ -110,3 +115,8 @@ func getEncoding(b []byte) []byte {
|
|||
}
|
||||
return v[1 : idx+1]
|
||||
}
|
||||
|
||||
func hasUTF8XMLDeclaration(data []byte) bool {
|
||||
enc := getEncoding(data)
|
||||
return enc == nil || bytes.EqualFold(enc, []byte("utf-8"))
|
||||
}
|
||||
|
|
|
@ -6,11 +6,78 @@ package xml // import "miniflux.app/v2/internal/reader/xml"
|
|||
import (
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
func TestXMLDocumentWithISO88591Encoding(t *testing.T) {
|
||||
fp, err := os.Open("testdata/iso88591.xml")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer fp.Close()
|
||||
|
||||
type myXMLDocument struct {
|
||||
XMLName xml.Name `xml:"note"`
|
||||
To string `xml:"to"`
|
||||
From string `xml:"from"`
|
||||
}
|
||||
|
||||
var doc myXMLDocument
|
||||
|
||||
decoder := NewXMLDecoder(fp)
|
||||
err = decoder.Decode(&doc)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
expectedTo := "Anaïs"
|
||||
expectedFrom := "Jürgen"
|
||||
|
||||
if doc.To != expectedTo {
|
||||
t.Errorf(`Incorrect "to" field, expected: %q, got: %q`, expectedTo, doc.To)
|
||||
}
|
||||
if doc.From != expectedFrom {
|
||||
t.Errorf(`Incorrect "from" field, expected: %q, got: %q`, expectedFrom, doc.From)
|
||||
}
|
||||
}
|
||||
|
||||
func TestXMLDocumentWithISO88591FileEncodingButUTF8Prolog(t *testing.T) {
|
||||
fp, err := os.Open("testdata/iso88591_utf8_mismatch.xml")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer fp.Close()
|
||||
|
||||
type myXMLDocument struct {
|
||||
XMLName xml.Name `xml:"note"`
|
||||
To string `xml:"to"`
|
||||
From string `xml:"from"`
|
||||
}
|
||||
|
||||
var doc myXMLDocument
|
||||
|
||||
decoder := NewXMLDecoder(fp)
|
||||
err = decoder.Decode(&doc)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// TODO: detect actual encoding from bytes if not UTF-8 and convert to UTF-8 if needed.
|
||||
// For now we just expect the invalid characters to be stripped out.
|
||||
expectedTo := "Anas"
|
||||
expectedFrom := "Jrgen"
|
||||
|
||||
if doc.To != expectedTo {
|
||||
t.Errorf(`Incorrect "to" field, expected: %q, got: %q`, expectedTo, doc.To)
|
||||
}
|
||||
if doc.From != expectedFrom {
|
||||
t.Errorf(`Incorrect "from" field, expected: %q, got: %q`, expectedFrom, doc.From)
|
||||
}
|
||||
}
|
||||
|
||||
func TestXMLDocumentWithIllegalUnicodeCharacters(t *testing.T) {
|
||||
type myxml struct {
|
||||
XMLName xml.Name `xml:"rss"`
|
||||
|
|
5
internal/reader/xml/testdata/iso88591.xml
vendored
Normal file
5
internal/reader/xml/testdata/iso88591.xml
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
<?xml version="1.0" encoding="iso8859-1"?>
|
||||
<note>
|
||||
<to>Anaïs</to>
|
||||
<from>Jürgen</from>
|
||||
</note>
|
5
internal/reader/xml/testdata/iso88591_utf8_mismatch.xml
vendored
Normal file
5
internal/reader/xml/testdata/iso88591_utf8_mismatch.xml
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<note>
|
||||
<to>Anaïs</to>
|
||||
<from>Jürgen</from>
|
||||
</note>
|
Loading…
Add table
Add a link
Reference in a new issue