mirror of
https://github.com/miniflux/v2.git
synced 2025-10-05 19:31:01 +00:00
test(xml): add test cases regarding XML encoding
This commit is contained in:
parent
fac18d5c57
commit
04a360a536
4 changed files with 93 additions and 6 deletions
|
@ -21,18 +21,23 @@ func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
|
||||||
buffer := &bytes.Buffer{}
|
buffer := &bytes.Buffer{}
|
||||||
io.Copy(buffer, data)
|
io.Copy(buffer, data)
|
||||||
|
|
||||||
enc := getEncoding(buffer.Bytes())
|
if hasUTF8XMLDeclaration(buffer.Bytes()) {
|
||||||
if enc == nil || bytes.EqualFold(enc, []byte("utf-8")) {
|
// TODO: detect actual encoding from bytes if not UTF-8 and convert to UTF-8 if needed.
|
||||||
// filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content
|
// For now we just expect the invalid characters to be stripped out.
|
||||||
|
|
||||||
|
// Filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content
|
||||||
filteredBytes := filterValidXMLChars(buffer.Bytes())
|
filteredBytes := filterValidXMLChars(buffer.Bytes())
|
||||||
|
|
||||||
decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
|
decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
|
||||||
} else {
|
} else {
|
||||||
data.Seek(0, io.SeekStart)
|
data.Seek(0, io.SeekStart)
|
||||||
// invalid characters will be filtered later via decoder.CharsetReader
|
|
||||||
decoder = xml.NewDecoder(data)
|
decoder = xml.NewDecoder(data)
|
||||||
|
|
||||||
|
// The XML document will be converted to UTF-8 by encoding.CharsetReader
|
||||||
|
// Invalid characters will be filtered later via decoder.CharsetReader
|
||||||
|
decoder.CharsetReader = charsetReaderFilterInvalidUtf8
|
||||||
}
|
}
|
||||||
|
|
||||||
decoder.CharsetReader = charsetReaderFilterInvalidUtf8
|
|
||||||
decoder.Entity = xml.HTMLEntity
|
decoder.Entity = xml.HTMLEntity
|
||||||
decoder.Strict = false
|
decoder.Strict = false
|
||||||
|
|
||||||
|
@ -46,7 +51,7 @@ func charsetReaderFilterInvalidUtf8(charset string, input io.Reader) (io.Reader,
|
||||||
}
|
}
|
||||||
rawData, err := io.ReadAll(utf8Reader)
|
rawData, err := io.ReadAll(utf8Reader)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("encoding: unable to read data: %w", err)
|
return nil, fmt.Errorf("xml: unable to read data: %w", err)
|
||||||
}
|
}
|
||||||
filteredBytes := filterValidXMLChars(rawData)
|
filteredBytes := filterValidXMLChars(rawData)
|
||||||
return bytes.NewReader(filteredBytes), nil
|
return bytes.NewReader(filteredBytes), nil
|
||||||
|
@ -110,3 +115,8 @@ func getEncoding(b []byte) []byte {
|
||||||
}
|
}
|
||||||
return v[1 : idx+1]
|
return v[1 : idx+1]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func hasUTF8XMLDeclaration(data []byte) bool {
|
||||||
|
enc := getEncoding(data)
|
||||||
|
return enc == nil || bytes.EqualFold(enc, []byte("utf-8"))
|
||||||
|
}
|
||||||
|
|
|
@ -6,11 +6,78 @@ package xml // import "miniflux.app/v2/internal/reader/xml"
|
||||||
import (
|
import (
|
||||||
"encoding/xml"
|
"encoding/xml"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func TestXMLDocumentWithISO88591Encoding(t *testing.T) {
|
||||||
|
fp, err := os.Open("testdata/iso88591.xml")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
defer fp.Close()
|
||||||
|
|
||||||
|
type myXMLDocument struct {
|
||||||
|
XMLName xml.Name `xml:"note"`
|
||||||
|
To string `xml:"to"`
|
||||||
|
From string `xml:"from"`
|
||||||
|
}
|
||||||
|
|
||||||
|
var doc myXMLDocument
|
||||||
|
|
||||||
|
decoder := NewXMLDecoder(fp)
|
||||||
|
err = decoder.Decode(&doc)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
expectedTo := "Anaïs"
|
||||||
|
expectedFrom := "Jürgen"
|
||||||
|
|
||||||
|
if doc.To != expectedTo {
|
||||||
|
t.Errorf(`Incorrect "to" field, expected: %q, got: %q`, expectedTo, doc.To)
|
||||||
|
}
|
||||||
|
if doc.From != expectedFrom {
|
||||||
|
t.Errorf(`Incorrect "from" field, expected: %q, got: %q`, expectedFrom, doc.From)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestXMLDocumentWithISO88591FileEncodingButUTF8Prolog(t *testing.T) {
|
||||||
|
fp, err := os.Open("testdata/iso88591_utf8_mismatch.xml")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
defer fp.Close()
|
||||||
|
|
||||||
|
type myXMLDocument struct {
|
||||||
|
XMLName xml.Name `xml:"note"`
|
||||||
|
To string `xml:"to"`
|
||||||
|
From string `xml:"from"`
|
||||||
|
}
|
||||||
|
|
||||||
|
var doc myXMLDocument
|
||||||
|
|
||||||
|
decoder := NewXMLDecoder(fp)
|
||||||
|
err = decoder.Decode(&doc)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: detect actual encoding from bytes if not UTF-8 and convert to UTF-8 if needed.
|
||||||
|
// For now we just expect the invalid characters to be stripped out.
|
||||||
|
expectedTo := "Anas"
|
||||||
|
expectedFrom := "Jrgen"
|
||||||
|
|
||||||
|
if doc.To != expectedTo {
|
||||||
|
t.Errorf(`Incorrect "to" field, expected: %q, got: %q`, expectedTo, doc.To)
|
||||||
|
}
|
||||||
|
if doc.From != expectedFrom {
|
||||||
|
t.Errorf(`Incorrect "from" field, expected: %q, got: %q`, expectedFrom, doc.From)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestXMLDocumentWithIllegalUnicodeCharacters(t *testing.T) {
|
func TestXMLDocumentWithIllegalUnicodeCharacters(t *testing.T) {
|
||||||
type myxml struct {
|
type myxml struct {
|
||||||
XMLName xml.Name `xml:"rss"`
|
XMLName xml.Name `xml:"rss"`
|
||||||
|
|
5
internal/reader/xml/testdata/iso88591.xml
vendored
Normal file
5
internal/reader/xml/testdata/iso88591.xml
vendored
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
<?xml version="1.0" encoding="iso8859-1"?>
|
||||||
|
<note>
|
||||||
|
<to>Anaïs</to>
|
||||||
|
<from>Jürgen</from>
|
||||||
|
</note>
|
5
internal/reader/xml/testdata/iso88591_utf8_mismatch.xml
vendored
Normal file
5
internal/reader/xml/testdata/iso88591_utf8_mismatch.xml
vendored
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<note>
|
||||||
|
<to>Anaïs</to>
|
||||||
|
<from>Jürgen</from>
|
||||||
|
</note>
|
Loading…
Add table
Add a link
Reference in a new issue