1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00

Filter valid XML characters for UTF-8 XML documents before decoding

This change should reduce "illegal character code" XML errors.
This commit is contained in:
Jebbs 2019-12-20 10:31:52 +08:00 committed by Frédéric Guillot
parent a4ebb33cd5
commit a155ab6deb
2 changed files with 96 additions and 4 deletions

View file

@ -10,13 +10,25 @@ import (
"fmt"
"io"
"io/ioutil"
"strings"
"miniflux.app/reader/encoding"
)
// NewDecoder returns an XML decoder that filters illegal characters.
func NewDecoder(data io.Reader) *xml.Decoder {
decoder := xml.NewDecoder(data)
var decoder *xml.Decoder
buffer, _ := ioutil.ReadAll(data)
enc := procInst("encoding", string(buffer))
if enc != "" && enc != "utf-8" && enc != "UTF-8" && !strings.EqualFold(enc, "utf-8") {
// filter invalid chars later within decoder.CharsetReader
decoder = xml.NewDecoder(bytes.NewReader(buffer))
} else {
// filter invalid chars now, since decoder.CharsetReader not called for utf-8 content
filteredBytes := bytes.Map(filterValidXMLChar, buffer)
decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
}
decoder.Entity = xml.HTMLEntity
decoder.Strict = false
decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
@ -48,3 +60,28 @@ func filterValidXMLChar(r rune) rune {
}
return -1
}
// procInst extracts the value of a `param="..."` or `param='...'`
// pseudo-attribute from s, returning "" if no quoted value is found.
//
// This function is adapted from the encoding/xml package. The parsing is
// deliberately simple: it looks for the literal text `param=` and expects a
// single or double quote immediately after it. Occurrences that are not
// followed by a quote are skipped so that a later, well-formed occurrence
// can still be found. If the opening quote is never closed, "" is returned.
func procInst(param, s string) string {
	param += "="
	rest := s
	for {
		idx := strings.Index(rest, param)
		if idx == -1 {
			return ""
		}
		// Drop everything up to and including `param=`; every iteration
		// consumes at least len(param) bytes, so the loop terminates.
		rest = rest[idx+len(param):]
		if rest == "" {
			return ""
		}
		quote := rest[0]
		if quote != '\'' && quote != '"' {
			// Not a quoted value: keep looking for another occurrence
			// instead of giving up on the first mismatch.
			continue
		}
		end := strings.IndexByte(rest[1:], quote)
		if end == -1 {
			return ""
		}
		// end is relative to rest[1:], hence the +1 on the upper bound.
		return rest[1 : end+1]
	}
}