From 69be57fc9d4a253100963fcffc012dc935e1a490 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Mon, 29 Sep 2025 19:46:29 +0200 Subject: [PATCH] refactor(reader): misc simplifications - There is no need for getEncoding to return a string instead of a slice of bytes, so let's make it return a []byte instead of a string. - There is also no need to assign a decoder.CharsetReader when the encoding is utf-8, so let's not do that anymore. - Moreover, there is no reason why the function used for decoder.CharsetReader has to be defined as a lambda instead of a proper function. One might argue the other way around, but a lambda lives on the heap, while a "real" function doesn't. --- internal/reader/xml/decoder.go | 45 +++++++++++++++++----------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/internal/reader/xml/decoder.go b/internal/reader/xml/decoder.go index efe4fdcc..7a0f8a11 100644 --- a/internal/reader/xml/decoder.go +++ b/internal/reader/xml/decoder.go @@ -8,7 +8,6 @@ import ( "encoding/xml" "fmt" "io" - "strings" "unicode/utf8" "miniflux.app/v2/internal/reader/encoding" @@ -23,34 +22,36 @@ func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder { io.Copy(buffer, data) enc := getEncoding(buffer.Bytes()) - if enc == "" || strings.EqualFold(enc, "utf-8") { - // filter invalid chars now, since decoder.CharsetReader not called for utf-8 content + if enc == nil || bytes.EqualFold(enc, []byte("utf-8")) { + // filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content filteredBytes := filterValidXMLChars(buffer.Bytes()) decoder = xml.NewDecoder(bytes.NewReader(filteredBytes)) } else { - // filter invalid chars later within decoder.CharsetReader data.Seek(0, io.SeekStart) decoder = xml.NewDecoder(data) + // invalid characters will be filtered later via decoder.CharsetReader + decoder.CharsetReader = charsetReaderFilterInvalidUtf8 } decoder.Entity = xml.HTMLEntity decoder.Strict = false - decoder.CharsetReader = func(charset string, 
input io.Reader) (io.Reader, error) { - utf8Reader, err := encoding.CharsetReader(charset, input) - if err != nil { - return nil, err - } - rawData, err := io.ReadAll(utf8Reader) - if err != nil { - return nil, fmt.Errorf("encoding: unable to read data: %w", err) - } - filteredBytes := filterValidXMLChars(rawData) - return bytes.NewReader(filteredBytes), nil - } return decoder } +func charsetReaderFilterInvalidUtf8(charset string, input io.Reader) (io.Reader, error) { + utf8Reader, err := encoding.CharsetReader(charset, input) + if err != nil { + return nil, err + } + rawData, err := io.ReadAll(utf8Reader) + if err != nil { + return nil, fmt.Errorf("encoding: unable to read data: %w", err) + } + filteredBytes := filterValidXMLChars(rawData) + return bytes.NewReader(filteredBytes), nil +} + // filterValidXMLChars filters inplace invalid XML characters. // This function is inspired from bytes.Map func filterValidXMLChars(s []byte) []byte { @@ -89,23 +90,23 @@ func filterValidXMLChar(r rune) rune { } // This function is copied from encoding/xml's procInst and adapted for []bytes instead of string -func getEncoding(b []byte) string { +func getEncoding(b []byte) []byte { // This parsing is somewhat lame and not exact. // It works for all actual cases, though. idx := bytes.Index(b, []byte("encoding=")) if idx == -1 { - return "" + return nil } v := b[idx+len("encoding="):] if len(v) == 0 { - return "" + return nil } if v[0] != '\'' && v[0] != '"' { - return "" + return nil } idx = bytes.IndexRune(v[1:], rune(v[0])) if idx == -1 { - return "" + return nil } - return string(v[1 : idx+1]) + return v[1 : idx+1] }