From d59990f1dd2c9921e4095e8201d3333f0cc1b107 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Mon, 9 Jun 2025 15:26:11 +0200 Subject: [PATCH] perf(xml): optimize xml filtering Instead of using bytes.Map which is returning a copy of the provided []byte, use a custom in-place implementation, as the bytes.Map call is taking around 25% of rss.Parse --- internal/reader/xml/decoder.go | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/internal/reader/xml/decoder.go b/internal/reader/xml/decoder.go index 13af32cb..b0f6b1f9 100644 --- a/internal/reader/xml/decoder.go +++ b/internal/reader/xml/decoder.go @@ -9,6 +9,7 @@ import ( "fmt" "io" "strings" + "unicode/utf8" "miniflux.app/v2/internal/reader/encoding" ) @@ -24,7 +25,7 @@ func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder { enc := getEncoding(buffer.Bytes()) if enc == "" || strings.EqualFold(enc, "utf-8") { // filter invalid chars now, since decoder.CharsetReader not called for utf-8 content - filteredBytes := bytes.Map(filterValidXMLChar, buffer.Bytes()) + filteredBytes := filterValidXMLChars(buffer.Bytes()) decoder = xml.NewDecoder(bytes.NewReader(filteredBytes)) } else { // filter invalid chars later within decoder.CharsetReader @@ -43,13 +44,32 @@ func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder { if err != nil { return nil, fmt.Errorf("encoding: unable to read data: %w", err) } - filteredBytes := bytes.Map(filterValidXMLChar, rawData) + filteredBytes := filterValidXMLChars(rawData) return bytes.NewReader(filteredBytes), nil } return decoder } +// filterValidXMLChars filters inplace invalid XML characters. +// This function is inspired from bytes.Map +func filterValidXMLChars(s []byte) []byte { + j := 0 + for i := 0; i < len(s); { + wid := 1 + r := rune(s[i]) + if r >= utf8.RuneSelf { + r, wid = utf8.DecodeRune(s[i:]) + } + if r = filterValidXMLChar(r); r >= 0 { + utf8.EncodeRune(s[j:], r) + j += wid + } + i += wid + } + return s[:j] +} + // This function is copied from encoding/xml package, // and is used to check if all the characters are legal. func filterValidXMLChar(r rune) rune {