1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00

perf(xml): optimize xml filtering

Instead of using bytes.Map, which returns a copy of the provided []byte,
use a custom in-place implementation, as the bytes.Map call takes around
25% of rss.Parse.
This commit is contained in:
jvoisin 2025-06-09 15:26:11 +02:00 committed by Frédéric Guillot
parent 49085daefe
commit d59990f1dd

View file

@ -9,6 +9,7 @@ import (
"fmt" "fmt"
"io" "io"
"strings" "strings"
"unicode/utf8"
"miniflux.app/v2/internal/reader/encoding" "miniflux.app/v2/internal/reader/encoding"
) )
@ -24,7 +25,7 @@ func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
enc := getEncoding(buffer.Bytes()) enc := getEncoding(buffer.Bytes())
if enc == "" || strings.EqualFold(enc, "utf-8") { if enc == "" || strings.EqualFold(enc, "utf-8") {
// filter invalid chars now, since decoder.CharsetReader not called for utf-8 content // filter invalid chars now, since decoder.CharsetReader not called for utf-8 content
filteredBytes := bytes.Map(filterValidXMLChar, buffer.Bytes()) filteredBytes := filterValidXMLChars(buffer.Bytes())
decoder = xml.NewDecoder(bytes.NewReader(filteredBytes)) decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
} else { } else {
// filter invalid chars later within decoder.CharsetReader // filter invalid chars later within decoder.CharsetReader
@ -43,13 +44,32 @@ func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
if err != nil { if err != nil {
return nil, fmt.Errorf("encoding: unable to read data: %w", err) return nil, fmt.Errorf("encoding: unable to read data: %w", err)
} }
filteredBytes := bytes.Map(filterValidXMLChar, rawData) filteredBytes := filterValidXMLChars(rawData)
return bytes.NewReader(filteredBytes), nil return bytes.NewReader(filteredBytes), nil
} }
return decoder return decoder
} }
// filterValidXMLChars removes, in place, everything in s that is not a legal
// XML character and returns the filtered prefix of s.
//
// The backing array of s is modified and the returned slice aliases it, so
// callers must use the return value and must not rely on the previous content
// of s. This function is inspired from bytes.Map, but never allocates.
//
// Bytes that are not part of a valid UTF-8 sequence are dropped. They cannot be
// replaced with U+FFFD in place, because its 3-byte encoding would be longer
// than the 1-byte input and would overwrite data that has not been read yet
// (or run past the end of the slice).
func filterValidXMLChars(s []byte) []byte {
	j := 0
	for i := 0; i < len(s); {
		wid := 1
		r := rune(s[i])
		if r >= utf8.RuneSelf {
			r, wid = utf8.DecodeRune(s[i:])
			// (RuneError, 1) means an invalid encoding; a genuine, correctly
			// encoded U+FFFD is reported with a width of 3 and is kept.
			if r == utf8.RuneError && wid == 1 {
				i++
				continue
			}
		}
		if filterValidXMLChar(r) >= 0 {
			// Copy the original bytes rather than re-encoding the rune: the
			// sequence is already valid UTF-8, and since j <= i the write can
			// never reach past the bytes that were just read.
			j += copy(s[j:], s[i:i+wid])
		}
		i += wid
	}
	return s[:j]
}
// This function is copied from encoding/xml package, // This function is copied from encoding/xml package,
// and is used to check if all the characters are legal. // and is used to check if all the characters are legal.
func filterValidXMLChar(r rune) rune { func filterValidXMLChar(r rune) rune {