refactor(xml): improve the performance of NewXMLDecoder

- Invert a condition to make the code more readable
- Extract the encoding directly from the slice of bytes instead of converting it to a string first.
2025-08-16 18:01:37 +00:00 · 2025-01-31 03:37:06 +00:00 · 2025-01-31 03:37:06 +00:00 · b193bc212a
commit b193bc212a
parent 3ebeb38ade
1 changed files with 13 additions and 16 deletions
--- a/internal/reader/xml/decoder.go
+++ b/internal/reader/xml/decoder.go
@@ -17,15 +17,15 @@ import (
 func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
 	var decoder *xml.Decoder
 	buffer, _ := io.ReadAll(data)
-	enc := procInst("encoding", string(buffer))
+	enc := getEncoding(buffer)
-	if enc != "" && enc != "utf-8" && enc != "UTF-8" && !strings.EqualFold(enc, "utf-8") {
+	if enc == "" || strings.EqualFold(enc, "utf-8") {
 		// filter invalid chars later within decoder.CharsetReader
 		data.Seek(0, io.SeekStart)
 		decoder = xml.NewDecoder(data)
 	} else {
 		// filter invalid chars now, since decoder.CharsetReader not called for utf-8 content
 		filteredBytes := bytes.Map(filterValidXMLChar, buffer)
 		decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
 	} else {
 		// filter invalid chars later within decoder.CharsetReader
 		data.Seek(0, io.SeekStart)
 		decoder = xml.NewDecoder(data)
 	}
 	decoder.Entity = xml.HTMLEntity
@@ -60,27 +60,24 @@ func filterValidXMLChar(r rune) rune {
 	return -1
 }
-// This function is copied from encoding/xml package,
+// This function is copied from encoding/xml's procInst and adapted for []byte instead of string
-// procInst parses the `param="..."` or `param='...'`
+func getEncoding(b []byte) string {
 // value out of the provided string, returning "" if not found.
 func procInst(param, s string) string {
 	// TODO: this parsing is somewhat lame and not exact.
 	// It works for all actual cases, though.
-	param += "="
+	idx := bytes.Index(b, []byte("encoding="))
 	idx := strings.Index(s, param)
 	if idx == -1 {
 		return ""
 	}
-	v := s[idx+len(param):]
+	v := b[idx+len("encoding="):]
-	if v == "" {
+	if len(v) == 0 {
 		return ""
 	}
 	if v[0] != '\'' && v[0] != '"' {
 		return ""
 	}
-	idx = strings.IndexRune(v[1:], rune(v[0]))
+	idx = bytes.IndexRune(v[1:], rune(v[0]))
 	if idx == -1 {
 		return ""
 	}
-	return v[1 : idx+1]
+	return string(v[1 : idx+1])
 }