miniflux-v2/internal/reader/xml/decoder.go

// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package xml // import "miniflux.app/v2/internal/reader/xml"

import (
	"bytes"
	"encoding/xml"
	"fmt"
	"io"
	"strings"
	"unicode/utf8"

	"miniflux.app/v2/internal/reader/encoding"
)

// NewXMLDecoder returns a XML decoder that filters illegal characters.
//
// The input is read in full before the decoder is created, and the decoder is
// always fed from that in-memory copy. When the document declares no encoding,
// or declares UTF-8, illegal characters are stripped right away because the
// decoder does not call CharsetReader for UTF-8 content. For any other declared
// encoding the raw bytes are handed to the decoder, and CharsetReader converts
// them to UTF-8 and strips illegal characters at that point.
//
// An error encountered while reading data is not discarded: the decoder
// reports it once it has consumed the bytes that could be read.
func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
	// This is way faster than io.ReadAll(data) as the buffer can be allocated in one go instead of dynamically grown.
	var buffer bytes.Buffer
	_, readErr := io.Copy(&buffer, data)
	content := buffer.Bytes()

	if enc := getEncoding(content); enc == "" || strings.EqualFold(enc, "utf-8") {
		// Filter invalid chars now, since decoder.CharsetReader is not called for utf-8 content.
		content = filterValidXMLChars(content)
	}
	// Otherwise invalid chars are filtered later, within decoder.CharsetReader.

	// Serving the buffered bytes in both cases avoids rewinding data and
	// reading it a second time.
	var input io.Reader = bytes.NewReader(content)
	if readErr != nil {
		// Append the failure after the bytes that were read, so the caller
		// sees the real cause instead of a silently truncated document.
		input = io.MultiReader(input, readFailure{err: fmt.Errorf("xml: unable to read data: %w", readErr)})
	}

	decoder := xml.NewDecoder(input)
	decoder.Entity = xml.HTMLEntity
	decoder.Strict = false
	decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
		utf8Reader, err := encoding.CharsetReader(charset, input)
		if err != nil {
			return nil, err
		}
		// The converted document is read in full so it can be filtered in one pass.
		rawData, err := io.ReadAll(utf8Reader)
		if err != nil {
			return nil, fmt.Errorf("encoding: unable to read data: %w", err)
		}
		filteredBytes := filterValidXMLChars(rawData)
		return bytes.NewReader(filteredBytes), nil
	}

	return decoder
}

// readFailure is an io.Reader whose Read always fails with err.
type readFailure struct{ err error }

// Read implements io.Reader; it never produces data.
func (r readFailure) Read([]byte) (int, error) { return 0, r.err }

// filterValidXMLChars removes, in place, every byte sequence of s that is not
// a legal XML character, and returns the shortened slice. The returned slice
// shares its backing array with s, so the content of s is overwritten.
//
// Bytes that are not valid UTF-8 are dropped rather than replaced, because the
// replacement character is wider than the single invalid byte and would not
// fit when rewriting in place. A well-formed U+FFFD already present in the
// input is a legal XML character and is kept.
//
// This function is inspired from bytes.Map
func filterValidXMLChars(s []byte) []byte {
	j := 0 // write position; never ahead of the read position i
	for i := 0; i < len(s); {
		r, wid := rune(s[i]), 1
		if r >= utf8.RuneSelf {
			r, wid = utf8.DecodeRune(s[i:])
		}
		// utf8.DecodeRune reports an invalid encoding as (RuneError, 1); a
		// genuine U+FFFD decodes with a width of 3 and must not be discarded.
		malformed := r == utf8.RuneError && wid == 1
		if !malformed && filterValidXMLChar(r) >= 0 {
			// Shift the original bytes left over the removed ones. Since
			// j <= i, unread input is never overwritten.
			j += copy(s[j:], s[i:i+wid])
		}
		i += wid
	}
	return s[:j]
}

// filterValidXMLChar reports whether r is a legal XML character: it returns r
// unchanged when it is, and -1 when it is not.
// The accepted ranges are copied from the encoding/xml package.
func filterValidXMLChar(r rune) rune {
	switch {
	case r == 0x09, r == 0x0A, r == 0x0D:
		// Tab, line feed and carriage return are the only legal control characters.
	case 0x20 <= r && r <= 0xD7FF:
	case 0xE000 <= r && r <= 0xFFFD:
	case 0x10000 <= r && r <= 0x10FFFF:
	default:
		return -1
	}
	return r
}

// getEncoding returns the quoted value that follows the first occurrence of
// "encoding=" in b, or an empty string when there is no such occurrence, the
// value is not wrapped in single or double quotes, or the closing quote is
// missing.
//
// The logic is borrowed from encoding/xml's procInst and adapted to work on a
// []byte instead of a string. Like the original, the parsing is approximate:
// the whole input is searched, not only the XML declaration.
func getEncoding(b []byte) string {
	const attr = "encoding="

	start := bytes.Index(b, []byte(attr))
	if start < 0 {
		return ""
	}

	rest := b[start+len(attr):]
	if len(rest) == 0 || (rest[0] != '"' && rest[0] != '\'') {
		return ""
	}

	// The value runs up to the next occurrence of the opening quote character.
	quote, value := rune(rest[0]), rest[1:]
	end := bytes.IndexRune(value, quote)
	if end < 0 {
		return ""
	}
	return string(value[:end])
}
Replace copyright header with SPDX identifier 2023-06-19 14:42:47 -07:00			`// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.`
			`// SPDX-License-Identifier: Apache-2.0`
Improve XML decoder to remove illegal characters 2019-10-23 11:27:27 +08:00
Move internal packages to an internal folder For reference: https://go.dev/doc/go1.4#internalpackages 2023-08-10 19:46:45 -07:00			`package xml // import "miniflux.app/v2/internal/reader/xml"`
Improve XML decoder to remove illegal characters 2019-10-23 11:27:27 +08:00
			`import (`
			`"bytes"`
			`"encoding/xml"`
			`"fmt"`
			`"io"`
Filter valid XML characters for UTF-8 XML documents before decoding This change should reduce "illegal character code" XML errors. 2019-12-20 10:31:52 +08:00			`"strings"`
perf(xml): optimize xml filtering Instead of using bytes.Map which is returning a copy of the provided []byte, use a custom in-place implementation, as the bytes.Map call is taking around 25% of rss.Parse 2025-06-09 15:26:11 +02:00			`"unicode/utf8"`
Improve XML decoder to remove illegal characters 2019-10-23 11:27:27 +08:00
Move internal packages to an internal folder For reference: https://go.dev/doc/go1.4#internalpackages 2023-08-10 19:46:45 -07:00			`"miniflux.app/v2/internal/reader/encoding"`
Improve XML decoder to remove illegal characters 2019-10-23 11:27:27 +08:00			`)`

Regression: ensure all HTML documents are encoded in UTF-8 Fixes #2196 2023-12-01 16:27:18 -08:00			`// NewXMLDecoder returns a XML decoder that filters illegal characters.`
Use an io.ReadSeeker instead of an io.Reader to parse feeds This will allow to make use of func (*Reader) Seek, instead of re-recreating a new reader. It's a large commit for a small change, but anything to simply the reader/buffer/ReadAll/… mess is a step in the right direction I think, and it should enable more follow-up simplifications. 2024-03-06 14:57:21 +01:00			`func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {`
Filter valid XML characters for UTF-8 XML documents before decoding This change should reduce "illegal character code" XML errors. 2019-12-20 10:31:52 +08:00			`var decoder *xml.Decoder`
perf(xml): optimized NewXMLDecoder io.ReadAll is growing the underlying buffer progressively, while io.Copy is able to allocate it in one go, which is significantly faster. io.ReadAll is currently accounting for around 10% of the CPU time of rss.Parse 2025-06-09 15:15:40 +02:00
			`// This is way faster than io.ReadAll(data) as the buffer can be allocated in one go instead of dynamically grown.`
			`buffer := &bytes.Buffer{}`
			`io.Copy(buffer, data)`

			`enc := getEncoding(buffer.Bytes())`
refactor(xml): improve the performances of `NewXMLDecoder` - Invert a condition to make the code more readable - Extract the encoding directly from the slice of bytes instead of converting it to string first. 2025-01-31 03:37:06 +00:00			`if enc == "" \|\| strings.EqualFold(enc, "utf-8") {`
Filter valid XML characters for UTF-8 XML documents before decoding This change should reduce "illegal character code" XML errors. 2019-12-20 10:31:52 +08:00			`// filter invalid chars now, since decoder.CharsetReader not called for utf-8 content`
perf(xml): optimize xml filtering Instead of using bytes.Map which is returning a copy of the provided []byte, use a custom in-place implementation, as the bytes.Map call is taking around 25% of rss.Parse 2025-06-09 15:26:11 +02:00			`filteredBytes := filterValidXMLChars(buffer.Bytes())`
Filter valid XML characters for UTF-8 XML documents before decoding This change should reduce "illegal character code" XML errors. 2019-12-20 10:31:52 +08:00			`decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))`
refactor(xml): improve the performances of `NewXMLDecoder` - Invert a condition to make the code more readable - Extract the encoding directly from the slice of bytes instead of converting it to string first. 2025-01-31 03:37:06 +00:00			`} else {`
			`// filter invalid chars later within decoder.CharsetReader`
			`data.Seek(0, io.SeekStart)`
			`decoder = xml.NewDecoder(data)`
Filter valid XML characters for UTF-8 XML documents before decoding This change should reduce "illegal character code" XML errors. 2019-12-20 10:31:52 +08:00			`}`

Improve XML decoder to remove illegal characters 2019-10-23 11:27:27 +08:00			`decoder.Entity = xml.HTMLEntity`
			`decoder.Strict = false`
			`decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {`
			`utf8Reader, err := encoding.CharsetReader(charset, input)`
			`if err != nil {`
			`return nil, err`
			`}`
Remove deprecated io/ioutil package Miniflux now requires at least Go 1.16 and io/util is deprecated. https://golang.org/doc/go1.16#ioutil 2021-02-16 21:19:03 -08:00			`rawData, err := io.ReadAll(utf8Reader)`
Improve XML decoder to remove illegal characters 2019-10-23 11:27:27 +08:00			`if err != nil {`
Regression: ensure all HTML documents are encoded in UTF-8 Fixes #2196 2023-12-01 16:27:18 -08:00			`return nil, fmt.Errorf("encoding: unable to read data: %w", err)`
Improve XML decoder to remove illegal characters 2019-10-23 11:27:27 +08:00			`}`
perf(xml): optimize xml filtering Instead of using bytes.Map which is returning a copy of the provided []byte, use a custom in-place implementation, as the bytes.Map call is taking around 25% of rss.Parse 2025-06-09 15:26:11 +02:00			`filteredBytes := filterValidXMLChars(rawData)`
Improve XML decoder to remove illegal characters 2019-10-23 11:27:27 +08:00			`return bytes.NewReader(filteredBytes), nil`
			`}`

			`return decoder`
			`}`

perf(xml): optimize xml filtering Instead of using bytes.Map which is returning a copy of the provided []byte, use a custom in-place implementation, as the bytes.Map call is taking around 25% of rss.Parse 2025-06-09 15:26:11 +02:00			`// filterValidXMLChars filters inplace invalid XML characters.`
			`// This function is inspired from bytes.Map`
			`func filterValidXMLChars(s []byte) []byte {`
			`j := 0`
			`for i := 0; i < len(s); {`
			`wid := 1`
			`r := rune(s[i])`
			`if r >= utf8.RuneSelf {`
			`r, wid = utf8.DecodeRune(s[i:])`
			`}`
fix(reader): fix a crash introduced by d59990f1 And add a fuzzer and a testcase as well to validate that nothing breaks. 2025-06-10 22:35:55 +02:00			`if r != utf8.RuneError {`
			`if r = filterValidXMLChar(r); r >= 0 {`
			`utf8.EncodeRune(s[j:], r)`
			`j += wid`
			`}`
perf(xml): optimize xml filtering Instead of using bytes.Map which is returning a copy of the provided []byte, use a custom in-place implementation, as the bytes.Map call is taking around 25% of rss.Parse 2025-06-09 15:26:11 +02:00			`}`
			`i += wid`
			`}`
			`return s[:j]`
			`}`

Improve XML decoder to remove illegal characters 2019-10-23 11:27:27 +08:00			`// This function is copied from encoding/xml package,`
			`// and is used to check if all the characters are legal.`
			`func filterValidXMLChar(r rune) rune {`
			`if r == 0x09 \|\|`
			`r == 0x0A \|\|`
			`r == 0x0D \|\|`
			`r >= 0x20 && r <= 0xD7FF \|\|`
			`r >= 0xE000 && r <= 0xFFFD \|\|`
			`r >= 0x10000 && r <= 0x10FFFF {`
			`return r`
			`}`
			`return -1`
			`}`
Filter valid XML characters for UTF-8 XML documents before decoding This change should reduce "illegal character code" XML errors. 2019-12-20 10:31:52 +08:00
refactor(xml): improve the performances of `NewXMLDecoder` - Invert a condition to make the code more readable - Extract the encoding directly from the slice of bytes instead of converting it to string first. 2025-01-31 03:37:06 +00:00			`// This function is copied from encoding/xml's procInst and adapted for []bytes instead of string`
			`func getEncoding(b []byte) string {`
Filter valid XML characters for UTF-8 XML documents before decoding This change should reduce "illegal character code" XML errors. 2019-12-20 10:31:52 +08:00			`// TODO: this parsing is somewhat lame and not exact.`
			`// It works for all actual cases, though.`
refactor(xml): improve the performances of `NewXMLDecoder` - Invert a condition to make the code more readable - Extract the encoding directly from the slice of bytes instead of converting it to string first. 2025-01-31 03:37:06 +00:00			`idx := bytes.Index(b, []byte("encoding="))`
Filter valid XML characters for UTF-8 XML documents before decoding This change should reduce "illegal character code" XML errors. 2019-12-20 10:31:52 +08:00			`if idx == -1 {`
			`return ""`
			`}`
refactor(xml): improve the performances of `NewXMLDecoder` - Invert a condition to make the code more readable - Extract the encoding directly from the slice of bytes instead of converting it to string first. 2025-01-31 03:37:06 +00:00			`v := b[idx+len("encoding="):]`
			`if len(v) == 0 {`
Filter valid XML characters for UTF-8 XML documents before decoding This change should reduce "illegal character code" XML errors. 2019-12-20 10:31:52 +08:00			`return ""`
			`}`
			`if v[0] != '\'' && v[0] != '"' {`
			`return ""`
			`}`
refactor(xml): improve the performances of `NewXMLDecoder` - Invert a condition to make the code more readable - Extract the encoding directly from the slice of bytes instead of converting it to string first. 2025-01-31 03:37:06 +00:00			`idx = bytes.IndexRune(v[1:], rune(v[0]))`
Filter valid XML characters for UTF-8 XML documents before decoding This change should reduce "illegal character code" XML errors. 2019-12-20 10:31:52 +08:00			`if idx == -1 {`
			`return ""`
			`}`
refactor(xml): improve the performances of `NewXMLDecoder` - Invert a condition to make the code more readable - Extract the encoding directly from the slice of bytes instead of converting it to string first. 2025-01-31 03:37:06 +00:00			`return string(v[1 : idx+1])`
Filter valid XML characters for UTF-8 XML documents before decoding This change should reduce "illegal character code" XML errors. 2019-12-20 10:31:52 +08:00			`}`