mirror of
https://github.com/miniflux/v2.git
synced 2025-06-27 16:36:00 +00:00
109 lines
2.7 KiB
Go
109 lines
2.7 KiB
Go
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
|
|
// SPDX-License-Identifier: Apache-2.0
|
|
|
|
package xml // import "miniflux.app/v2/internal/reader/xml"
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/xml"
|
|
"fmt"
|
|
"io"
|
|
"strings"
|
|
"unicode/utf8"
|
|
|
|
"miniflux.app/v2/internal/reader/encoding"
|
|
)
|
|
|
|
// NewXMLDecoder returns a XML decoder that filters illegal characters.
|
|
func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
|
|
var decoder *xml.Decoder
|
|
|
|
// This is way fasted than io.ReadAll(data) as the buffer can be allocated in one go instead of dynamically grown.
|
|
buffer := &bytes.Buffer{}
|
|
io.Copy(buffer, data)
|
|
|
|
enc := getEncoding(buffer.Bytes())
|
|
if enc == "" || strings.EqualFold(enc, "utf-8") {
|
|
// filter invalid chars now, since decoder.CharsetReader not called for utf-8 content
|
|
filteredBytes := filterValidXMLChars(buffer.Bytes())
|
|
decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
|
|
} else {
|
|
// filter invalid chars later within decoder.CharsetReader
|
|
data.Seek(0, io.SeekStart)
|
|
decoder = xml.NewDecoder(data)
|
|
}
|
|
|
|
decoder.Entity = xml.HTMLEntity
|
|
decoder.Strict = false
|
|
decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
|
|
utf8Reader, err := encoding.CharsetReader(charset, input)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
rawData, err := io.ReadAll(utf8Reader)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("encoding: unable to read data: %w", err)
|
|
}
|
|
filteredBytes := filterValidXMLChars(rawData)
|
|
return bytes.NewReader(filteredBytes), nil
|
|
}
|
|
|
|
return decoder
|
|
}
|
|
|
|
// filterValidXMLChars filters inplace invalid XML characters.
|
|
// This function is inspired from bytes.Map
|
|
func filterValidXMLChars(s []byte) []byte {
|
|
j := 0
|
|
for i := 0; i < len(s); {
|
|
wid := 1
|
|
r := rune(s[i])
|
|
if r >= utf8.RuneSelf {
|
|
r, wid = utf8.DecodeRune(s[i:])
|
|
}
|
|
if r != utf8.RuneError {
|
|
if r = filterValidXMLChar(r); r >= 0 {
|
|
utf8.EncodeRune(s[j:], r)
|
|
j += wid
|
|
}
|
|
}
|
|
i += wid
|
|
}
|
|
return s[:j]
|
|
}
|
|
|
|
// This function is copied from encoding/xml package,
|
|
// and is used to check if all the characters are legal.
|
|
func filterValidXMLChar(r rune) rune {
|
|
if r == 0x09 ||
|
|
r == 0x0A ||
|
|
r == 0x0D ||
|
|
r >= 0x20 && r <= 0xD7FF ||
|
|
r >= 0xE000 && r <= 0xFFFD ||
|
|
r >= 0x10000 && r <= 0x10FFFF {
|
|
return r
|
|
}
|
|
return -1
|
|
}
|
|
|
|
// This function is copied from encoding/xml's procInst and adapted for []bytes instead of string
|
|
func getEncoding(b []byte) string {
|
|
// TODO: this parsing is somewhat lame and not exact.
|
|
// It works for all actual cases, though.
|
|
idx := bytes.Index(b, []byte("encoding="))
|
|
if idx == -1 {
|
|
return ""
|
|
}
|
|
v := b[idx+len("encoding="):]
|
|
if len(v) == 0 {
|
|
return ""
|
|
}
|
|
if v[0] != '\'' && v[0] != '"' {
|
|
return ""
|
|
}
|
|
idx = bytes.IndexRune(v[1:], rune(v[0]))
|
|
if idx == -1 {
|
|
return ""
|
|
}
|
|
return string(v[1 : idx+1])
|
|
}
|