diff --git a/internal/reader/xml/decoder.go b/internal/reader/xml/decoder.go
index b0f6b1f9..e06a56f6 100644
--- a/internal/reader/xml/decoder.go
+++ b/internal/reader/xml/decoder.go
@@ -61,9 +61,13 @@ func filterValidXMLChars(s []byte) []byte {
 		if r >= utf8.RuneSelf {
 			r, wid = utf8.DecodeRune(s[i:])
 		}
-		if r = filterValidXMLChar(r); r >= 0 {
-			utf8.EncodeRune(s[j:], r)
-			j += wid
+		// DecodeRune reports an invalid sequence as (RuneError, 1); a
+		// well-formed U+FFFD decodes with a width of 3 and must be kept.
+		if r != utf8.RuneError || wid > 1 {
+			if r = filterValidXMLChar(r); r >= 0 {
+				utf8.EncodeRune(s[j:], r)
+				j += wid
+			}
 		}
 		i += wid
 	}
diff --git a/internal/reader/xml/decoder_test.go b/internal/reader/xml/decoder_test.go
index 35bf9e64..879363a7 100644
--- a/internal/reader/xml/decoder_test.go
+++ b/internal/reader/xml/decoder_test.go
@@ -8,6 +8,7 @@ import (
 	"fmt"
 	"strings"
 	"testing"
+	"unicode/utf8"
 )
 
 func TestXMLDocumentWithIllegalUnicodeCharacters(t *testing.T) {
@@ -81,3 +82,44 @@ func TestXMLDocumentWithIncorrectEncodingField(t *testing.T) {
 		t.Errorf("Incorrect entry title, expected: %s, got: %s", expected, x.Title)
 	}
 }
+
+func TestFilterValidXMLCharsWithInvalidUTF8Sequence(t *testing.T) {
+	// 0xC0 0xAF is not a well-formed UTF-8 sequence (overlong encoding of '/').
+	input := []byte{0x41, 0xC0, 0xAF, 0x42} // 'A', invalid UTF-8, 'B'
+
+	filtered := filterValidXMLChars(input)
+
+	// The invalid bytes must be dropped rather than replaced with U+FFFD.
+	if !utf8.Valid(filtered) {
+		t.Errorf("Filtered output is not valid UTF-8: %q", filtered)
+	}
+	if expected := "AB"; string(filtered) != expected {
+		t.Errorf("Invalid UTF-8 was not properly filtered, expected: %q, got: %q", expected, filtered)
+	}
+}
+
+func TestFilterValidXMLCharsKeepsReplacementCharacter(t *testing.T) {
+	// A well-formed U+FFFD is ordinary text and must not be mistaken
+	// for the decoding error that utf8.DecodeRune reports with width 1.
+	input := []byte("<\uFFFD>")
+	expected := string(input)
+
+	filtered := filterValidXMLChars(input)
+
+	if string(filtered) != expected {
+		t.Errorf("Valid U+FFFD was altered, expected: %q, got: %q", expected, filtered)
+	}
+}
+
+func FuzzFilterValidXMLChars(f *testing.F) {
+	f.Add([]byte{0x41, 0xC0, 0xAF, 0x42})
+	f.Add([]byte("<\uFFFD>"))
+	f.Fuzz(func(t *testing.T, s []byte) {
+		// filterValidXMLChars writes into its argument, and a fuzz target
+		// must not modify the data handed to it by the fuzzing engine.
+		input := append([]byte(nil), s...)
+		if filtered := filterValidXMLChars(input); !utf8.Valid(filtered) {
+			t.Errorf("Filtered output is not valid UTF-8: %q", filtered)
+		}
+	})
+}