1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00

fix(scraper): avoid encoding issue if charset meta tag is after 1024 bytes

This commit is contained in:
Frédéric Guillot 2025-02-15 16:58:06 -08:00
parent af1f966250
commit 6eedf4111f
12 changed files with 352 additions and 10 deletions

View file

@ -4,6 +4,7 @@
package encoding // import "miniflux.app/v2/internal/reader/encoding"
import (
"bytes"
"io"
"os"
"testing"
@ -31,6 +32,11 @@ func TestCharsetReaderWithUTF8(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
expectedUnicodeString := "Café"
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
}
}
func TestCharsetReaderWithISO88591(t *testing.T) {
@ -54,6 +60,11 @@ func TestCharsetReaderWithISO88591(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
expectedUnicodeString := "Café"
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
}
}
func TestCharsetReaderWithWindows1252(t *testing.T) {
@ -77,6 +88,11 @@ func TestCharsetReaderWithWindows1252(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
expectedUnicodeString := "Euro €"
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
}
}
func TestCharsetReaderWithInvalidProlog(t *testing.T) {
@ -100,6 +116,11 @@ func TestCharsetReaderWithInvalidProlog(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
expectedUnicodeString := "Café"
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
}
}
func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
@ -123,6 +144,11 @@ func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
expectedUnicodeString := "Café"
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
}
}
func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
@ -146,4 +172,177 @@ func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
if !utf8.Valid(data) {
t.Fatalf("Data is not valid UTF-8")
}
expectedUnicodeString := "Euro €"
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
}
}
// TestNewReaderWithUTF8Document feeds testdata/utf8.html to NewCharsetReader
// with an explicit "charset=UTF-8" content type and checks that the decoded
// output is valid UTF-8 containing the string "Café".
func TestNewReaderWithUTF8Document(t *testing.T) {
	file := "testdata/utf8.html"

	f, err := os.Open(file)
	if err != nil {
		t.Fatalf("Unable to open file: %v", err)
	}
	// Release the file descriptor once the test is done; the original leaked it.
	defer f.Close()

	reader, err := NewCharsetReader(f, "text/html; charset=UTF-8")
	if err != nil {
		t.Fatalf("Unable to create reader: %v", err)
	}

	data, err := io.ReadAll(reader)
	if err != nil {
		t.Fatalf("Unable to read data: %v", err)
	}

	if !utf8.Valid(data) {
		t.Fatalf("Data is not valid UTF-8")
	}

	expectedUnicodeString := "Café"
	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
	}
}
// TestNewReaderWithUTF8DocumentAndNoContentEncoding feeds testdata/utf8.html
// to NewCharsetReader with a "text/html" content type that carries no charset
// parameter, and checks that the decoded output is valid UTF-8 containing the
// string "Café".
func TestNewReaderWithUTF8DocumentAndNoContentEncoding(t *testing.T) {
	file := "testdata/utf8.html"

	f, err := os.Open(file)
	if err != nil {
		t.Fatalf("Unable to open file: %v", err)
	}
	// Release the file descriptor once the test is done; the original leaked it.
	defer f.Close()

	reader, err := NewCharsetReader(f, "text/html")
	if err != nil {
		t.Fatalf("Unable to create reader: %v", err)
	}

	data, err := io.ReadAll(reader)
	if err != nil {
		t.Fatalf("Unable to read data: %v", err)
	}

	if !utf8.Valid(data) {
		t.Fatalf("Data is not valid UTF-8")
	}

	expectedUnicodeString := "Café"
	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
	}
}
// TestNewReaderWithISO88591Document feeds testdata/iso-8859-1.xml to
// NewCharsetReader with an explicit "charset=ISO-8859-1" content type and
// checks that the decoded output is valid UTF-8 containing the string "Café".
func TestNewReaderWithISO88591Document(t *testing.T) {
	file := "testdata/iso-8859-1.xml"

	f, err := os.Open(file)
	if err != nil {
		t.Fatalf("Unable to open file: %v", err)
	}
	// Release the file descriptor once the test is done; the original leaked it.
	defer f.Close()

	reader, err := NewCharsetReader(f, "text/html; charset=ISO-8859-1")
	if err != nil {
		t.Fatalf("Unable to create reader: %v", err)
	}

	data, err := io.ReadAll(reader)
	if err != nil {
		t.Fatalf("Unable to read data: %v", err)
	}

	if !utf8.Valid(data) {
		t.Fatalf("Data is not valid UTF-8")
	}

	expectedUnicodeString := "Café"
	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
	}
}
// TestNewReaderWithISO88591DocumentAndNoContentType feeds
// testdata/iso-8859-1.xml to NewCharsetReader with an empty content type and
// checks that the decoded output is valid UTF-8 containing the string "Café".
func TestNewReaderWithISO88591DocumentAndNoContentType(t *testing.T) {
	file := "testdata/iso-8859-1.xml"

	f, err := os.Open(file)
	if err != nil {
		t.Fatalf("Unable to open file: %v", err)
	}
	// Release the file descriptor once the test is done; the original leaked it.
	defer f.Close()

	reader, err := NewCharsetReader(f, "")
	if err != nil {
		t.Fatalf("Unable to create reader: %v", err)
	}

	data, err := io.ReadAll(reader)
	if err != nil {
		t.Fatalf("Unable to read data: %v", err)
	}

	if !utf8.Valid(data) {
		t.Fatalf("Data is not valid UTF-8")
	}

	expectedUnicodeString := "Café"
	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
	}
}
// TestNewReaderWithISO88591DocumentWithMetaAfter1024Bytes feeds
// testdata/iso-8859-1-meta-after-1024.html to NewCharsetReader with a
// "text/html" content type that carries no charset parameter, and checks that
// the decoded output is valid UTF-8 containing the string "Café".
//
// NOTE(review): judging by its name, the fixture declares its charset in a
// meta tag located past the first 1024 bytes — confirm against the fixture.
func TestNewReaderWithISO88591DocumentWithMetaAfter1024Bytes(t *testing.T) {
	file := "testdata/iso-8859-1-meta-after-1024.html"

	f, err := os.Open(file)
	if err != nil {
		t.Fatalf("Unable to open file: %v", err)
	}
	// Release the file descriptor once the test is done; the original leaked it.
	defer f.Close()

	reader, err := NewCharsetReader(f, "text/html")
	if err != nil {
		t.Fatalf("Unable to create reader: %v", err)
	}

	data, err := io.ReadAll(reader)
	if err != nil {
		t.Fatalf("Unable to read data: %v", err)
	}

	if !utf8.Valid(data) {
		t.Fatalf("Data is not valid UTF-8")
	}

	expectedUnicodeString := "Café"
	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
	}
}
// TestNewReaderWithUTF8DocumentWithMetaAfter1024Bytes feeds
// testdata/utf8-meta-after-1024.html to NewCharsetReader with a "text/html"
// content type that carries no charset parameter, and checks that the decoded
// output is valid UTF-8 containing the string "Café".
//
// NOTE(review): judging by its name, the fixture declares its charset in a
// meta tag located past the first 1024 bytes — confirm against the fixture.
func TestNewReaderWithUTF8DocumentWithMetaAfter1024Bytes(t *testing.T) {
	file := "testdata/utf8-meta-after-1024.html"

	f, err := os.Open(file)
	if err != nil {
		t.Fatalf("Unable to open file: %v", err)
	}
	// Release the file descriptor once the test is done; the original leaked it.
	defer f.Close()

	reader, err := NewCharsetReader(f, "text/html")
	if err != nil {
		t.Fatalf("Unable to create reader: %v", err)
	}

	data, err := io.ReadAll(reader)
	if err != nil {
		t.Fatalf("Unable to read data: %v", err)
	}

	if !utf8.Valid(data) {
		t.Fatalf("Data is not valid UTF-8")
	}

	expectedUnicodeString := "Café"
	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
	}
}