diff --git a/internal/reader/encoding/encoding.go b/internal/reader/encoding/encoding.go
index 3987a885..ea33bd1f 100644
--- a/internal/reader/encoding/encoding.go
+++ b/internal/reader/encoding/encoding.go
@@ -5,6 +5,7 @@ package encoding // import "miniflux.app/v2/internal/reader/encoding"
 
 import (
 	"bytes"
+	"fmt"
 	"io"
 	"unicode/utf8"
 
@@ -23,7 +24,11 @@ import (
 // - Feeds with encoding specified only in XML document and not in HTTP header
 // - Feeds with wrong encoding defined and already in UTF-8
 func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
-	buffer, _ := io.ReadAll(input)
+	buffer, err := io.ReadAll(input)
+	if err != nil {
+		return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
+	}
+
 	r := bytes.NewReader(buffer)
 
 	// The document is already UTF-8, do not do anything (avoid double-encoding).
@@ -35,3 +40,24 @@ func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
 	// Transform document to UTF-8 from the specified encoding in XML prolog.
 	return charset.NewReaderLabel(charsetLabel, r)
 }
+
+// NewCharsetReader returns an io.Reader that converts the content of r to UTF-8.
+func NewCharsetReader(r io.Reader, contentType string) (io.Reader, error) {
+	buffer, err := io.ReadAll(r)
+	if err != nil {
+		return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
+	}
+
+	internalReader := bytes.NewReader(buffer)
+
+	// The document is already UTF-8, do not do anything.
+	if utf8.Valid(buffer) {
+		return internalReader, nil
+	}
+
+	// Transform document to UTF-8 from the specified encoding in Content-Type header.
+	// Note that only the first 1024 bytes are used to detect the encoding.
+	// If the charset meta tag is not found in the first 1024 bytes, charset.DetermineEncoding falls back to "windows-1252", which results in encoding issues.
+	// See https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+	return charset.NewReader(internalReader, contentType)
+}
diff --git a/internal/reader/encoding/encoding_test.go b/internal/reader/encoding/encoding_test.go
index 10d326ba..708a5e71 100644
--- a/internal/reader/encoding/encoding_test.go
+++ b/internal/reader/encoding/encoding_test.go
@@ -4,6 +4,7 @@
 package encoding // import "miniflux.app/v2/internal/reader/encoding"
 
 import (
+	"bytes"
 	"io"
 	"os"
 	"testing"
@@ -31,6 +32,11 @@ func TestCharsetReaderWithUTF8(t *testing.T) {
 	if !utf8.Valid(data) {
 		t.Fatalf("Data is not valid UTF-8")
 	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
 }
 
 func TestCharsetReaderWithISO88591(t *testing.T) {
@@ -54,6 +60,11 @@ func TestCharsetReaderWithISO88591(t *testing.T) {
 	if !utf8.Valid(data) {
 		t.Fatalf("Data is not valid UTF-8")
 	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
 }
 
 func TestCharsetReaderWithWindows1252(t *testing.T) {
@@ -77,6 +88,11 @@ func TestCharsetReaderWithWindows1252(t *testing.T) {
 	if !utf8.Valid(data) {
 		t.Fatalf("Data is not valid UTF-8")
 	}
+
+	expectedUnicodeString := "Euro €"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
 }
 
 func TestCharsetReaderWithInvalidProlog(t *testing.T) {
@@ -100,6 +116,11 @@ func TestCharsetReaderWithInvalidProlog(t *testing.T) {
 	if !utf8.Valid(data) {
 		t.Fatalf("Data is not valid UTF-8")
 	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
 }
 
 func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
@@ -123,6 +144,11 @@ func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
 	if !utf8.Valid(data) {
 		t.Fatalf("Data is not valid UTF-8")
 	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
 }
 
 func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
@@ -146,4 +172,177 @@ func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
 	if !utf8.Valid(data) {
 		t.Fatalf("Data is not valid UTF-8")
 	}
+
+	expectedUnicodeString := "Euro €"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
+}
+
+func TestNewReaderWithUTF8Document(t *testing.T) {
+	file := "testdata/utf8.html"
+
+	f, err := os.Open(file)
+	if err != nil {
+		t.Fatalf("Unable to open file: %v", err)
+	}
+
+	reader, err := NewCharsetReader(f, "text/html; charset=UTF-8")
+	if err != nil {
+		t.Fatalf("Unable to create reader: %v", err)
+	}
+
+	data, err := io.ReadAll(reader)
+	if err != nil {
+		t.Fatalf("Unable to read data: %v", err)
+	}
+
+	if !utf8.Valid(data) {
+		t.Fatalf("Data is not valid UTF-8")
+	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
+}
+
+func TestNewReaderWithUTF8DocumentAndNoContentEncoding(t *testing.T) {
+	file := "testdata/utf8.html"
+
+	f, err := os.Open(file)
+	if err != nil {
+		t.Fatalf("Unable to open file: %v", err)
+	}
+
+	reader, err := NewCharsetReader(f, "text/html")
+	if err != nil {
+		t.Fatalf("Unable to create reader: %v", err)
+	}
+
+	data, err := io.ReadAll(reader)
+	if err != nil {
+		t.Fatalf("Unable to read data: %v", err)
+	}
+
+	if !utf8.Valid(data) {
+		t.Fatalf("Data is not valid UTF-8")
+	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
+}
+
+func TestNewReaderWithISO88591Document(t *testing.T) {
+	file := "testdata/iso-8859-1.xml"
+
+	f, err := os.Open(file)
+	if err != nil {
+		t.Fatalf("Unable to open file: %v", err)
+	}
+
+	reader, err := NewCharsetReader(f, "text/html; charset=ISO-8859-1")
+	if err != nil {
+		t.Fatalf("Unable to create reader: %v", err)
+	}
+
+	data, err := io.ReadAll(reader)
+	if err != nil {
+		t.Fatalf("Unable to read data: %v", err)
+	}
+
+	if !utf8.Valid(data) {
+		t.Fatalf("Data is not valid UTF-8")
+	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
+}
+
+func TestNewReaderWithISO88591DocumentAndNoContentType(t *testing.T) {
+	file := "testdata/iso-8859-1.xml"
+
+	f, err := os.Open(file)
+	if err != nil {
+		t.Fatalf("Unable to open file: %v", err)
+	}
+
+	reader, err := NewCharsetReader(f, "")
+	if err != nil {
+		t.Fatalf("Unable to create reader: %v", err)
+	}
+
+	data, err := io.ReadAll(reader)
+	if err != nil {
+		t.Fatalf("Unable to read data: %v", err)
+	}
+
+	if !utf8.Valid(data) {
+		t.Fatalf("Data is not valid UTF-8")
+	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
+}
+
+func TestNewReaderWithISO88591DocumentWithMetaAfter1024Bytes(t *testing.T) {
+	file := "testdata/iso-8859-1-meta-after-1024.html"
+
+	f, err := os.Open(file)
+	if err != nil {
+		t.Fatalf("Unable to open file: %v", err)
+	}
+
+	reader, err := NewCharsetReader(f, "text/html")
+	if err != nil {
+		t.Fatalf("Unable to create reader: %v", err)
+	}
+
+	data, err := io.ReadAll(reader)
+	if err != nil {
+		t.Fatalf("Unable to read data: %v", err)
+	}
+
+	if !utf8.Valid(data) {
+		t.Fatalf("Data is not valid UTF-8")
+	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
+}
+
+func TestNewReaderWithUTF8DocumentWithMetaAfter1024Bytes(t *testing.T) {
+	file := "testdata/utf8-meta-after-1024.html"
+
+	f, err := os.Open(file)
+	if err != nil {
+		t.Fatalf("Unable to open file: %v", err)
+	}
+
+	reader, err := NewCharsetReader(f, "text/html")
+	if err != nil {
+		t.Fatalf("Unable to create reader: %v", err)
+	}
+
+	data, err := io.ReadAll(reader)
+	if err != nil {
+		t.Fatalf("Unable to read data: %v", err)
+	}
+
+	if !utf8.Valid(data) {
+		t.Fatalf("Data is not valid UTF-8")
+	}
+
+	expectedUnicodeString := "Café"
+	if !bytes.Contains(data, []byte(expectedUnicodeString)) {
+		t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
+	}
 }
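For context, the new encoding.NewCharsetReader helper takes a raw document reader plus the HTTP Content-Type header and returns a UTF-8 reader, which is how the call sites further down use it before handing the document to goquery. A minimal usage sketch follows (the URL, the panics, and the surrounding program are illustrative only, not part of this patch):

    package main

    import (
        "fmt"
        "io"
        "net/http"

        "miniflux.app/v2/internal/reader/encoding"
    )

    func main() {
        // Fetch an arbitrary page; example.org is only a placeholder.
        resp, err := http.Get("https://example.org/")
        if err != nil {
            panic(err)
        }
        defer resp.Body.Close()

        // Convert the body to UTF-8. Documents that are already valid UTF-8 are
        // passed through unchanged; otherwise the Content-Type header (or a
        // charset declaration in the first 1024 bytes) drives the conversion.
        utf8Reader, err := encoding.NewCharsetReader(resp.Body, resp.Header.Get("Content-Type"))
        if err != nil {
            panic(err)
        }

        data, err := io.ReadAll(utf8Reader)
        if err != nil {
            panic(err)
        }

        fmt.Printf("read %d bytes of UTF-8 text\n", len(data))
    }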
diff --git a/internal/reader/encoding/testdata/invalid-prolog.xml b/internal/reader/encoding/testdata/invalid-prolog.xml
index 7c8b1283..70fea0d4 100644
--- a/internal/reader/encoding/testdata/invalid-prolog.xml
+++ b/internal/reader/encoding/testdata/invalid-prolog.xml
@@ -2,6 +2,6 @@
 테스트 피드
-こんにちは世界
+Café
\ No newline at end of file
diff --git a/internal/reader/encoding/testdata/iso-8859-1-meta-after-1024.html b/internal/reader/encoding/testdata/iso-8859-1-meta-after-1024.html
new file mode 100644
index 00000000..40f90237
--- /dev/null
+++ b/internal/reader/encoding/testdata/iso-8859-1-meta-after-1024.html
@@ -0,0 +1,48 @@
+Frédéric
+Café
\ No newline at end of file
diff --git a/internal/reader/encoding/testdata/iso-8859-1.html b/internal/reader/encoding/testdata/iso-8859-1.html
new file mode 100644
index 00000000..a7275a48
--- /dev/null
+++ b/internal/reader/encoding/testdata/iso-8859-1.html
@@ -0,0 +1,10 @@
+Frédéric
+Café
\ No newline at end of file
diff --git a/internal/reader/encoding/testdata/utf8-incorrect-prolog.xml b/internal/reader/encoding/testdata/utf8-incorrect-prolog.xml
index 96590162..aa955203 100644
--- a/internal/reader/encoding/testdata/utf8-incorrect-prolog.xml
+++ b/internal/reader/encoding/testdata/utf8-incorrect-prolog.xml
@@ -2,6 +2,6 @@
 테스트 피드
-こんにちは世界
+Café
\ No newline at end of file
diff --git a/internal/reader/encoding/testdata/utf8-meta-after-1024.html b/internal/reader/encoding/testdata/utf8-meta-after-1024.html
new file mode 100644
index 00000000..e49df916
--- /dev/null
+++ b/internal/reader/encoding/testdata/utf8-meta-after-1024.html
@@ -0,0 +1,48 @@
+Frédéric
+Café
\ No newline at end of file
diff --git a/internal/reader/encoding/testdata/utf8.html b/internal/reader/encoding/testdata/utf8.html
new file mode 100644
index 00000000..bb7ba7e2
--- /dev/null
+++ b/internal/reader/encoding/testdata/utf8.html
@@ -0,0 +1,10 @@
+Café
+Café
\ No newline at end of file
diff --git a/internal/reader/encoding/testdata/utf8.xml b/internal/reader/encoding/testdata/utf8.xml
index 11f10a5a..96bf7329 100644
--- a/internal/reader/encoding/testdata/utf8.xml
+++ b/internal/reader/encoding/testdata/utf8.xml
@@ -2,6 +2,6 @@
 테스트 피드
-こんにちは世界
+Café
\ No newline at end of file
diff --git a/internal/reader/icon/finder.go b/internal/reader/icon/finder.go
index ab89a5c7..e238f5f5 100644
--- a/internal/reader/icon/finder.go
+++ b/internal/reader/icon/finder.go
@@ -21,12 +21,12 @@ import (
 	"miniflux.app/v2/internal/config"
 	"miniflux.app/v2/internal/crypto"
 	"miniflux.app/v2/internal/model"
+	"miniflux.app/v2/internal/reader/encoding"
 	"miniflux.app/v2/internal/reader/fetcher"
 	"miniflux.app/v2/internal/urllib"
 
 	"github.com/PuerkitoBio/goquery"
 	"golang.org/x/image/draw"
-	"golang.org/x/net/html/charset"
 )
 
 type IconFinder struct {
@@ -248,7 +248,7 @@ func findIconURLsFromHTMLDocument(body io.Reader, contentType string) ([]string,
 		"link[rel='apple-touch-icon-precomposed.png']",
 	}
 
-	htmlDocumentReader, err := charset.NewReader(body, contentType)
+	htmlDocumentReader, err := encoding.NewCharsetReader(body, contentType)
 	if err != nil {
 		return nil, fmt.Errorf("icon: unable to create charset reader: %w", err)
 	}
diff --git a/internal/reader/scraper/scraper.go b/internal/reader/scraper/scraper.go
index de8e3afc..cf17e0c5 100644
--- a/internal/reader/scraper/scraper.go
+++ b/internal/reader/scraper/scraper.go
@@ -10,12 +10,12 @@ import (
 	"strings"
 
 	"miniflux.app/v2/internal/config"
+	"miniflux.app/v2/internal/reader/encoding"
 	"miniflux.app/v2/internal/reader/fetcher"
 	"miniflux.app/v2/internal/reader/readability"
 	"miniflux.app/v2/internal/urllib"
 
 	"github.com/PuerkitoBio/goquery"
-	"golang.org/x/net/html/charset"
 )
 
 func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
@@ -39,10 +39,11 @@ func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string
 		rules = getPredefinedScraperRules(pageURL)
 	}
 
-	htmlDocumentReader, err := charset.NewReader(
+	htmlDocumentReader, err := encoding.NewCharsetReader(
 		responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
 		responseHandler.ContentType(),
 	)
+
 	if err != nil {
 		return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
 	}
diff --git a/internal/reader/subscription/finder.go b/internal/reader/subscription/finder.go
index 55088735..3ca8f0cc 100644
--- a/internal/reader/subscription/finder.go
+++ b/internal/reader/subscription/finder.go
@@ -16,12 +16,12 @@ import (
 	"miniflux.app/v2/internal/integration/rssbridge"
 	"miniflux.app/v2/internal/locale"
 	"miniflux.app/v2/internal/model"
+	"miniflux.app/v2/internal/reader/encoding"
 	"miniflux.app/v2/internal/reader/fetcher"
 	"miniflux.app/v2/internal/reader/parser"
 	"miniflux.app/v2/internal/urllib"
 
 	"github.com/PuerkitoBio/goquery"
-	"golang.org/x/net/html/charset"
 )
 
 var (
@@ -136,7 +136,7 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL, contentTyp
 		"link[type='application/feed+json']": parser.FormatJSON,
 	}
 
-	htmlDocumentReader, err := charset.NewReader(body, contentType)
+	htmlDocumentReader, err := encoding.NewCharsetReader(body, contentType)
 	if err != nil {
 		return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err)
 	}
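All three call sites above apply the same substitution: golang.org/x/net/html/charset.NewReader is replaced by encoding.NewCharsetReader so that documents that are already valid UTF-8 are passed through untouched. The standalone sketch below (the document content and padding size are invented for illustration) reproduces the failure mode this avoids: charset.NewReader sniffs only the first 1024 bytes and, finding no charset declaration there and none in the Content-Type header, falls back to windows-1252 and mangles an already valid UTF-8 page:

    package main

    import (
        "bytes"
        "fmt"
        "io"
        "strings"
        "unicode/utf8"

        "golang.org/x/net/html/charset"
    )

    func main() {
        // A UTF-8 document whose <meta charset="utf-8"> declaration only shows up
        // after more than 1024 bytes of padding, similar in spirit to the new
        // utf8-meta-after-1024.html fixture.
        doc := "<html><head>" + strings.Repeat("<!-- padding -->", 100) +
            `<meta charset="utf-8"><title>Café</title></head><body>Café</body></html>`

        // charset.NewReader finds no charset in the Content-Type header or in the
        // first 1024 bytes, so it assumes windows-1252 and re-encodes the UTF-8 bytes.
        r, err := charset.NewReader(strings.NewReader(doc), "text/html")
        if err != nil {
            panic(err)
        }
        converted, err := io.ReadAll(r)
        if err != nil {
            panic(err)
        }
        fmt.Println("still contains Café:", bytes.Contains(converted, []byte("Café"))) // false

        // The input was already valid UTF-8, which is exactly what the new helper
        // checks before deciding whether any conversion is needed at all.
        fmt.Println("input is valid UTF-8:", utf8.Valid([]byte(doc))) // true
    }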