mirror of
https://github.com/miniflux/v2.git
synced 2025-06-27 16:36:00 +00:00
fix(scraper): avoid encoding issue if charset meta tag is after 1024 bytes
This commit is contained in:
parent
af1f966250
commit
6eedf4111f
12 changed files with 352 additions and 10 deletions
|
@ -5,6 +5,7 @@ package encoding // import "miniflux.app/v2/internal/reader/encoding"
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
|
|
||||||
|
@ -23,7 +24,11 @@ import (
|
||||||
// - Feeds with encoding specified only in XML document and not in HTTP header
|
// - Feeds with encoding specified only in XML document and not in HTTP header
|
||||||
// - Feeds with wrong encoding defined and already in UTF-8
|
// - Feeds with wrong encoding defined and already in UTF-8
|
||||||
func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
|
func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
|
||||||
buffer, _ := io.ReadAll(input)
|
buffer, err := io.ReadAll(input)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
|
||||||
|
}
|
||||||
|
|
||||||
r := bytes.NewReader(buffer)
|
r := bytes.NewReader(buffer)
|
||||||
|
|
||||||
// The document is already UTF-8, do not do anything (avoid double-encoding).
|
// The document is already UTF-8, do not do anything (avoid double-encoding).
|
||||||
|
@ -35,3 +40,24 @@ func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
|
||||||
// Transform document to UTF-8 from the specified encoding in XML prolog.
|
// Transform document to UTF-8 from the specified encoding in XML prolog.
|
||||||
return charset.NewReaderLabel(charsetLabel, r)
|
return charset.NewReaderLabel(charsetLabel, r)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NewCharsetReader returns an io.Reader that converts the content of r to UTF-8.
|
||||||
|
func NewCharsetReader(r io.Reader, contentType string) (io.Reader, error) {
|
||||||
|
buffer, err := io.ReadAll(r)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
internalReader := bytes.NewReader(buffer)
|
||||||
|
|
||||||
|
// The document is already UTF-8, do not do anything.
|
||||||
|
if utf8.Valid(buffer) {
|
||||||
|
return internalReader, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Transform document to UTF-8 from the specified encoding in Content-Type header.
|
||||||
|
// Note that only the first 1024 bytes are used to detect the encoding.
|
||||||
|
// If the <meta charset> tag is not found in the first 1024 bytes, charset.DetermineEncoding returns "windows-1252" resulting in encoding issues.
|
||||||
|
// See https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
|
||||||
|
return charset.NewReader(internalReader, contentType)
|
||||||
|
}
|
||||||
|
|
|
@ -4,6 +4,7 @@
|
||||||
package encoding // import "miniflux.app/v2/internal/reader/encoding"
|
package encoding // import "miniflux.app/v2/internal/reader/encoding"
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"testing"
|
"testing"
|
||||||
|
@ -31,6 +32,11 @@ func TestCharsetReaderWithUTF8(t *testing.T) {
|
||||||
if !utf8.Valid(data) {
|
if !utf8.Valid(data) {
|
||||||
t.Fatalf("Data is not valid UTF-8")
|
t.Fatalf("Data is not valid UTF-8")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
expectedUnicodeString := "Café"
|
||||||
|
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||||
|
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestCharsetReaderWithISO88591(t *testing.T) {
|
func TestCharsetReaderWithISO88591(t *testing.T) {
|
||||||
|
@ -54,6 +60,11 @@ func TestCharsetReaderWithISO88591(t *testing.T) {
|
||||||
if !utf8.Valid(data) {
|
if !utf8.Valid(data) {
|
||||||
t.Fatalf("Data is not valid UTF-8")
|
t.Fatalf("Data is not valid UTF-8")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
expectedUnicodeString := "Café"
|
||||||
|
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||||
|
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestCharsetReaderWithWindows1252(t *testing.T) {
|
func TestCharsetReaderWithWindows1252(t *testing.T) {
|
||||||
|
@ -77,6 +88,11 @@ func TestCharsetReaderWithWindows1252(t *testing.T) {
|
||||||
if !utf8.Valid(data) {
|
if !utf8.Valid(data) {
|
||||||
t.Fatalf("Data is not valid UTF-8")
|
t.Fatalf("Data is not valid UTF-8")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
expectedUnicodeString := "Euro €"
|
||||||
|
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||||
|
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestCharsetReaderWithInvalidProlog(t *testing.T) {
|
func TestCharsetReaderWithInvalidProlog(t *testing.T) {
|
||||||
|
@ -100,6 +116,11 @@ func TestCharsetReaderWithInvalidProlog(t *testing.T) {
|
||||||
if !utf8.Valid(data) {
|
if !utf8.Valid(data) {
|
||||||
t.Fatalf("Data is not valid UTF-8")
|
t.Fatalf("Data is not valid UTF-8")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
expectedUnicodeString := "Café"
|
||||||
|
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||||
|
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
|
func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
|
||||||
|
@ -123,6 +144,11 @@ func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
|
||||||
if !utf8.Valid(data) {
|
if !utf8.Valid(data) {
|
||||||
t.Fatalf("Data is not valid UTF-8")
|
t.Fatalf("Data is not valid UTF-8")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
expectedUnicodeString := "Café"
|
||||||
|
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||||
|
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
|
func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
|
||||||
|
@ -146,4 +172,177 @@ func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
|
||||||
if !utf8.Valid(data) {
|
if !utf8.Valid(data) {
|
||||||
t.Fatalf("Data is not valid UTF-8")
|
t.Fatalf("Data is not valid UTF-8")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
expectedUnicodeString := "Euro €"
|
||||||
|
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||||
|
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewReaderWithUTF8Document(t *testing.T) {
|
||||||
|
file := "testdata/utf8.html"
|
||||||
|
|
||||||
|
f, err := os.Open(file)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to open file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
reader, err := NewCharsetReader(f, "text/html; charset=UTF-8")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to create reader: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := io.ReadAll(reader)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to read data: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !utf8.Valid(data) {
|
||||||
|
t.Fatalf("Data is not valid UTF-8")
|
||||||
|
}
|
||||||
|
|
||||||
|
expectedUnicodeString := "Café"
|
||||||
|
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||||
|
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewReaderWithUTF8DocumentAndNoContentEncoding(t *testing.T) {
|
||||||
|
file := "testdata/utf8.html"
|
||||||
|
|
||||||
|
f, err := os.Open(file)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to open file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
reader, err := NewCharsetReader(f, "text/html")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to create reader: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := io.ReadAll(reader)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to read data: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !utf8.Valid(data) {
|
||||||
|
t.Fatalf("Data is not valid UTF-8")
|
||||||
|
}
|
||||||
|
|
||||||
|
expectedUnicodeString := "Café"
|
||||||
|
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||||
|
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewReaderWithISO88591Document(t *testing.T) {
|
||||||
|
file := "testdata/iso-8859-1.xml"
|
||||||
|
|
||||||
|
f, err := os.Open(file)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to open file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
reader, err := NewCharsetReader(f, "text/html; charset=ISO-8859-1")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to create reader: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := io.ReadAll(reader)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to read data: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !utf8.Valid(data) {
|
||||||
|
t.Fatalf("Data is not valid UTF-8")
|
||||||
|
}
|
||||||
|
|
||||||
|
expectedUnicodeString := "Café"
|
||||||
|
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||||
|
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewReaderWithISO88591DocumentAndNoContentType(t *testing.T) {
|
||||||
|
file := "testdata/iso-8859-1.xml"
|
||||||
|
|
||||||
|
f, err := os.Open(file)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to open file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
reader, err := NewCharsetReader(f, "")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to create reader: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := io.ReadAll(reader)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to read data: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !utf8.Valid(data) {
|
||||||
|
t.Fatalf("Data is not valid UTF-8")
|
||||||
|
}
|
||||||
|
|
||||||
|
expectedUnicodeString := "Café"
|
||||||
|
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||||
|
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewReaderWithISO88591DocumentWithMetaAfter1024Bytes(t *testing.T) {
|
||||||
|
file := "testdata/iso-8859-1-meta-after-1024.html"
|
||||||
|
|
||||||
|
f, err := os.Open(file)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to open file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
reader, err := NewCharsetReader(f, "text/html")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to create reader: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := io.ReadAll(reader)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to read data: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !utf8.Valid(data) {
|
||||||
|
t.Fatalf("Data is not valid UTF-8")
|
||||||
|
}
|
||||||
|
|
||||||
|
expectedUnicodeString := "Café"
|
||||||
|
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||||
|
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewReaderWithUTF8DocumentWithMetaAfter1024Bytes(t *testing.T) {
|
||||||
|
file := "testdata/utf8-meta-after-1024.html"
|
||||||
|
|
||||||
|
f, err := os.Open(file)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to open file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
reader, err := NewCharsetReader(f, "text/html")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to create reader: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := io.ReadAll(reader)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Unable to read data: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !utf8.Valid(data) {
|
||||||
|
t.Fatalf("Data is not valid UTF-8")
|
||||||
|
}
|
||||||
|
|
||||||
|
expectedUnicodeString := "Café"
|
||||||
|
if !bytes.Contains(data, []byte(expectedUnicodeString)) {
|
||||||
|
t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,6 @@
|
||||||
<feed>
|
<feed>
|
||||||
<title>테스트 피드</title>
|
<title>테스트 피드</title>
|
||||||
<entry>
|
<entry>
|
||||||
<title>こんにちは世界</title>
|
<title>Café</title>
|
||||||
</entry>
|
</entry>
|
||||||
</feed>
|
</feed>
|
48
internal/reader/encoding/testdata/iso-8859-1-meta-after-1024.html
vendored
Normal file
48
internal/reader/encoding/testdata/iso-8859-1-meta-after-1024.html
vendored
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<!---
|
||||||
|
|
||||||
|
This text is greater than 1024 bytes which are used by the charset.NewReader to determine the encoding of the file.
|
||||||
|
|
||||||
|
This comment is used to pad the file to 1024 bytes.
|
||||||
|
|
||||||
|
The <meta> tag must be after 1024 bytes to ensure that the encoding is detected correctly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
|
||||||
|
-->
|
||||||
|
<head>
|
||||||
|
<meta charset="iso-8859-1">
|
||||||
|
<title>Frédéric</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>Café</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
10
internal/reader/encoding/testdata/iso-8859-1.html
vendored
Normal file
10
internal/reader/encoding/testdata/iso-8859-1.html
vendored
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta charset="iso-8859-1">
|
||||||
|
<title>Frédéric</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>Café</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -2,6 +2,6 @@
|
||||||
<feed>
|
<feed>
|
||||||
<title>테스트 피드</title>
|
<title>테스트 피드</title>
|
||||||
<entry>
|
<entry>
|
||||||
<title>こんにちは世界</title>
|
<title>Café</title>
|
||||||
</entry>
|
</entry>
|
||||||
</feed>
|
</feed>
|
48
internal/reader/encoding/testdata/utf8-meta-after-1024.html
vendored
Normal file
48
internal/reader/encoding/testdata/utf8-meta-after-1024.html
vendored
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<!---
|
||||||
|
|
||||||
|
This text is greater than 1024 bytes which are used by the charset.NewReader to determine the encoding of the file.
|
||||||
|
|
||||||
|
This comment is used to pad the file to 1024 bytes.
|
||||||
|
|
||||||
|
The <meta> tag must be after 1024 bytes to ensure that the encoding is detected correctly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
More text to pad the file to 1024 bytes.
|
||||||
|
|
||||||
|
-->
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<title>Frédéric</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>Café</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
10
internal/reader/encoding/testdata/utf8.html
vendored
Normal file
10
internal/reader/encoding/testdata/utf8.html
vendored
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<title>Café</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>Café</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
2
internal/reader/encoding/testdata/utf8.xml
vendored
2
internal/reader/encoding/testdata/utf8.xml
vendored
|
@ -2,6 +2,6 @@
|
||||||
<feed>
|
<feed>
|
||||||
<title>테스트 피드</title>
|
<title>테스트 피드</title>
|
||||||
<entry>
|
<entry>
|
||||||
<title>こんにちは世界</title>
|
<title>Café</title>
|
||||||
</entry>
|
</entry>
|
||||||
</feed>
|
</feed>
|
|
@ -21,12 +21,12 @@ import (
|
||||||
"miniflux.app/v2/internal/config"
|
"miniflux.app/v2/internal/config"
|
||||||
"miniflux.app/v2/internal/crypto"
|
"miniflux.app/v2/internal/crypto"
|
||||||
"miniflux.app/v2/internal/model"
|
"miniflux.app/v2/internal/model"
|
||||||
|
"miniflux.app/v2/internal/reader/encoding"
|
||||||
"miniflux.app/v2/internal/reader/fetcher"
|
"miniflux.app/v2/internal/reader/fetcher"
|
||||||
"miniflux.app/v2/internal/urllib"
|
"miniflux.app/v2/internal/urllib"
|
||||||
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
"golang.org/x/image/draw"
|
"golang.org/x/image/draw"
|
||||||
"golang.org/x/net/html/charset"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type IconFinder struct {
|
type IconFinder struct {
|
||||||
|
@ -248,7 +248,7 @@ func findIconURLsFromHTMLDocument(body io.Reader, contentType string) ([]string,
|
||||||
"link[rel='apple-touch-icon-precomposed.png']",
|
"link[rel='apple-touch-icon-precomposed.png']",
|
||||||
}
|
}
|
||||||
|
|
||||||
htmlDocumentReader, err := charset.NewReader(body, contentType)
|
htmlDocumentReader, err := encoding.NewCharsetReader(body, contentType)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("icon: unable to create charset reader: %w", err)
|
return nil, fmt.Errorf("icon: unable to create charset reader: %w", err)
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,12 +10,12 @@ import (
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"miniflux.app/v2/internal/config"
|
"miniflux.app/v2/internal/config"
|
||||||
|
"miniflux.app/v2/internal/reader/encoding"
|
||||||
"miniflux.app/v2/internal/reader/fetcher"
|
"miniflux.app/v2/internal/reader/fetcher"
|
||||||
"miniflux.app/v2/internal/reader/readability"
|
"miniflux.app/v2/internal/reader/readability"
|
||||||
"miniflux.app/v2/internal/urllib"
|
"miniflux.app/v2/internal/urllib"
|
||||||
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
"golang.org/x/net/html/charset"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
|
func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
|
||||||
|
@ -39,10 +39,11 @@ func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string
|
||||||
rules = getPredefinedScraperRules(pageURL)
|
rules = getPredefinedScraperRules(pageURL)
|
||||||
}
|
}
|
||||||
|
|
||||||
htmlDocumentReader, err := charset.NewReader(
|
htmlDocumentReader, err := encoding.NewCharsetReader(
|
||||||
responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
|
responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
|
||||||
responseHandler.ContentType(),
|
responseHandler.ContentType(),
|
||||||
)
|
)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
|
return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,12 +16,12 @@ import (
|
||||||
"miniflux.app/v2/internal/integration/rssbridge"
|
"miniflux.app/v2/internal/integration/rssbridge"
|
||||||
"miniflux.app/v2/internal/locale"
|
"miniflux.app/v2/internal/locale"
|
||||||
"miniflux.app/v2/internal/model"
|
"miniflux.app/v2/internal/model"
|
||||||
|
"miniflux.app/v2/internal/reader/encoding"
|
||||||
"miniflux.app/v2/internal/reader/fetcher"
|
"miniflux.app/v2/internal/reader/fetcher"
|
||||||
"miniflux.app/v2/internal/reader/parser"
|
"miniflux.app/v2/internal/reader/parser"
|
||||||
"miniflux.app/v2/internal/urllib"
|
"miniflux.app/v2/internal/urllib"
|
||||||
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
"golang.org/x/net/html/charset"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -136,7 +136,7 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL, contentTyp
|
||||||
"link[type='application/feed+json']": parser.FormatJSON,
|
"link[type='application/feed+json']": parser.FormatJSON,
|
||||||
}
|
}
|
||||||
|
|
||||||
htmlDocumentReader, err := charset.NewReader(body, contentType)
|
htmlDocumentReader, err := encoding.NewCharsetReader(body, contentType)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err)
|
return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue