mirror of
https://github.com/miniflux/v2.git
synced 2025-08-01 17:38:37 +00:00
fix(scraper): avoid encoding issue if charset meta tag is after 1024 bytes
This commit is contained in:
parent
af1f966250
commit
6eedf4111f
12 changed files with 352 additions and 10 deletions
|
@ -2,6 +2,6 @@
|
|||
<feed>
|
||||
<title>테스트 피드</title>
|
||||
<entry>
|
||||
<title>こんにちは世界</title>
|
||||
<title>Café</title>
|
||||
</entry>
|
||||
</feed>
|
48
internal/reader/encoding/testdata/iso-8859-1-meta-after-1024.html
vendored
Normal file
48
internal/reader/encoding/testdata/iso-8859-1-meta-after-1024.html
vendored
Normal file
|
@ -0,0 +1,48 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<!---
|
||||
|
||||
This text is greater than 1024 bytes which are used by the charset.NewReader to determine the encoding of the file.
|
||||
|
||||
This comment is used to pad the file to 1024 bytes.
|
||||
|
||||
The <meta> tag must be after 1024 bytes to ensure that the encoding is detected correctly.
|
||||
|
||||
---
|
||||
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
|
||||
-->
|
||||
<head>
|
||||
<meta charset="iso-8859-1">
|
||||
<title>Frédéric</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>Café</p>
|
||||
</body>
|
||||
</html>
|
10
internal/reader/encoding/testdata/iso-8859-1.html
vendored
Normal file
10
internal/reader/encoding/testdata/iso-8859-1.html
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="iso-8859-1">
|
||||
<title>Frédéric</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>Café</p>
|
||||
</body>
|
||||
</html>
|
|
@ -2,6 +2,6 @@
|
|||
<feed>
|
||||
<title>테스트 피드</title>
|
||||
<entry>
|
||||
<title>こんにちは世界</title>
|
||||
<title>Café</title>
|
||||
</entry>
|
||||
</feed>
|
48
internal/reader/encoding/testdata/utf8-meta-after-1024.html
vendored
Normal file
48
internal/reader/encoding/testdata/utf8-meta-after-1024.html
vendored
Normal file
|
@ -0,0 +1,48 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<!---
|
||||
|
||||
This text is greater than 1024 bytes which are used by the charset.NewReader to determine the encoding of the file.
|
||||
|
||||
This comment is used to pad the file to 1024 bytes.
|
||||
|
||||
The <meta> tag must be after 1024 bytes to ensure that the encoding is detected correctly.
|
||||
|
||||
---
|
||||
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
More text to pad the file to 1024 bytes.
|
||||
|
||||
-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Frédéric</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>Café</p>
|
||||
</body>
|
||||
</html>
|
10
internal/reader/encoding/testdata/utf8.html
vendored
Normal file
10
internal/reader/encoding/testdata/utf8.html
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Café</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>Café</p>
|
||||
</body>
|
||||
</html>
|
2
internal/reader/encoding/testdata/utf8.xml
vendored
2
internal/reader/encoding/testdata/utf8.xml
vendored
|
@ -2,6 +2,6 @@
|
|||
<feed>
|
||||
<title>테스트 피드</title>
|
||||
<entry>
|
||||
<title>こんにちは世界</title>
|
||||
<title>Café</title>
|
||||
</entry>
|
||||
</feed>
|
Loading…
Add table
Add a link
Reference in a new issue