mirror of
https://github.com/miniflux/v2.git
synced 2025-06-27 16:36:00 +00:00
fix(scraper): avoid encoding issue if charset meta tag is after 1024 bytes
This commit is contained in:
parent
af1f966250
commit
6eedf4111f
12 changed files with 352 additions and 10 deletions
|
@ -5,6 +5,7 @@ package encoding // import "miniflux.app/v2/internal/reader/encoding"
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"unicode/utf8"
|
||||
|
||||
|
@ -23,7 +24,11 @@ import (
|
|||
// - Feeds with encoding specified only in XML document and not in HTTP header
|
||||
// - Feeds with wrong encoding defined and already in UTF-8
|
||||
func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
|
||||
buffer, _ := io.ReadAll(input)
|
||||
buffer, err := io.ReadAll(input)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
|
||||
}
|
||||
|
||||
r := bytes.NewReader(buffer)
|
||||
|
||||
// The document is already UTF-8, do not do anything (avoid double-encoding).
|
||||
|
@ -35,3 +40,24 @@ func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
|
|||
// Transform document to UTF-8 from the specified encoding in XML prolog.
|
||||
return charset.NewReaderLabel(charsetLabel, r)
|
||||
}
|
||||
|
||||
// NewCharsetReader returns an io.Reader that converts the content of r to UTF-8.
|
||||
func NewCharsetReader(r io.Reader, contentType string) (io.Reader, error) {
|
||||
buffer, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`encoding: unable to read input: %w`, err)
|
||||
}
|
||||
|
||||
internalReader := bytes.NewReader(buffer)
|
||||
|
||||
// The document is already UTF-8, do not do anything.
|
||||
if utf8.Valid(buffer) {
|
||||
return internalReader, nil
|
||||
}
|
||||
|
||||
// Transform document to UTF-8 from the specified encoding in Content-Type header.
|
||||
// Note that only the first 1024 bytes are used to detect the encoding.
|
||||
// If the <meta charset> tag is not found in the first 1024 bytes, charset.DetermineEncoding returns "windows-1252" resulting in encoding issues.
|
||||
// See https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
|
||||
return charset.NewReader(internalReader, contentType)
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue