1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-09-15 18:57:04 +00:00

sanitizer: handle image URLs in srcset attribute with comma

This commit is contained in:
Frédéric Guillot 2022-07-04 12:48:48 -07:00
parent d85908e3de
commit 806a069785
6 changed files with 212 additions and 88 deletions

View file

@ -20,7 +20,6 @@ import (
var (
youtubeEmbedRegex = regexp.MustCompile(`//www\.youtube\.com/embed/(.*)`)
splitSrcsetRegex = regexp.MustCompile(`,\s?`)
)
// Sanitize returns safe HTML.
@ -447,52 +446,17 @@ func isBlockedTag(tagName string) bool {
return false
}
/*
One or more strings separated by commas, indicating possible image sources for the user agent to use.
Each string is composed of:
- A URL to an image
- Optionally, whitespace followed by one of:
- A width descriptor (a positive integer directly followed by w). The width descriptor is divided by the source size given in the sizes attribute to calculate the effective pixel density.
- A pixel density descriptor (a positive floating point number directly followed by x).
*/
func sanitizeSrcsetAttr(baseURL, value string) string {
var sanitizedSources []string
rawSources := splitSrcsetRegex.Split(value, -1)
for _, rawSource := range rawSources {
parts := strings.Split(strings.TrimSpace(rawSource), " ")
nbParts := len(parts)
imageCandidates := ParseSrcSetAttribute(value)
if nbParts > 0 {
sanitizedSource, err := url.AbsoluteURL(baseURL, parts[0])
if err != nil {
continue
}
if nbParts == 2 && isValidWidthOrDensityDescriptor(parts[1]) {
sanitizedSource += " " + parts[1]
}
sanitizedSources = append(sanitizedSources, sanitizedSource)
for _, imageCandidate := range imageCandidates {
absoluteURL, err := url.AbsoluteURL(baseURL, imageCandidate.ImageURL)
if err == nil {
imageCandidate.ImageURL = absoluteURL
}
}
return strings.Join(sanitizedSources, ", ")
}
func isValidWidthOrDensityDescriptor(value string) bool {
if value == "" {
return false
}
lastChar := value[len(value)-1:]
if lastChar != "w" && lastChar != "x" {
return false
}
_, err := strconv.ParseFloat(value[0:len(value)-1], 32)
return err == nil
return imageCandidates.String()
}
func isValidDataAttribute(value string) bool {

View file

@ -85,16 +85,6 @@ func TestMediumImgWithSrcset(t *testing.T) {
}
}
func TestEconomistImgWithSrcset(t *testing.T) {
input := `<img loading="lazy" src="https://www.economist.com/img/b/608/634/90/sites/default/files/images/print-edition/20211009_WWC585.png" srcSet="https://www.economist.com/img/b/200/209/90/sites/default/files/images/print-edition/20211009_WWC585.png 200w,https://www.economist.com/img/b/300/313/90/sites/default/files/images/print-edition/20211009_WWC585.png 300w,https://www.economist.com/img/b/400/417/90/sites/default/files/images/print-edition/20211009_WWC585.png 400w,https://www.economist.com/img/b/600/626/90/sites/default/files/images/print-edition/20211009_WWC585.png 600w,https://www.economist.com/img/b/640/667/90/sites/default/files/images/print-edition/20211009_WWC585.png 640w,https://www.economist.com/img/b/800/834/90/sites/default/files/images/print-edition/20211009_WWC585.png 800w,https://www.economist.com/img/b/1000/1043/90/sites/default/files/images/print-edition/20211009_WWC585.png 1000w,https://www.economist.com/img/b/1280/1335/90/sites/default/files/images/print-edition/20211009_WWC585.png 1280w" sizes="300px" alt=""/>`
expected := `<img src="https://www.economist.com/img/b/608/634/90/sites/default/files/images/print-edition/20211009_WWC585.png" srcset="https://www.economist.com/img/b/200/209/90/sites/default/files/images/print-edition/20211009_WWC585.png 200w, https://www.economist.com/img/b/300/313/90/sites/default/files/images/print-edition/20211009_WWC585.png 300w, https://www.economist.com/img/b/400/417/90/sites/default/files/images/print-edition/20211009_WWC585.png 400w, https://www.economist.com/img/b/600/626/90/sites/default/files/images/print-edition/20211009_WWC585.png 600w, https://www.economist.com/img/b/640/667/90/sites/default/files/images/print-edition/20211009_WWC585.png 640w, https://www.economist.com/img/b/800/834/90/sites/default/files/images/print-edition/20211009_WWC585.png 800w, https://www.economist.com/img/b/1000/1043/90/sites/default/files/images/print-edition/20211009_WWC585.png 1000w, https://www.economist.com/img/b/1280/1335/90/sites/default/files/images/print-edition/20211009_WWC585.png 1280w" sizes="300px" alt="" loading="lazy"/>`
output := Sanitize("http://example.org/", input)
if output != expected {
t.Errorf(`Wrong output: %s`, output)
}
}
func TestSelfClosingTags(t *testing.T) {
input := `<p>This <br> is a <strong>text</strong> <br/>with an image: <img src="http://example.org/" alt="Test" loading="lazy"/>.</p>`
output := Sanitize("http://example.org/", input)

View file

@ -0,0 +1,82 @@
// Copyright 2022 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
package sanitizer
import (
"fmt"
"strconv"
"strings"
)
type ImageCandidate struct {
ImageURL string
Descriptor string
}
type ImageCandidates []*ImageCandidate
func (c ImageCandidates) String() string {
var htmlCandidates []string
for _, imageCandidate := range c {
var htmlCandidate string
if imageCandidate.Descriptor != "" {
htmlCandidate = fmt.Sprintf(`%s %s`, imageCandidate.ImageURL, imageCandidate.Descriptor)
} else {
htmlCandidate = imageCandidate.ImageURL
}
htmlCandidates = append(htmlCandidates, htmlCandidate)
}
return strings.Join(htmlCandidates, ", ")
}
// ParseSrcSetAttribute returns the list of image candidates from the set.
// https://html.spec.whatwg.org/#parse-a-srcset-attribute
func ParseSrcSetAttribute(attributeValue string) (imageCandidates ImageCandidates) {
unparsedCandidates := strings.Split(attributeValue, ", ")
for _, unparsedCandidate := range unparsedCandidates {
if candidate, err := parseImageCandidate(unparsedCandidate); err == nil {
imageCandidates = append(imageCandidates, candidate)
}
}
return imageCandidates
}
func parseImageCandidate(input string) (*ImageCandidate, error) {
input = strings.TrimSpace(input)
parts := strings.Split(strings.TrimSpace(input), " ")
nbParts := len(parts)
if nbParts > 2 || nbParts == 0 {
return nil, fmt.Errorf(`srcset: invalid number of descriptors`)
}
if nbParts == 2 {
if !isValidWidthOrDensityDescriptor(parts[1]) {
return nil, fmt.Errorf(`srcset: invalid descriptor`)
}
return &ImageCandidate{ImageURL: parts[0], Descriptor: parts[1]}, nil
}
return &ImageCandidate{ImageURL: parts[0]}, nil
}
func isValidWidthOrDensityDescriptor(value string) bool {
if value == "" {
return false
}
lastChar := value[len(value)-1:]
if lastChar != "w" && lastChar != "x" {
return false
}
_, err := strconv.ParseFloat(value[0:len(value)-1], 32)
return err == nil
}

View file

@ -0,0 +1,85 @@
// Copyright 2022 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
package sanitizer
import "testing"
func TestParseSrcSetAttributeWithRelativeURLs(t *testing.T) {
input := `example-320w.jpg, example-480w.jpg 1.5x, example-640,w.jpg 2x, example-640w.jpg 640w`
candidates := ParseSrcSetAttribute(input)
if len(candidates) != 4 {
t.Error(`Incorrect number of candidates`)
}
if candidates.String() != `example-320w.jpg, example-480w.jpg 1.5x, example-640,w.jpg 2x, example-640w.jpg 640w` {
t.Errorf(`Unexpected string output`)
}
}
func TestParseSrcSetAttributeWithAbsoluteURLs(t *testing.T) {
input := `http://example.org/example-320w.jpg 320w, http://example.org/example-480w.jpg 1.5x`
candidates := ParseSrcSetAttribute(input)
if len(candidates) != 2 {
t.Error(`Incorrect number of candidates`)
}
if candidates.String() != `http://example.org/example-320w.jpg 320w, http://example.org/example-480w.jpg 1.5x` {
t.Errorf(`Unexpected string output`)
}
}
func TestParseSrcSetAttributeWithOneCandidate(t *testing.T) {
input := `http://example.org/example-320w.jpg`
candidates := ParseSrcSetAttribute(input)
if len(candidates) != 1 {
t.Error(`Incorrect number of candidates`)
}
if candidates.String() != `http://example.org/example-320w.jpg` {
t.Errorf(`Unexpected string output`)
}
}
func TestParseSrcSetAttributeWithCommaURL(t *testing.T) {
input := `http://example.org/example,a:b/d.jpg , example-480w.jpg 1.5x`
candidates := ParseSrcSetAttribute(input)
if len(candidates) != 2 {
t.Error(`Incorrect number of candidates`)
}
if candidates.String() != `http://example.org/example,a:b/d.jpg, example-480w.jpg 1.5x` {
t.Errorf(`Unexpected string output`)
}
}
func TestParseSrcSetAttributeWithIncorrectDescriptor(t *testing.T) {
input := `http://example.org/example-320w.jpg test`
candidates := ParseSrcSetAttribute(input)
if len(candidates) != 0 {
t.Error(`Incorrect number of candidates`)
}
if candidates.String() != `` {
t.Errorf(`Unexpected string output`)
}
}
func TestParseSrcSetAttributeWithTooManyDescriptors(t *testing.T) {
input := `http://example.org/example-320w.jpg 10w 1x`
candidates := ParseSrcSetAttribute(input)
if len(candidates) != 0 {
t.Error(`Incorrect number of candidates`)
}
if candidates.String() != `` {
t.Errorf(`Unexpected string output`)
}
}