1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-09-15 18:57:04 +00:00

Speed the sanitizer up a bit, again

- make the `www.` prefix optional in YouTube embed URLs
- use `strings.Builder` instead of a `bytes.Buffer`
- use a `strings.NewReader` instead of a `bytes.NewBufferString`
- sprinkle a couple of `continue` statements to make the control flow more obvious
- inline calls to `inList`, and put their parameters in the right order
- simplify isPixelTracker
- simplify `isValidIframeSource` by extracting the hostname and comparing it
  directly, instead of taking the full URL and checking whether it starts with
  one of several variations of the same host (`//`, `http://`, `https://`,
  each with and without `www.`)
- add a benchmark
This commit is contained in:
jvoisin 2024-03-05 18:00:21 +01:00 committed by Frédéric Guillot
parent eda2e2f3f5
commit 3d0126be0b
4 changed files with 3502 additions and 51 deletions

View file

@ -4,7 +4,6 @@
package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
import (
"bytes"
"fmt"
"io"
"regexp"
@ -19,7 +18,7 @@ import (
)
var (
youtubeEmbedRegex = regexp.MustCompile(`//www\.youtube\.com/embed/(.*)$`)
youtubeEmbedRegex = regexp.MustCompile(`//(?:www\.)?youtube\.com/embed/(.+)$`)
tagAllowList = map[string][]string{
"a": {"href", "title", "id"},
"abbr": {"title"},
@ -80,12 +79,12 @@ var (
// Sanitize returns safe HTML.
func Sanitize(baseURL, input string) string {
var buffer bytes.Buffer
var buffer strings.Builder
var tagStack []string
var parentTag string
blacklistedTagDepth := 0
tokenizer := html.NewTokenizer(bytes.NewBufferString(input))
tokenizer := html.NewTokenizer(strings.NewReader(input))
for {
if tokenizer.Next() == html.ErrorToken {
err := tokenizer.Err()
@ -114,7 +113,10 @@ func Sanitize(baseURL, input string) string {
tagName := token.DataAtom.String()
parentTag = tagName
if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
if isPixelTracker(tagName, token.Attr) {
continue
}
if isValidTag(tagName) {
attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
if hasRequiredAttributes(tagName, attrNames) {
@ -131,16 +133,18 @@ func Sanitize(baseURL, input string) string {
}
case html.EndTagToken:
tagName := token.DataAtom.String()
if isValidTag(tagName) && inList(tagName, tagStack) {
buffer.WriteString(fmt.Sprintf("</%s>", tagName))
if isValidTag(tagName) && slices.Contains(tagStack, tagName) {
buffer.WriteString("</" + tagName + ">")
} else if isBlockedTag(tagName) {
blacklistedTagDepth--
}
case html.SelfClosingTagToken:
tagName := token.DataAtom.String()
if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
if isPixelTracker(tagName, token.Attr) {
continue
}
if isValidTag(tagName) {
attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
if hasRequiredAttributes(tagName, attrNames) {
if len(attrNames) > 0 {
buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
@ -187,11 +191,10 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([
if isExternalResourceAttribute(attribute.Key) {
if tagName == "iframe" {
if isValidIframeSource(baseURL, attribute.Val) {
value = rewriteIframeURL(attribute.Val)
} else {
if !isValidIframeSource(baseURL, attribute.Val) {
continue
}
value = rewriteIframeURL(attribute.Val)
} else if tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val) {
value = attribute.Val
} else if isAnchor("a", attribute) {
@ -248,7 +251,7 @@ func isValidTag(tagName string) bool {
func isValidAttribute(tagName, attributeName string) bool {
if attributes, ok := tagAllowList[tagName]; ok {
return inList(attributeName, attributes)
return slices.Contains(attributes, attributeName)
}
return false
}
@ -263,24 +266,23 @@ func isExternalResourceAttribute(attribute string) bool {
}
func isPixelTracker(tagName string, attributes []html.Attribute) bool {
if tagName == "img" {
hasHeight := false
hasWidth := false
if tagName != "img" {
return false
}
hasHeight := false
hasWidth := false
for _, attribute := range attributes {
if attribute.Key == "height" && attribute.Val == "1" {
for _, attribute := range attributes {
if attribute.Val == "1" {
if attribute.Key == "height" {
hasHeight = true
}
if attribute.Key == "width" && attribute.Val == "1" {
} else if attribute.Key == "width" {
hasWidth = true
}
}
return hasHeight && hasWidth
}
return false
return hasHeight && hasWidth
}
func hasRequiredAttributes(tagName string, attributes []string) bool {
@ -371,43 +373,31 @@ func isBlockedResource(src string) bool {
func isValidIframeSource(baseURL, src string) bool {
whitelist := []string{
"//www.youtube.com",
"http://www.youtube.com",
"https://www.youtube.com",
"https://www.youtube-nocookie.com",
"http://player.vimeo.com",
"https://player.vimeo.com",
"http://www.dailymotion.com",
"https://www.dailymotion.com",
"http://vk.com",
"https://vk.com",
"http://soundcloud.com",
"https://soundcloud.com",
"http://w.soundcloud.com",
"https://w.soundcloud.com",
"http://bandcamp.com",
"https://bandcamp.com",
"https://cdn.embedly.com",
"https://player.bilibili.com",
"https://player.twitch.tv",
"bandcamp.com",
"cdn.embedly.com",
"player.bilibili.com",
"player.twitch.tv",
"player.vimeo.com",
"soundcloud.com",
"vk.com",
"w.soundcloud.com",
"dailymotion.com",
"youtube-nocookie.com",
"youtube.com",
}
domain := urllib.Domain(src)
// allow iframe from same origin
if urllib.Domain(baseURL) == urllib.Domain(src) {
if urllib.Domain(baseURL) == domain {
return true
}
// allow iframe from custom invidious instance
if config.Opts != nil && config.Opts.InvidiousInstance() == urllib.Domain(src) {
if config.Opts != nil && config.Opts.InvidiousInstance() == domain {
return true
}
return slices.ContainsFunc(whitelist, func(prefix string) bool {
return strings.HasPrefix(src, prefix)
})
}
func inList(needle string, haystack []string) bool {
return slices.Contains(haystack, needle)
return slices.Contains(whitelist, strings.TrimPrefix(domain, "www."))
}
func rewriteIframeURL(link string) string {