mirror of
https://github.com/miniflux/v2.git
synced 2025-09-15 18:57:04 +00:00
Speed the sanitizer up a bit, again
- allow youtube urls to start with `www` - use `strings.Builder` instead of a `bytes.Buffer` - use a `strings.NewReader` instead of a `bytes.NewBufferString` - sprinkles a couple of `continue` to make the code-flow more obvious - inline calls to `inList`, and put their parameters in the right order - simplify isPixelTracker - simplify `isValidIframeSource`, by extracting the hostname and comparing it directly, instead of using the full url and checking if it starts with multiple variations of the same one (`//`, `http:`, `https://` multiplied by ``/`www.`) - add a benchmark
This commit is contained in:
parent
eda2e2f3f5
commit
3d0126be0b
4 changed files with 3502 additions and 51 deletions
|
@ -4,7 +4,6 @@
|
|||
package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"regexp"
|
||||
|
@ -19,7 +18,7 @@ import (
|
|||
)
|
||||
|
||||
var (
|
||||
youtubeEmbedRegex = regexp.MustCompile(`//www\.youtube\.com/embed/(.*)$`)
|
||||
youtubeEmbedRegex = regexp.MustCompile(`//(?:www\.)?youtube\.com/embed/(.+)$`)
|
||||
tagAllowList = map[string][]string{
|
||||
"a": {"href", "title", "id"},
|
||||
"abbr": {"title"},
|
||||
|
@ -80,12 +79,12 @@ var (
|
|||
|
||||
// Sanitize returns safe HTML.
|
||||
func Sanitize(baseURL, input string) string {
|
||||
var buffer bytes.Buffer
|
||||
var buffer strings.Builder
|
||||
var tagStack []string
|
||||
var parentTag string
|
||||
blacklistedTagDepth := 0
|
||||
|
||||
tokenizer := html.NewTokenizer(bytes.NewBufferString(input))
|
||||
tokenizer := html.NewTokenizer(strings.NewReader(input))
|
||||
for {
|
||||
if tokenizer.Next() == html.ErrorToken {
|
||||
err := tokenizer.Err()
|
||||
|
@ -114,7 +113,10 @@ func Sanitize(baseURL, input string) string {
|
|||
tagName := token.DataAtom.String()
|
||||
parentTag = tagName
|
||||
|
||||
if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
|
||||
if isPixelTracker(tagName, token.Attr) {
|
||||
continue
|
||||
}
|
||||
if isValidTag(tagName) {
|
||||
attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
|
||||
|
||||
if hasRequiredAttributes(tagName, attrNames) {
|
||||
|
@ -131,16 +133,18 @@ func Sanitize(baseURL, input string) string {
|
|||
}
|
||||
case html.EndTagToken:
|
||||
tagName := token.DataAtom.String()
|
||||
if isValidTag(tagName) && inList(tagName, tagStack) {
|
||||
buffer.WriteString(fmt.Sprintf("</%s>", tagName))
|
||||
if isValidTag(tagName) && slices.Contains(tagStack, tagName) {
|
||||
buffer.WriteString("</" + tagName + ">")
|
||||
} else if isBlockedTag(tagName) {
|
||||
blacklistedTagDepth--
|
||||
}
|
||||
case html.SelfClosingTagToken:
|
||||
tagName := token.DataAtom.String()
|
||||
if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
|
||||
if isPixelTracker(tagName, token.Attr) {
|
||||
continue
|
||||
}
|
||||
if isValidTag(tagName) {
|
||||
attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
|
||||
|
||||
if hasRequiredAttributes(tagName, attrNames) {
|
||||
if len(attrNames) > 0 {
|
||||
buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
|
||||
|
@ -187,11 +191,10 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([
|
|||
|
||||
if isExternalResourceAttribute(attribute.Key) {
|
||||
if tagName == "iframe" {
|
||||
if isValidIframeSource(baseURL, attribute.Val) {
|
||||
value = rewriteIframeURL(attribute.Val)
|
||||
} else {
|
||||
if !isValidIframeSource(baseURL, attribute.Val) {
|
||||
continue
|
||||
}
|
||||
value = rewriteIframeURL(attribute.Val)
|
||||
} else if tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val) {
|
||||
value = attribute.Val
|
||||
} else if isAnchor("a", attribute) {
|
||||
|
@ -248,7 +251,7 @@ func isValidTag(tagName string) bool {
|
|||
|
||||
func isValidAttribute(tagName, attributeName string) bool {
|
||||
if attributes, ok := tagAllowList[tagName]; ok {
|
||||
return inList(attributeName, attributes)
|
||||
return slices.Contains(attributes, attributeName)
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
@ -263,24 +266,23 @@ func isExternalResourceAttribute(attribute string) bool {
|
|||
}
|
||||
|
||||
func isPixelTracker(tagName string, attributes []html.Attribute) bool {
|
||||
if tagName == "img" {
|
||||
hasHeight := false
|
||||
hasWidth := false
|
||||
if tagName != "img" {
|
||||
return false
|
||||
}
|
||||
hasHeight := false
|
||||
hasWidth := false
|
||||
|
||||
for _, attribute := range attributes {
|
||||
if attribute.Key == "height" && attribute.Val == "1" {
|
||||
for _, attribute := range attributes {
|
||||
if attribute.Val == "1" {
|
||||
if attribute.Key == "height" {
|
||||
hasHeight = true
|
||||
}
|
||||
|
||||
if attribute.Key == "width" && attribute.Val == "1" {
|
||||
} else if attribute.Key == "width" {
|
||||
hasWidth = true
|
||||
}
|
||||
}
|
||||
|
||||
return hasHeight && hasWidth
|
||||
}
|
||||
|
||||
return false
|
||||
return hasHeight && hasWidth
|
||||
}
|
||||
|
||||
func hasRequiredAttributes(tagName string, attributes []string) bool {
|
||||
|
@ -371,43 +373,31 @@ func isBlockedResource(src string) bool {
|
|||
|
||||
func isValidIframeSource(baseURL, src string) bool {
|
||||
whitelist := []string{
|
||||
"//www.youtube.com",
|
||||
"http://www.youtube.com",
|
||||
"https://www.youtube.com",
|
||||
"https://www.youtube-nocookie.com",
|
||||
"http://player.vimeo.com",
|
||||
"https://player.vimeo.com",
|
||||
"http://www.dailymotion.com",
|
||||
"https://www.dailymotion.com",
|
||||
"http://vk.com",
|
||||
"https://vk.com",
|
||||
"http://soundcloud.com",
|
||||
"https://soundcloud.com",
|
||||
"http://w.soundcloud.com",
|
||||
"https://w.soundcloud.com",
|
||||
"http://bandcamp.com",
|
||||
"https://bandcamp.com",
|
||||
"https://cdn.embedly.com",
|
||||
"https://player.bilibili.com",
|
||||
"https://player.twitch.tv",
|
||||
"bandcamp.com",
|
||||
"cdn.embedly.com",
|
||||
"player.bilibili.com",
|
||||
"player.twitch.tv",
|
||||
"player.vimeo.com",
|
||||
"soundcloud.com",
|
||||
"vk.com",
|
||||
"w.soundcloud.com",
|
||||
"dailymotion.com",
|
||||
"youtube-nocookie.com",
|
||||
"youtube.com",
|
||||
}
|
||||
domain := urllib.Domain(src)
|
||||
|
||||
// allow iframe from same origin
|
||||
if urllib.Domain(baseURL) == urllib.Domain(src) {
|
||||
if urllib.Domain(baseURL) == domain {
|
||||
return true
|
||||
}
|
||||
|
||||
// allow iframe from custom invidious instance
|
||||
if config.Opts != nil && config.Opts.InvidiousInstance() == urllib.Domain(src) {
|
||||
if config.Opts != nil && config.Opts.InvidiousInstance() == domain {
|
||||
return true
|
||||
}
|
||||
|
||||
return slices.ContainsFunc(whitelist, func(prefix string) bool {
|
||||
return strings.HasPrefix(src, prefix)
|
||||
})
|
||||
}
|
||||
func inList(needle string, haystack []string) bool {
|
||||
return slices.Contains(haystack, needle)
|
||||
return slices.Contains(whitelist, strings.TrimPrefix(domain, "www."))
|
||||
}
|
||||
|
||||
func rewriteIframeURL(link string) string {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue