mirror of
https://github.com/miniflux/v2.git
synced 2025-06-27 16:36:00 +00:00
perf(sanitizer): extract a call to url.Parse and make intensive use of it
Previously, url.Parse(baseURL) was called on every self-closing tag and on most opening tags, accounting for around 15% of the CPU time spent in processor.ProcessFeedEntries.
This commit is contained in:
parent
40727704c2
commit
44c48d109f
1 changed file with 14 additions and 8 deletions
|
@ -128,6 +128,9 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
|
||||||
var parentTag string
|
var parentTag string
|
||||||
var blockedStack []string
|
var blockedStack []string
|
||||||
|
|
||||||
|
// Errors are a non-issue, so they're handled later in the function.
|
||||||
|
parsedBaseUrl, _ := url.Parse(baseURL)
|
||||||
|
|
||||||
tokenizer := html.NewTokenizer(strings.NewReader(rawHTML))
|
tokenizer := html.NewTokenizer(strings.NewReader(rawHTML))
|
||||||
for {
|
for {
|
||||||
if tokenizer.Next() == html.ErrorToken {
|
if tokenizer.Next() == html.ErrorToken {
|
||||||
|
@ -175,7 +178,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(blockedStack) == 0 && isValidTag(tagName) {
|
if len(blockedStack) == 0 && isValidTag(tagName) {
|
||||||
attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr, sanitizerOptions)
|
attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, baseURL, tagName, token.Attr, sanitizerOptions)
|
||||||
if hasRequiredAttributes(tagName, attrNames) {
|
if hasRequiredAttributes(tagName, attrNames) {
|
||||||
if len(attrNames) > 0 {
|
if len(attrNames) > 0 {
|
||||||
// Rewrite the start tag with allowed attributes.
|
// Rewrite the start tag with allowed attributes.
|
||||||
|
@ -203,7 +206,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if len(blockedStack) == 0 && isValidTag(tagName) {
|
if len(blockedStack) == 0 && isValidTag(tagName) {
|
||||||
attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr, sanitizerOptions)
|
attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, baseURL, tagName, token.Attr, sanitizerOptions)
|
||||||
if hasRequiredAttributes(tagName, attrNames) {
|
if hasRequiredAttributes(tagName, attrNames) {
|
||||||
if len(attrNames) > 0 {
|
if len(attrNames) > 0 {
|
||||||
buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
|
buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
|
||||||
|
@ -216,7 +219,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
|
func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
|
||||||
var htmlAttrs, attrNames []string
|
var htmlAttrs, attrNames []string
|
||||||
var err error
|
var err error
|
||||||
var isImageLargerThanLayout bool
|
var isImageLargerThanLayout bool
|
||||||
|
@ -227,8 +230,6 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute, sa
|
||||||
isImageLargerThanLayout = imgWidth > 750
|
isImageLargerThanLayout = imgWidth > 750
|
||||||
}
|
}
|
||||||
|
|
||||||
parsedBaseUrl, _ := url.Parse(baseURL)
|
|
||||||
|
|
||||||
for _, attribute := range attributes {
|
for _, attribute := range attributes {
|
||||||
value := attribute.Val
|
value := attribute.Val
|
||||||
|
|
||||||
|
@ -265,7 +266,7 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute, sa
|
||||||
if isExternalResourceAttribute(attribute.Key) {
|
if isExternalResourceAttribute(attribute.Key) {
|
||||||
switch {
|
switch {
|
||||||
case tagName == "iframe":
|
case tagName == "iframe":
|
||||||
if !isValidIframeSource(baseURL, attribute.Val) {
|
if !isValidIframeSource(parsedBaseUrl, baseURL, attribute.Val) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
value = rewriteIframeURL(attribute.Val)
|
value = rewriteIframeURL(attribute.Val)
|
||||||
|
@ -447,7 +448,7 @@ func isBlockedResource(src string) bool {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func isValidIframeSource(baseURL, src string) bool {
|
func isValidIframeSource(parsedBaseUrl *url.URL, baseURL, src string) bool {
|
||||||
whitelist := []string{
|
whitelist := []string{
|
||||||
"bandcamp.com",
|
"bandcamp.com",
|
||||||
"cdn.embedly.com",
|
"cdn.embedly.com",
|
||||||
|
@ -464,8 +465,13 @@ func isValidIframeSource(baseURL, src string) bool {
|
||||||
}
|
}
|
||||||
domain := urllib.Domain(src)
|
domain := urllib.Domain(src)
|
||||||
|
|
||||||
|
baseDomain := baseURL
|
||||||
|
if parsedBaseUrl != nil {
|
||||||
|
baseDomain = parsedBaseUrl.Hostname()
|
||||||
|
}
|
||||||
|
|
||||||
// allow iframe from same origin
|
// allow iframe from same origin
|
||||||
if urllib.Domain(baseURL) == domain {
|
if baseDomain == domain {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue