1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-06 17:41:00 +00:00

perf(sanitizer): improve the performance of the sanitizer (#3497)

- Grow the underlying buffer of SanitizeHTML's strings.Builder to 3/4 of the
  raw HTML's length from the start, to reduce the number of iterative
  allocations. This number is a complete guesstimate, but it sounds reasonable to me.
- Add an `absoluteURLParsedBase` function to avoid parsing baseURL over and over.
This commit is contained in:
Julien Voisin 2025-07-08 00:21:13 +02:00 committed by GitHub
parent 15e4c3a374
commit a8b4e88742
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 48 additions and 15 deletions

View file

@ -204,10 +204,15 @@ func SanitizeHTMLWithDefaultOptions(baseURL, rawHTML string) string {
} }
func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string { func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
var buffer strings.Builder
var tagStack []string var tagStack []string
var parentTag string var parentTag string
var blockedStack []string var blockedStack []string
var buffer strings.Builder
// Educated guess about how big the sanitized HTML will be,
// to reduce the amount of buffer re-allocations in this function.
estimatedRatio := len(rawHTML) * 3 / 4
buffer.Grow(estimatedRatio)
// Errors are a non-issue, so they're handled later in the function. // Errors are a non-issue, so they're handled later in the function.
parsedBaseUrl, _ := url.Parse(baseURL) parsedBaseUrl, _ := url.Parse(baseURL)
@ -259,7 +264,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
} }
if len(blockedStack) == 0 && isValidTag(tagName) { if len(blockedStack) == 0 && isValidTag(tagName) {
attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, baseURL, tagName, token.Attr, sanitizerOptions) attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions)
if hasRequiredAttributes(tagName, attrNames) { if hasRequiredAttributes(tagName, attrNames) {
if len(attrNames) > 0 { if len(attrNames) > 0 {
// Rewrite the start tag with allowed attributes. // Rewrite the start tag with allowed attributes.
@ -287,7 +292,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
continue continue
} }
if len(blockedStack) == 0 && isValidTag(tagName) { if len(blockedStack) == 0 && isValidTag(tagName) {
attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, baseURL, tagName, token.Attr, sanitizerOptions) attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions)
if hasRequiredAttributes(tagName, attrNames) { if hasRequiredAttributes(tagName, attrNames) {
if len(attrNames) > 0 { if len(attrNames) > 0 {
buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>") buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
@ -300,7 +305,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
} }
} }
func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) { func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
var htmlAttrs, attrNames []string var htmlAttrs, attrNames []string
var err error var err error
var isAnchorLink bool var isAnchorLink bool
@ -339,11 +344,11 @@ func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attribu
continue continue
} }
case "srcset": case "srcset":
value = sanitizeSrcsetAttr(baseURL, value) value = sanitizeSrcsetAttr(parsedBaseUrl, value)
} }
case "source": case "source":
if attribute.Key == "srcset" { if attribute.Key == "srcset" {
value = sanitizeSrcsetAttr(baseURL, value) value = sanitizeSrcsetAttr(parsedBaseUrl, value)
} }
} }
@ -360,7 +365,7 @@ func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attribu
value = attribute.Val value = attribute.Val
isAnchorLink = true isAnchorLink = true
default: default:
value, err = urllib.AbsoluteURL(baseURL, value) value, err = absoluteURLParsedBase(parsedBaseUrl, value)
if err != nil { if err != nil {
continue continue
} }
@ -541,11 +546,11 @@ func isBlockedTag(tagName string) bool {
return false return false
} }
func sanitizeSrcsetAttr(baseURL, value string) string { func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string {
imageCandidates := ParseSrcSetAttribute(value) imageCandidates := ParseSrcSetAttribute(value)
for _, imageCandidate := range imageCandidates { for _, imageCandidate := range imageCandidates {
if absoluteURL, err := urllib.AbsoluteURL(baseURL, imageCandidate.ImageURL); err == nil { if absoluteURL, err := absoluteURLParsedBase(parsedBaseURL, imageCandidate.ImageURL); err == nil {
imageCandidate.ImageURL = absoluteURL imageCandidate.ImageURL = absoluteURL
} }
} }
@ -597,3 +602,19 @@ func isValidDecodingValue(value string) bool {
} }
return false return false
} }
// absoluteURLParsedBase resolves input against an already-parsed base URL.
//
// It exists so that callers processing many attributes of the same document
// do not have to re-parse the base URL for every single one, which is what
// urllib.AbsoluteURL would do.
//
// The result is:
//   - the error reported by urllib.GetAbsoluteURL when input cannot be parsed;
//   - the absolute URL reported by urllib.GetAbsoluteURL when input is already
//     absolute (or protocol-relative);
//   - an empty string and a nil error when input is relative and
//     parsedBaseURL is nil, i.e. there is nothing to resolve against;
//   - otherwise, input resolved against parsedBaseURL.
func absoluteURLParsedBase(parsedBaseURL *url.URL, input string) (string, error) {
	resolved, ref, parseErr := urllib.GetAbsoluteURL(input)

	switch {
	case parseErr != nil:
		return "", parseErr
	case resolved != "":
		// Input was already absolute: no base needed.
		return resolved, nil
	case parsedBaseURL == nil:
		// Relative input without a usable base: report "no URL" rather
		// than an error, leaving it to the caller to discard the value.
		return "", nil
	}

	return parsedBaseURL.ResolveReference(ref).String(), nil
}

View file

@ -18,22 +18,34 @@ func IsAbsoluteURL(link string) bool {
return u.IsAbs() return u.IsAbs()
} }
// AbsoluteURL converts the input URL as absolute URL if necessary. // GetAbsoluteURL returns the absolute form of `input` if possible, as well as its parsed form.
func AbsoluteURL(baseURL, input string) (string, error) { func GetAbsoluteURL(input string) (string, *url.URL, error) {
if strings.HasPrefix(input, "//") { if strings.HasPrefix(input, "//") {
return "https:" + input, nil return "https:" + input, nil, nil
} }
if strings.HasPrefix(input, "https://") || strings.HasPrefix(input, "http://") { if strings.HasPrefix(input, "https://") || strings.HasPrefix(input, "http://") {
return input, nil return input, nil, nil
} }
u, err := url.Parse(input) u, err := url.Parse(input)
if err != nil { if err != nil {
return "", fmt.Errorf("unable to parse input URL: %v", err) return "", nil, fmt.Errorf("unable to parse input URL: %v", err)
} }
if u.IsAbs() { if u.IsAbs() {
return u.String(), nil return u.String(), u, nil
}
return "", u, nil
}
// AbsoluteURL converts the input URL as absolute URL if necessary.
func AbsoluteURL(baseURL, input string) (string, error) {
absURL, u, err := GetAbsoluteURL(input)
if err != nil {
return "", err
}
if absURL != "" {
return absURL, nil
} }
base, err := url.Parse(baseURL) base, err := url.Parse(baseURL)