1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-06 17:41:00 +00:00

perf(sanitizer): improve the performance of the sanitizer (#3497)

- Grow the underlying buffer of SanitizeHTML's strings.Builder to 3/4 of the
  raw HTML from the start, to reduce the number of iterative allocations. This
  number is a complete guesstimate, but it sounds reasonable to me.
- Add an `absoluteURLParsedBase` function to avoid parsing baseURL over and over.
This commit is contained in:
Julien Voisin 2025-07-08 00:21:13 +02:00 committed by GitHub
parent 15e4c3a374
commit a8b4e88742
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 48 additions and 15 deletions

View file

@ -204,10 +204,15 @@ func SanitizeHTMLWithDefaultOptions(baseURL, rawHTML string) string {
}
func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
var buffer strings.Builder
var tagStack []string
var parentTag string
var blockedStack []string
var buffer strings.Builder
// Educated guess about how big the sanitized HTML will be,
// to reduce the amount of buffer re-allocations in this function.
estimatedRatio := len(rawHTML) * 3 / 4
buffer.Grow(estimatedRatio)
// Errors are a non-issue, so they're handled later in the function.
parsedBaseUrl, _ := url.Parse(baseURL)
@ -259,7 +264,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
}
if len(blockedStack) == 0 && isValidTag(tagName) {
attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, baseURL, tagName, token.Attr, sanitizerOptions)
attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions)
if hasRequiredAttributes(tagName, attrNames) {
if len(attrNames) > 0 {
// Rewrite the start tag with allowed attributes.
@ -287,7 +292,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
continue
}
if len(blockedStack) == 0 && isValidTag(tagName) {
attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, baseURL, tagName, token.Attr, sanitizerOptions)
attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions)
if hasRequiredAttributes(tagName, attrNames) {
if len(attrNames) > 0 {
buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
@ -300,7 +305,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
}
}
func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
var htmlAttrs, attrNames []string
var err error
var isAnchorLink bool
@ -339,11 +344,11 @@ func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attribu
continue
}
case "srcset":
value = sanitizeSrcsetAttr(baseURL, value)
value = sanitizeSrcsetAttr(parsedBaseUrl, value)
}
case "source":
if attribute.Key == "srcset" {
value = sanitizeSrcsetAttr(baseURL, value)
value = sanitizeSrcsetAttr(parsedBaseUrl, value)
}
}
@ -360,7 +365,7 @@ func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attribu
value = attribute.Val
isAnchorLink = true
default:
value, err = urllib.AbsoluteURL(baseURL, value)
value, err = absoluteURLParsedBase(parsedBaseUrl, value)
if err != nil {
continue
}
@ -541,11 +546,11 @@ func isBlockedTag(tagName string) bool {
return false
}
func sanitizeSrcsetAttr(baseURL, value string) string {
func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string {
imageCandidates := ParseSrcSetAttribute(value)
for _, imageCandidate := range imageCandidates {
if absoluteURL, err := urllib.AbsoluteURL(baseURL, imageCandidate.ImageURL); err == nil {
if absoluteURL, err := absoluteURLParsedBase(parsedBaseURL, imageCandidate.ImageURL); err == nil {
imageCandidate.ImageURL = absoluteURL
}
}
@ -597,3 +602,19 @@ func isValidDecodingValue(value string) bool {
}
return false
}
// absoluteURLParsedBase turns input into an absolute URL using a base URL that
// the caller has already parsed. It stands in for urllib.AbsoluteURL so that the
// base URL does not have to be parsed again on every call.
//
// Outcomes, in order of precedence:
//   - the error reported by urllib.GetAbsoluteURL, if any;
//   - the absolute URL reported by urllib.GetAbsoluteURL, when it is non-empty;
//   - an empty string and a nil error when parsedBaseURL is nil;
//   - otherwise, the reference returned by urllib.GetAbsoluteURL resolved
//     against parsedBaseURL.
func absoluteURLParsedBase(parsedBaseURL *url.URL, input string) (string, error) {
	resolved, reference, parseErr := urllib.GetAbsoluteURL(input)

	switch {
	case parseErr != nil:
		return "", parseErr
	case resolved != "":
		// The input was already absolute; the base is not needed.
		return resolved, nil
	case parsedBaseURL == nil:
		// No usable base to resolve against: report nothing rather than an error.
		return "", nil
	}

	return parsedBaseURL.ResolveReference(reference).String(), nil
}

View file

@ -18,22 +18,34 @@ func IsAbsoluteURL(link string) bool {
return u.IsAbs()
}
// AbsoluteURL converts the input URL as absolute URL if necessary.
func AbsoluteURL(baseURL, input string) (string, error) {
// GetAbsoluteURL returns the absolute form of `input` if possible, as well as its parsed form.
func GetAbsoluteURL(input string) (string, *url.URL, error) {
if strings.HasPrefix(input, "//") {
return "https:" + input, nil
return "https:" + input, nil, nil
}
if strings.HasPrefix(input, "https://") || strings.HasPrefix(input, "http://") {
return input, nil
return input, nil, nil
}
u, err := url.Parse(input)
if err != nil {
return "", fmt.Errorf("unable to parse input URL: %v", err)
return "", nil, fmt.Errorf("unable to parse input URL: %v", err)
}
if u.IsAbs() {
return u.String(), nil
return u.String(), u, nil
}
return "", u, nil
}
// AbsoluteURL converts the input URL as absolute URL if necessary.
func AbsoluteURL(baseURL, input string) (string, error) {
absURL, u, err := GetAbsoluteURL(input)
if err != nil {
return "", err
}
if absURL != "" {
return absURL, nil
}
base, err := url.Parse(baseURL)