diff --git a/internal/reader/sanitizer/sanitizer.go b/internal/reader/sanitizer/sanitizer.go index e9aaf06f..a5c0b6d1 100644 --- a/internal/reader/sanitizer/sanitizer.go +++ b/internal/reader/sanitizer/sanitizer.go @@ -204,10 +204,15 @@ func SanitizeHTMLWithDefaultOptions(baseURL, rawHTML string) string { } func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string { - var buffer strings.Builder var tagStack []string var parentTag string var blockedStack []string + var buffer strings.Builder + + // Educated guess about how big the sanitized HTML will be, + // to reduce the amount of buffer re-allocations in this function. + estimatedRatio := len(rawHTML) * 3 / 4 + buffer.Grow(estimatedRatio) // Errors are a non-issue, so they're handled later in the function. parsedBaseUrl, _ := url.Parse(baseURL) @@ -259,7 +264,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s } if len(blockedStack) == 0 && isValidTag(tagName) { - attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, baseURL, tagName, token.Attr, sanitizerOptions) + attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions) if hasRequiredAttributes(tagName, attrNames) { if len(attrNames) > 0 { // Rewrite the start tag with allowed attributes. 
@@ -287,7 +292,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s continue } if len(blockedStack) == 0 && isValidTag(tagName) { - attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, baseURL, tagName, token.Attr, sanitizerOptions) + attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions) if hasRequiredAttributes(tagName, attrNames) { if len(attrNames) > 0 { buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>") @@ -300,7 +305,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s } } -func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) { +func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) { var htmlAttrs, attrNames []string var err error var isAnchorLink bool @@ -339,11 +344,11 @@ func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attribu continue } case "srcset": - value = sanitizeSrcsetAttr(baseURL, value) + value = sanitizeSrcsetAttr(parsedBaseUrl, value) } case "source": if attribute.Key == "srcset" { - value = sanitizeSrcsetAttr(baseURL, value) + value = sanitizeSrcsetAttr(parsedBaseUrl, value) } } @@ -360,7 +365,7 @@ func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attribu value = attribute.Val isAnchorLink = true default: - value, err = urllib.AbsoluteURL(baseURL, value) + value, err = absoluteURLParsedBase(parsedBaseUrl, value) if err != nil { continue } @@ -541,11 +546,11 @@ func isBlockedTag(tagName string) bool { return false } -func sanitizeSrcsetAttr(baseURL, value string) string { +func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string { imageCandidates := ParseSrcSetAttribute(value) for _, imageCandidate := range imageCandidates { - if absoluteURL, err := 
urllib.AbsoluteURL(baseURL, imageCandidate.ImageURL); err == nil { + if absoluteURL, err := absoluteURLParsedBase(parsedBaseURL, imageCandidate.ImageURL); err == nil { imageCandidate.ImageURL = absoluteURL } } @@ -597,3 +602,19 @@ func isValidDecodingValue(value string) bool { } return false } + +// absoluteURLParsedBase is used instead of urllib.AbsoluteURL to avoid parsing baseURL over and over. +func absoluteURLParsedBase(parsedBaseURL *url.URL, input string) (string, error) { + absURL, u, err := urllib.GetAbsoluteURL(input) + if err != nil { + return "", err + } + if absURL != "" { + return absURL, nil + } + if parsedBaseURL == nil { + return "", nil + } + + return parsedBaseURL.ResolveReference(u).String(), nil +} diff --git a/internal/urllib/url.go b/internal/urllib/url.go index 259b7a9b..c7ad671b 100644 --- a/internal/urllib/url.go +++ b/internal/urllib/url.go @@ -18,22 +18,34 @@ func IsAbsoluteURL(link string) bool { return u.IsAbs() } -// AbsoluteURL converts the input URL as absolute URL if necessary. -func AbsoluteURL(baseURL, input string) (string, error) { +// GetAbsoluteURL returns the absolute form of `input` if possible, as well as its parsed form. +func GetAbsoluteURL(input string) (string, *url.URL, error) { if strings.HasPrefix(input, "//") { - return "https:" + input, nil + return "https:" + input, nil, nil } if strings.HasPrefix(input, "https://") || strings.HasPrefix(input, "http://") { - return input, nil + return input, nil, nil } u, err := url.Parse(input) if err != nil { - return "", fmt.Errorf("unable to parse input URL: %v", err) + return "", nil, fmt.Errorf("unable to parse input URL: %v", err) } if u.IsAbs() { - return u.String(), nil + return u.String(), u, nil + } + return "", u, nil +} + +// AbsoluteURL converts the input URL as absolute URL if necessary. 
+func AbsoluteURL(baseURL, input string) (string, error) { + absURL, u, err := GetAbsoluteURL(input) + if err != nil { + return "", err + } + if absURL != "" { + return absURL, nil } base, err := url.Parse(baseURL)