mirror of
https://github.com/miniflux/v2.git
synced 2025-08-06 17:41:00 +00:00
perf(sanitizer): improve the performance of the sanitizer (#3497)
- Grow the underlying buffer of SanitizeHTML's strings.Builder to 3/4 of the raw HTML's length from the start, to reduce the number of incremental reallocations. This ratio is purely a guesstimate, but it sounds reasonable to me. - Add an `absoluteURLParsedBase` function to avoid parsing baseURL over and over.
This commit is contained in:
parent
15e4c3a374
commit
a8b4e88742
2 changed files with 48 additions and 15 deletions
|
@ -204,10 +204,15 @@ func SanitizeHTMLWithDefaultOptions(baseURL, rawHTML string) string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
|
func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
|
||||||
var buffer strings.Builder
|
|
||||||
var tagStack []string
|
var tagStack []string
|
||||||
var parentTag string
|
var parentTag string
|
||||||
var blockedStack []string
|
var blockedStack []string
|
||||||
|
var buffer strings.Builder
|
||||||
|
|
||||||
|
// Educated guess about how big the sanitized HTML will be,
|
||||||
|
// to reduce the amount of buffer re-allocations in this function.
|
||||||
|
estimatedRatio := len(rawHTML) * 3 / 4
|
||||||
|
buffer.Grow(estimatedRatio)
|
||||||
|
|
||||||
// Errors are a non-issue, so they're handled later in the function.
|
// Errors are a non-issue, so they're handled later in the function.
|
||||||
parsedBaseUrl, _ := url.Parse(baseURL)
|
parsedBaseUrl, _ := url.Parse(baseURL)
|
||||||
|
@ -259,7 +264,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(blockedStack) == 0 && isValidTag(tagName) {
|
if len(blockedStack) == 0 && isValidTag(tagName) {
|
||||||
attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, baseURL, tagName, token.Attr, sanitizerOptions)
|
attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions)
|
||||||
if hasRequiredAttributes(tagName, attrNames) {
|
if hasRequiredAttributes(tagName, attrNames) {
|
||||||
if len(attrNames) > 0 {
|
if len(attrNames) > 0 {
|
||||||
// Rewrite the start tag with allowed attributes.
|
// Rewrite the start tag with allowed attributes.
|
||||||
|
@ -287,7 +292,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if len(blockedStack) == 0 && isValidTag(tagName) {
|
if len(blockedStack) == 0 && isValidTag(tagName) {
|
||||||
attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, baseURL, tagName, token.Attr, sanitizerOptions)
|
attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions)
|
||||||
if hasRequiredAttributes(tagName, attrNames) {
|
if hasRequiredAttributes(tagName, attrNames) {
|
||||||
if len(attrNames) > 0 {
|
if len(attrNames) > 0 {
|
||||||
buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
|
buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
|
||||||
|
@ -300,7 +305,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
|
func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
|
||||||
var htmlAttrs, attrNames []string
|
var htmlAttrs, attrNames []string
|
||||||
var err error
|
var err error
|
||||||
var isAnchorLink bool
|
var isAnchorLink bool
|
||||||
|
@ -339,11 +344,11 @@ func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attribu
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
case "srcset":
|
case "srcset":
|
||||||
value = sanitizeSrcsetAttr(baseURL, value)
|
value = sanitizeSrcsetAttr(parsedBaseUrl, value)
|
||||||
}
|
}
|
||||||
case "source":
|
case "source":
|
||||||
if attribute.Key == "srcset" {
|
if attribute.Key == "srcset" {
|
||||||
value = sanitizeSrcsetAttr(baseURL, value)
|
value = sanitizeSrcsetAttr(parsedBaseUrl, value)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -360,7 +365,7 @@ func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attribu
|
||||||
value = attribute.Val
|
value = attribute.Val
|
||||||
isAnchorLink = true
|
isAnchorLink = true
|
||||||
default:
|
default:
|
||||||
value, err = urllib.AbsoluteURL(baseURL, value)
|
value, err = absoluteURLParsedBase(parsedBaseUrl, value)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
@ -541,11 +546,11 @@ func isBlockedTag(tagName string) bool {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func sanitizeSrcsetAttr(baseURL, value string) string {
|
func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string {
|
||||||
imageCandidates := ParseSrcSetAttribute(value)
|
imageCandidates := ParseSrcSetAttribute(value)
|
||||||
|
|
||||||
for _, imageCandidate := range imageCandidates {
|
for _, imageCandidate := range imageCandidates {
|
||||||
if absoluteURL, err := urllib.AbsoluteURL(baseURL, imageCandidate.ImageURL); err == nil {
|
if absoluteURL, err := absoluteURLParsedBase(parsedBaseURL, imageCandidate.ImageURL); err == nil {
|
||||||
imageCandidate.ImageURL = absoluteURL
|
imageCandidate.ImageURL = absoluteURL
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -597,3 +602,19 @@ func isValidDecodingValue(value string) bool {
|
||||||
}
|
}
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// absoluteURLParsedBase is used instead of urllib.AbsoluteURL to avoid parsing baseURL over and over.
|
||||||
|
func absoluteURLParsedBase(parsedBaseURL *url.URL, input string) (string, error) {
|
||||||
|
absURL, u, err := urllib.GetAbsoluteURL(input)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
if absURL != "" {
|
||||||
|
return absURL, nil
|
||||||
|
}
|
||||||
|
if parsedBaseURL == nil {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return parsedBaseURL.ResolveReference(u).String(), nil
|
||||||
|
}
|
||||||
|
|
|
@ -18,22 +18,34 @@ func IsAbsoluteURL(link string) bool {
|
||||||
return u.IsAbs()
|
return u.IsAbs()
|
||||||
}
|
}
|
||||||
|
|
||||||
// AbsoluteURL converts the input URL as absolute URL if necessary.
|
// GetAbsoluteURL returns the absolute form of `input` if possible, as well as its parsed form.
|
||||||
func AbsoluteURL(baseURL, input string) (string, error) {
|
func GetAbsoluteURL(input string) (string, *url.URL, error) {
|
||||||
if strings.HasPrefix(input, "//") {
|
if strings.HasPrefix(input, "//") {
|
||||||
return "https:" + input, nil
|
return "https:" + input, nil, nil
|
||||||
}
|
}
|
||||||
if strings.HasPrefix(input, "https://") || strings.HasPrefix(input, "http://") {
|
if strings.HasPrefix(input, "https://") || strings.HasPrefix(input, "http://") {
|
||||||
return input, nil
|
return input, nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
u, err := url.Parse(input)
|
u, err := url.Parse(input)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("unable to parse input URL: %v", err)
|
return "", nil, fmt.Errorf("unable to parse input URL: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if u.IsAbs() {
|
if u.IsAbs() {
|
||||||
return u.String(), nil
|
return u.String(), u, nil
|
||||||
|
}
|
||||||
|
return "", u, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// AbsoluteURL converts the input URL as absolute URL if necessary.
|
||||||
|
func AbsoluteURL(baseURL, input string) (string, error) {
|
||||||
|
absURL, u, err := GetAbsoluteURL(input)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
if absURL != "" {
|
||||||
|
return absURL, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
base, err := url.Parse(baseURL)
|
base, err := url.Parse(baseURL)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue