mirror of
https://github.com/miniflux/v2.git
synced 2025-09-15 18:57:04 +00:00
First commit
This commit is contained in:
commit
8ffb773f43
2121 changed files with 1118910 additions and 0 deletions
360
reader/sanitizer/sanitizer.go
Normal file
360
reader/sanitizer/sanitizer.go
Normal file
|
@ -0,0 +1,360 @@
|
|||
// Copyright 2017 Frédéric Guillot. All rights reserved.
|
||||
// Use of this source code is governed by the Apache 2.0
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package sanitizer
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"github.com/miniflux/miniflux2/reader/url"
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// Sanitize returns safe HTML.
|
||||
func Sanitize(baseURL, input string) string {
|
||||
tokenizer := html.NewTokenizer(bytes.NewBufferString(input))
|
||||
var buffer bytes.Buffer
|
||||
var tagStack []string
|
||||
|
||||
for {
|
||||
if tokenizer.Next() == html.ErrorToken {
|
||||
err := tokenizer.Err()
|
||||
if err == io.EOF {
|
||||
return buffer.String()
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
token := tokenizer.Token()
|
||||
switch token.Type {
|
||||
case html.TextToken:
|
||||
buffer.WriteString(token.Data)
|
||||
case html.StartTagToken:
|
||||
tagName := token.DataAtom.String()
|
||||
|
||||
if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
|
||||
attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
|
||||
|
||||
if hasRequiredAttributes(tagName, attrNames) {
|
||||
if len(attrNames) > 0 {
|
||||
buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
|
||||
} else {
|
||||
buffer.WriteString("<" + tagName + ">")
|
||||
}
|
||||
|
||||
tagStack = append(tagStack, tagName)
|
||||
}
|
||||
}
|
||||
case html.EndTagToken:
|
||||
tagName := token.DataAtom.String()
|
||||
if isValidTag(tagName) && inList(tagName, tagStack) {
|
||||
buffer.WriteString(fmt.Sprintf("</%s>", tagName))
|
||||
}
|
||||
case html.SelfClosingTagToken:
|
||||
tagName := token.DataAtom.String()
|
||||
if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
|
||||
attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
|
||||
|
||||
if hasRequiredAttributes(tagName, attrNames) {
|
||||
if len(attrNames) > 0 {
|
||||
buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
|
||||
} else {
|
||||
buffer.WriteString("<" + tagName + "/>")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) (attrNames []string, html string) {
|
||||
var htmlAttrs []string
|
||||
var err error
|
||||
|
||||
for _, attribute := range attributes {
|
||||
value := attribute.Val
|
||||
|
||||
if !isValidAttribute(tagName, attribute.Key) {
|
||||
continue
|
||||
}
|
||||
|
||||
if isExternalResourceAttribute(attribute.Key) {
|
||||
if tagName == "iframe" && !isValidIframeSource(attribute.Val) {
|
||||
continue
|
||||
} else {
|
||||
value, err = url.GetAbsoluteURL(baseURL, value)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if !hasValidScheme(value) || isBlacklistedResource(value) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
attrNames = append(attrNames, attribute.Key)
|
||||
htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s="%s"`, attribute.Key, value))
|
||||
}
|
||||
|
||||
extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName)
|
||||
if len(extraAttrNames) > 0 {
|
||||
attrNames = append(attrNames, extraAttrNames...)
|
||||
htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
|
||||
}
|
||||
|
||||
return attrNames, strings.Join(htmlAttrs, " ")
|
||||
}
|
||||
|
||||
func getExtraAttributes(tagName string) ([]string, []string) {
|
||||
if tagName == "a" {
|
||||
return []string{"rel", "target", "referrerpolicy"}, []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`}
|
||||
}
|
||||
|
||||
if tagName == "video" || tagName == "audio" {
|
||||
return []string{"controls"}, []string{"controls"}
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func isValidTag(tagName string) bool {
|
||||
for element := range getTagWhitelist() {
|
||||
if tagName == element {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func isValidAttribute(tagName, attributeName string) bool {
|
||||
for element, attributes := range getTagWhitelist() {
|
||||
if tagName == element {
|
||||
if inList(attributeName, attributes) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func isExternalResourceAttribute(attribute string) bool {
|
||||
switch attribute {
|
||||
case "src", "href", "poster", "cite":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func isPixelTracker(tagName string, attributes []html.Attribute) bool {
|
||||
if tagName == "img" {
|
||||
hasHeight := false
|
||||
hasWidth := false
|
||||
|
||||
for _, attribute := range attributes {
|
||||
if attribute.Key == "height" && attribute.Val == "1" {
|
||||
hasHeight = true
|
||||
}
|
||||
|
||||
if attribute.Key == "width" && attribute.Val == "1" {
|
||||
hasWidth = true
|
||||
}
|
||||
}
|
||||
|
||||
return hasHeight && hasWidth
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func hasRequiredAttributes(tagName string, attributes []string) bool {
|
||||
elements := make(map[string][]string)
|
||||
elements["a"] = []string{"href"}
|
||||
elements["iframe"] = []string{"src"}
|
||||
elements["img"] = []string{"src"}
|
||||
elements["source"] = []string{"src"}
|
||||
|
||||
for element, attrs := range elements {
|
||||
if tagName == element {
|
||||
for _, attribute := range attributes {
|
||||
for _, attr := range attrs {
|
||||
if attr == attribute {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func hasValidScheme(src string) bool {
|
||||
// See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
|
||||
whitelist := []string{
|
||||
"apt://",
|
||||
"bitcoin://",
|
||||
"callto://",
|
||||
"ed2k://",
|
||||
"facetime://",
|
||||
"feed://",
|
||||
"ftp://",
|
||||
"geo://",
|
||||
"gopher://",
|
||||
"git://",
|
||||
"http://",
|
||||
"https://",
|
||||
"irc://",
|
||||
"irc6://",
|
||||
"ircs://",
|
||||
"itms://",
|
||||
"jabber://",
|
||||
"magnet://",
|
||||
"mailto://",
|
||||
"maps://",
|
||||
"news://",
|
||||
"nfs://",
|
||||
"nntp://",
|
||||
"rtmp://",
|
||||
"sip://",
|
||||
"sips://",
|
||||
"skype://",
|
||||
"smb://",
|
||||
"sms://",
|
||||
"spotify://",
|
||||
"ssh://",
|
||||
"sftp://",
|
||||
"steam://",
|
||||
"svn://",
|
||||
"tel://",
|
||||
"webcal://",
|
||||
"xmpp://",
|
||||
}
|
||||
|
||||
for _, prefix := range whitelist {
|
||||
if strings.HasPrefix(src, prefix) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func isBlacklistedResource(src string) bool {
|
||||
blacklist := []string{
|
||||
"feedsportal.com",
|
||||
"api.flattr.com",
|
||||
"stats.wordpress.com",
|
||||
"plus.google.com/share",
|
||||
"twitter.com/share",
|
||||
"feeds.feedburner.com",
|
||||
}
|
||||
|
||||
for _, element := range blacklist {
|
||||
if strings.Contains(src, element) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func isValidIframeSource(src string) bool {
|
||||
whitelist := []string{
|
||||
"http://www.youtube.com",
|
||||
"https://www.youtube.com",
|
||||
"http://player.vimeo.com",
|
||||
"https://player.vimeo.com",
|
||||
"http://www.dailymotion.com",
|
||||
"https://www.dailymotion.com",
|
||||
"http://vk.com",
|
||||
"https://vk.com",
|
||||
}
|
||||
|
||||
for _, prefix := range whitelist {
|
||||
if strings.HasPrefix(src, prefix) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func getTagWhitelist() map[string][]string {
|
||||
whitelist := make(map[string][]string)
|
||||
whitelist["img"] = []string{"alt", "title", "src"}
|
||||
whitelist["audio"] = []string{"src"}
|
||||
whitelist["video"] = []string{"poster", "height", "width", "src"}
|
||||
whitelist["source"] = []string{"src", "type"}
|
||||
whitelist["dt"] = []string{}
|
||||
whitelist["dd"] = []string{}
|
||||
whitelist["dl"] = []string{}
|
||||
whitelist["table"] = []string{}
|
||||
whitelist["caption"] = []string{}
|
||||
whitelist["thead"] = []string{}
|
||||
whitelist["tfooter"] = []string{}
|
||||
whitelist["tr"] = []string{}
|
||||
whitelist["td"] = []string{"rowspan", "colspan"}
|
||||
whitelist["th"] = []string{"rowspan", "colspan"}
|
||||
whitelist["h1"] = []string{}
|
||||
whitelist["h2"] = []string{}
|
||||
whitelist["h3"] = []string{}
|
||||
whitelist["h4"] = []string{}
|
||||
whitelist["h5"] = []string{}
|
||||
whitelist["h6"] = []string{}
|
||||
whitelist["strong"] = []string{}
|
||||
whitelist["em"] = []string{}
|
||||
whitelist["code"] = []string{}
|
||||
whitelist["pre"] = []string{}
|
||||
whitelist["blockquote"] = []string{}
|
||||
whitelist["q"] = []string{"cite"}
|
||||
whitelist["p"] = []string{}
|
||||
whitelist["ul"] = []string{}
|
||||
whitelist["li"] = []string{}
|
||||
whitelist["ol"] = []string{}
|
||||
whitelist["br"] = []string{}
|
||||
whitelist["del"] = []string{}
|
||||
whitelist["a"] = []string{"href", "title"}
|
||||
whitelist["figure"] = []string{}
|
||||
whitelist["figcaption"] = []string{}
|
||||
whitelist["cite"] = []string{}
|
||||
whitelist["time"] = []string{"datetime"}
|
||||
whitelist["abbr"] = []string{"title"}
|
||||
whitelist["acronym"] = []string{"title"}
|
||||
whitelist["wbr"] = []string{}
|
||||
whitelist["dfn"] = []string{}
|
||||
whitelist["sub"] = []string{}
|
||||
whitelist["sup"] = []string{}
|
||||
whitelist["var"] = []string{}
|
||||
whitelist["samp"] = []string{}
|
||||
whitelist["s"] = []string{}
|
||||
whitelist["del"] = []string{}
|
||||
whitelist["ins"] = []string{}
|
||||
whitelist["kbd"] = []string{}
|
||||
whitelist["rp"] = []string{}
|
||||
whitelist["rt"] = []string{}
|
||||
whitelist["rtc"] = []string{}
|
||||
whitelist["ruby"] = []string{}
|
||||
whitelist["iframe"] = []string{"width", "height", "frameborder", "src", "allowfullscreen"}
|
||||
return whitelist
|
||||
}
|
||||
|
||||
func inList(needle string, haystack []string) bool {
|
||||
for _, element := range haystack {
|
||||
if element == needle {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
144
reader/sanitizer/sanitizer_test.go
Normal file
144
reader/sanitizer/sanitizer_test.go
Normal file
|
@ -0,0 +1,144 @@
|
|||
// Copyright 2017 Frédéric Guillot. All rights reserved.
|
||||
// Use of this source code is governed by the Apache 2.0
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package sanitizer
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestValidInput(t *testing.T) {
|
||||
input := `<p>This is a <strong>text</strong> with an image: <img src="http://example.org/" alt="Test">.</p>`
|
||||
output := Sanitize("http://example.org/", input)
|
||||
|
||||
if input != output {
|
||||
t.Errorf(`Wrong output: "%s" != "%s"`, input, output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSelfClosingTags(t *testing.T) {
|
||||
input := `<p>This <br> is a <strong>text</strong> <br/>with an image: <img src="http://example.org/" alt="Test"/>.</p>`
|
||||
output := Sanitize("http://example.org/", input)
|
||||
|
||||
if input != output {
|
||||
t.Errorf(`Wrong output: "%s" != "%s"`, input, output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTable(t *testing.T) {
|
||||
input := `<table><tr><th>A</th><th colspan="2">B</th></tr><tr><td>C</td><td>D</td><td>E</td></tr></table>`
|
||||
output := Sanitize("http://example.org/", input)
|
||||
|
||||
if input != output {
|
||||
t.Errorf(`Wrong output: "%s" != "%s"`, input, output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRelativeURL(t *testing.T) {
|
||||
input := `This <a href="/test.html">link is relative</a> and this image: <img src="../folder/image.png"/>`
|
||||
expected := `This <a href="http://example.org/test.html" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">link is relative</a> and this image: <img src="http://example.org/folder/image.png"/>`
|
||||
output := Sanitize("http://example.org/", input)
|
||||
|
||||
if expected != output {
|
||||
t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestProtocolRelativeURL(t *testing.T) {
|
||||
input := `This <a href="//static.example.org/index.html">link is relative</a>.`
|
||||
expected := `This <a href="https://static.example.org/index.html" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">link is relative</a>.`
|
||||
output := Sanitize("http://example.org/", input)
|
||||
|
||||
if expected != output {
|
||||
t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInvalidTag(t *testing.T) {
|
||||
input := `<p>My invalid <b>tag</b>.</p>`
|
||||
expected := `<p>My invalid tag.</p>`
|
||||
output := Sanitize("http://example.org/", input)
|
||||
|
||||
if expected != output {
|
||||
t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVideoTag(t *testing.T) {
|
||||
input := `<p>My valid <video src="videofile.webm" autoplay poster="posterimage.jpg">fallback</video>.</p>`
|
||||
expected := `<p>My valid <video src="http://example.org/videofile.webm" poster="http://example.org/posterimage.jpg" controls>fallback</video>.</p>`
|
||||
output := Sanitize("http://example.org/", input)
|
||||
|
||||
if expected != output {
|
||||
t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAudioAndSourceTag(t *testing.T) {
|
||||
input := `<p>My music <audio controls="controls"><source src="foo.wav" type="audio/wav"></audio>.</p>`
|
||||
expected := `<p>My music <audio controls><source src="http://example.org/foo.wav" type="audio/wav"></audio>.</p>`
|
||||
output := Sanitize("http://example.org/", input)
|
||||
|
||||
if expected != output {
|
||||
t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUnknownTag(t *testing.T) {
|
||||
input := `<p>My invalid <unknown>tag</unknown>.</p>`
|
||||
expected := `<p>My invalid tag.</p>`
|
||||
output := Sanitize("http://example.org/", input)
|
||||
|
||||
if expected != output {
|
||||
t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInvalidNestedTag(t *testing.T) {
|
||||
input := `<p>My invalid <b>tag with some <em>valid</em> tag</b>.</p>`
|
||||
expected := `<p>My invalid tag with some <em>valid</em> tag.</p>`
|
||||
output := Sanitize("http://example.org/", input)
|
||||
|
||||
if expected != output {
|
||||
t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInvalidIFrame(t *testing.T) {
|
||||
input := `<iframe src="http://example.org/"></iframe>`
|
||||
expected := ``
|
||||
output := Sanitize("http://example.org/", input)
|
||||
|
||||
if expected != output {
|
||||
t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInvalidURLScheme(t *testing.T) {
|
||||
input := `<p>This link is <a src="file:///etc/passwd">not valid</a></p>`
|
||||
expected := `<p>This link is not valid</p>`
|
||||
output := Sanitize("http://example.org/", input)
|
||||
|
||||
if expected != output {
|
||||
t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBlacklistedLink(t *testing.T) {
|
||||
input := `<p>This image is not valid <img src="https://stats.wordpress.com/some-tracker"></p>`
|
||||
expected := `<p>This image is not valid </p>`
|
||||
output := Sanitize("http://example.org/", input)
|
||||
|
||||
if expected != output {
|
||||
t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPixelTracker(t *testing.T) {
|
||||
input := `<p><img src="https://tracker1.example.org/" height="1" width="1"> and <img src="https://tracker2.example.org/" height="1" width="1"/></p>`
|
||||
expected := `<p> and </p>`
|
||||
output := Sanitize("http://example.org/", input)
|
||||
|
||||
if expected != output {
|
||||
t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
|
||||
}
|
||||
}
|
35
reader/sanitizer/strip_tags.go
Normal file
35
reader/sanitizer/strip_tags.go
Normal file
|
@ -0,0 +1,35 @@
|
|||
// Copyright 2017 Frédéric Guillot. All rights reserved.
|
||||
// Use of this source code is governed by the Apache 2.0
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package sanitizer
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// StripTags removes all HTML/XML tags from the input string.
|
||||
func StripTags(input string) string {
|
||||
tokenizer := html.NewTokenizer(bytes.NewBufferString(input))
|
||||
var buffer bytes.Buffer
|
||||
|
||||
for {
|
||||
if tokenizer.Next() == html.ErrorToken {
|
||||
err := tokenizer.Err()
|
||||
if err == io.EOF {
|
||||
return buffer.String()
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
token := tokenizer.Token()
|
||||
switch token.Type {
|
||||
case html.TextToken:
|
||||
buffer.WriteString(token.Data)
|
||||
}
|
||||
}
|
||||
}
|
17
reader/sanitizer/strip_tags_test.go
Normal file
17
reader/sanitizer/strip_tags_test.go
Normal file
|
@ -0,0 +1,17 @@
|
|||
// Copyright 2017 Frédéric Guillot. All rights reserved.
|
||||
// Use of this source code is governed by the Apache 2.0
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package sanitizer
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestStripTags(t *testing.T) {
|
||||
input := `This <a href="/test.html">link is relative</a> and <strong>this</strong> image: <img src="../folder/image.png"/>`
|
||||
expected := `This link is relative and this image: `
|
||||
output := StripTags(input)
|
||||
|
||||
if expected != output {
|
||||
t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue