2024-07-24 21:41:09 -07:00
|
|
|
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0
|
|
|
|
|
|
|
|
package readability // import "miniflux.app/v2/internal/reader/readability"
|
|
|
|
|
|
|
|
import (
|
2024-12-13 23:43:07 +01:00
|
|
|
"bytes"
|
|
|
|
"os"
|
2024-07-24 21:41:09 -07:00
|
|
|
"strings"
|
|
|
|
"testing"
|
|
|
|
)
|
|
|
|
|
|
|
|
func TestBaseURL(t *testing.T) {
|
|
|
|
html := `
|
|
|
|
<html>
|
|
|
|
<head>
|
|
|
|
<base href="https://example.org/ ">
|
|
|
|
</head>
|
|
|
|
<body>
|
|
|
|
<article>
|
|
|
|
Some content
|
|
|
|
</article>
|
|
|
|
</body>
|
|
|
|
</html>`
|
|
|
|
|
|
|
|
baseURL, _, err := ExtractContent(strings.NewReader(html))
|
|
|
|
if err != nil {
|
|
|
|
t.Fatal(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if baseURL != "https://example.org/" {
|
|
|
|
t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestMultipleBaseURL(t *testing.T) {
|
|
|
|
html := `
|
|
|
|
<html>
|
|
|
|
<head>
|
|
|
|
<base href="https://example.org/ ">
|
|
|
|
<base href="https://example.com/ ">
|
|
|
|
</head>
|
|
|
|
<body>
|
|
|
|
<article>
|
|
|
|
Some content
|
|
|
|
</article>
|
|
|
|
</body>
|
|
|
|
</html>`
|
|
|
|
|
|
|
|
baseURL, _, err := ExtractContent(strings.NewReader(html))
|
|
|
|
if err != nil {
|
|
|
|
t.Fatal(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if baseURL != "https://example.org/" {
|
|
|
|
t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestRelativeBaseURL(t *testing.T) {
|
|
|
|
html := `
|
|
|
|
<html>
|
|
|
|
<head>
|
|
|
|
<base href="/test/ ">
|
|
|
|
</head>
|
|
|
|
<body>
|
|
|
|
<article>
|
|
|
|
Some content
|
|
|
|
</article>
|
|
|
|
</body>
|
|
|
|
</html>`
|
|
|
|
|
|
|
|
baseURL, _, err := ExtractContent(strings.NewReader(html))
|
|
|
|
if err != nil {
|
|
|
|
t.Fatal(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if baseURL != "" {
|
|
|
|
t.Errorf(`Unexpected base URL, got %q`, baseURL)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestWithoutBaseURL(t *testing.T) {
|
|
|
|
html := `
|
|
|
|
<html>
|
|
|
|
<head>
|
|
|
|
<title>Test</title>
|
|
|
|
</head>
|
|
|
|
<body>
|
|
|
|
<article>
|
|
|
|
Some content
|
|
|
|
</article>
|
|
|
|
</body>
|
|
|
|
</html>`
|
|
|
|
|
|
|
|
baseURL, _, err := ExtractContent(strings.NewReader(html))
|
|
|
|
if err != nil {
|
|
|
|
t.Fatal(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if baseURL != "" {
|
|
|
|
t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
|
|
|
|
}
|
|
|
|
}
|
2024-12-13 04:41:56 +00:00
|
|
|
|
|
|
|
func TestRemoveStyleScript(t *testing.T) {
|
|
|
|
html := `
|
|
|
|
<html>
|
|
|
|
<head>
|
|
|
|
<title>Test</title>
|
|
|
|
<script src="tololo.js"></script>
|
|
|
|
</head>
|
|
|
|
<body>
|
|
|
|
<script src="tololo.js"></script>
|
|
|
|
<style>
|
|
|
|
h1 {color:red;}
|
|
|
|
p {color:blue;}
|
|
|
|
</style>
|
|
|
|
<article>Some content</article>
|
|
|
|
</body>
|
|
|
|
</html>`
|
|
|
|
want := `<div><div><article>Somecontent</article></div></div>`
|
|
|
|
|
|
|
|
_, content, err := ExtractContent(strings.NewReader(html))
|
|
|
|
if err != nil {
|
|
|
|
t.Fatal(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
content = strings.ReplaceAll(content, "\n", "")
|
|
|
|
content = strings.ReplaceAll(content, " ", "")
|
|
|
|
content = strings.ReplaceAll(content, "\t", "")
|
|
|
|
|
|
|
|
if content != want {
|
|
|
|
t.Errorf(`Invalid content, got %s instead of %s`, content, want)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestRemoveBlacklist(t *testing.T) {
|
|
|
|
html := `
|
|
|
|
<html>
|
|
|
|
<head>
|
|
|
|
<title>Test</title>
|
|
|
|
</head>
|
|
|
|
<body>
|
|
|
|
<article class="super-ad">Some content</article>
|
|
|
|
<article class="g-plus-crap">Some other thing</article>
|
|
|
|
<article class="stuff popupbody">And more</article>
|
|
|
|
<article class="legit">Valid!</article>
|
|
|
|
</body>
|
|
|
|
</html>`
|
|
|
|
want := `<div><div><articleclass="legit">Valid!</article></div></div>`
|
|
|
|
|
|
|
|
_, content, err := ExtractContent(strings.NewReader(html))
|
|
|
|
if err != nil {
|
|
|
|
t.Fatal(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
content = strings.ReplaceAll(content, "\n", "")
|
|
|
|
content = strings.ReplaceAll(content, " ", "")
|
|
|
|
content = strings.ReplaceAll(content, "\t", "")
|
|
|
|
|
|
|
|
if content != want {
|
|
|
|
t.Errorf(`Invalid content, got %s instead of %s`, content, want)
|
|
|
|
}
|
|
|
|
}
|
2024-12-13 23:43:07 +01:00
|
|
|
|
2025-06-19 21:03:53 -07:00
|
|
|
func TestNestedSpanInCodeBlock(t *testing.T) {
|
|
|
|
html := `
|
|
|
|
<html>
|
|
|
|
<head>
|
|
|
|
<title>Test</title>
|
|
|
|
</head>
|
|
|
|
<body>
|
|
|
|
<article><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></article>
|
|
|
|
</body>
|
|
|
|
</html>`
|
|
|
|
want := `<div><div><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></div></div>`
|
|
|
|
|
|
|
|
_, result, err := ExtractContent(strings.NewReader(html))
|
|
|
|
if err != nil {
|
|
|
|
t.Fatal(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if result != want {
|
|
|
|
t.Errorf(`Invalid content, got %s instead of %s`, result, want)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-12-13 23:43:07 +01:00
|
|
|
func BenchmarkExtractContent(b *testing.B) {
|
|
|
|
var testCases = map[string][]byte{
|
|
|
|
"miniflux_github.html": {},
|
|
|
|
"miniflux_wikipedia.html": {},
|
|
|
|
}
|
|
|
|
for filename := range testCases {
|
|
|
|
data, err := os.ReadFile("testdata/" + filename)
|
|
|
|
if err != nil {
|
|
|
|
b.Fatalf(`Unable to read file %q: %v`, filename, err)
|
|
|
|
}
|
|
|
|
testCases[filename] = data
|
|
|
|
}
|
|
|
|
for range b.N {
|
|
|
|
for _, v := range testCases {
|
|
|
|
ExtractContent(bytes.NewReader(v))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|