miniflux-v2/internal/reader/readability/readability_test.go

// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package readability // import "miniflux.app/v2/internal/reader/readability"

import (
	"bytes"
	"os"
	"strings"
	"testing"
)

// TestBaseURL feeds ExtractContent a document with a single absolute
// <base href> whose value carries a trailing space, and expects the base URL
// to come back without that space.
func TestBaseURL(t *testing.T) {
	const expected = "https://example.org/"
	const document = `
		<html>
			<head>
				<base href="https://example.org/ ">
			</head>
			<body>
				<article>
					Some content
				</article>
			</body>
		</html>`

	got, _, err := ExtractContent(strings.NewReader(document))
	if err != nil {
		t.Fatal(err)
	}
	if got == expected {
		return
	}
	t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, got)
}

// TestMultipleBaseURL feeds ExtractContent a document that declares two
// <base> elements and expects the base URL to match the first one.
func TestMultipleBaseURL(t *testing.T) {
	const expected = "https://example.org/"
	const document = `
		<html>
			<head>
				<base href="https://example.org/ ">
				<base href="https://example.com/ ">
			</head>
			<body>
				<article>
					Some content
				</article>
			</body>
		</html>`

	got, _, err := ExtractContent(strings.NewReader(document))
	if err != nil {
		t.Fatal(err)
	}
	if got == expected {
		return
	}
	t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, got)
}

// TestRelativeBaseURL feeds ExtractContent a document whose <base href> is a
// relative path ("/test/ ") and expects an empty base URL in return.
func TestRelativeBaseURL(t *testing.T) {
	html := `
		<html>
			<head>
				<base href="/test/ ">
			</head>
			<body>
				<article>
					Some content
				</article>
			</body>
		</html>`

	baseURL, _, err := ExtractContent(strings.NewReader(html))
	if err != nil {
		t.Fatal(err)
	}

	// Report the expected value as well, matching the other base URL tests.
	if baseURL != "" {
		t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
	}
}

// TestWithoutBaseURL feeds ExtractContent a document that has no <base>
// element and expects an empty base URL in return.
func TestWithoutBaseURL(t *testing.T) {
	const document = `
		<html>
			<head>
				<title>Test</title>
			</head>
			<body>
				<article>
					Some content
				</article>
			</body>
		</html>`

	got, _, err := ExtractContent(strings.NewReader(document))
	if err != nil {
		t.Fatal(err)
	}
	if got == "" {
		return
	}
	t.Errorf(`Unexpected base URL, got %q instead of ""`, got)
}

// TestRemoveStyleScript feeds ExtractContent a document containing <script>
// and <style> elements next to an <article> and expects only the article to
// remain in the extracted content. Newlines, spaces and tabs are stripped
// from the output before it is compared.
func TestRemoveStyleScript(t *testing.T) {
	const page = `
		<html>
			<head>
				<title>Test</title>
				    <script src="tololo.js"></script>
			</head>
			<body>
				<script src="tololo.js"></script>
				<style>
			  		h1 {color:red;}
			  		p {color:blue;}
				</style>
				<article>Some content</article>
			</body>
		</html>`
	const expected = `<div><div><article>Somecontent</article></div></div>`

	_, extracted, err := ExtractContent(strings.NewReader(page))
	if err != nil {
		t.Fatal(err)
	}

	// Strip newlines, spaces and tabs in a single pass so the comparison
	// does not depend on the layout of the output.
	compact := strings.NewReplacer("\n", "", " ", "", "\t", "").Replace(extracted)
	if compact == expected {
		return
	}
	t.Errorf(`Invalid content, got %s instead of %s`, compact, expected)
}

// TestRemoveBlacklist feeds ExtractContent four <article> elements with the
// classes "super-ad", "g-plus-crap", "stuff popupbody" and "legit", and
// expects only the "legit" one in the extracted content. Newlines, spaces and
// tabs are stripped from the output before it is compared.
func TestRemoveBlacklist(t *testing.T) {
	const page = `
		<html>
			<head>
				<title>Test</title>
			</head>
			<body>
				<article class="super-ad">Some content</article>
				<article class="g-plus-crap">Some other thing</article>
				<article class="stuff popupbody">And more</article>
				<article class="legit">Valid!</article>
			</body>
		</html>`
	const expected = `<div><div><articleclass="legit">Valid!</article></div></div>`

	_, extracted, err := ExtractContent(strings.NewReader(page))
	if err != nil {
		t.Fatal(err)
	}

	// Strip newlines, spaces and tabs in a single pass so the comparison
	// does not depend on the layout of the output.
	compact := strings.NewReplacer("\n", "", " ", "", "\t", "").Replace(extracted)
	if compact == expected {
		return
	}
	t.Errorf(`Invalid content, got %s instead of %s`, compact, expected)
}

// TestNestedSpanInCodeBlock feeds ExtractContent an article whose <pre><code>
// block contains nested <span> elements, one of them with the class
// "hljs-comment", and expects those spans to appear unchanged in the
// extracted content.
func TestNestedSpanInCodeBlock(t *testing.T) {
	const page = `
		<html>
			<head>
				<title>Test</title>
			</head>
			<body>
				<article><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></article>
			</body>
		</html>`
	const expected = `<div><div><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></div></div>`

	_, got, err := ExtractContent(strings.NewReader(page))
	if err != nil {
		t.Fatal(err)
	}
	if got == expected {
		return
	}
	t.Errorf(`Invalid content, got %s instead of %s`, got, expected)
}

func BenchmarkExtractContent(b *testing.B) {
	var testCases = map[string][]byte{
		"miniflux_github.html":    {},
		"miniflux_wikipedia.html": {},
	}
	for filename := range testCases {
		data, err := os.ReadFile("testdata/" + filename)
		if err != nil {
			b.Fatalf(`Unable to read file %q: %v`, filename, err)
		}
		testCases[filename] = data
	}
	for range b.N {
		for _, v := range testCases {
			ExtractContent(bytes.NewReader(v))
		}
	}
}
feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.`
			`// SPDX-License-Identifier: Apache-2.0`

			`package readability // import "miniflux.app/v2/internal/reader/readability"`

			`import (`
Refactor internal/reader/readability/testdata - Use chained strings.Contains instead of a regex for blacklistCandidatesRegexp, as this is a bit faster - Simplify a Find.Each.Remove to Find.Remove - Don't concatenate id and class for removeUnlikelyCandidates, as it makes no sense to match on overlaps. It might also marginally improve performances, as regex now have to run on two strings separately, instead of both. - Add a small benchmark 2024-12-13 23:43:07 +01:00			`"bytes"`
			`"os"`
feat: implement base element handling in content scraper 2024-07-24 21:41:09 -07:00			`"strings"`
			`"testing"`
			`)`

			`func TestBaseURL(t *testing.T) {`
			html := `
			`<html>`
			`<head>`
			`<base href="https://example.org/ ">`
			`</head>`
			`<body>`
			`<article>`
			`Some content`
			`</article>`
			`</body>`
			</html>`

			`baseURL, _, err := ExtractContent(strings.NewReader(html))`
			`if err != nil {`
			`t.Fatal(err)`
			`}`

			`if baseURL != "https://example.org/" {`
			t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
			`}`
			`}`

			`func TestMultipleBaseURL(t *testing.T) {`
			html := `
			`<html>`
			`<head>`
			`<base href="https://example.org/ ">`
			`<base href="https://example.com/ ">`
			`</head>`
			`<body>`
			`<article>`
			`Some content`
			`</article>`
			`</body>`
			</html>`

			`baseURL, _, err := ExtractContent(strings.NewReader(html))`
			`if err != nil {`
			`t.Fatal(err)`
			`}`

			`if baseURL != "https://example.org/" {`
			t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
			`}`
			`}`

			`func TestRelativeBaseURL(t *testing.T) {`
			html := `
			`<html>`
			`<head>`
			`<base href="/test/ ">`
			`</head>`
			`<body>`
			`<article>`
			`Some content`
			`</article>`
			`</body>`
			</html>`

			`baseURL, _, err := ExtractContent(strings.NewReader(html))`
			`if err != nil {`
			`t.Fatal(err)`
			`}`

			`if baseURL != "" {`
			t.Errorf(`Unexpected base URL, got %q`, baseURL)
			`}`
			`}`

			`func TestWithoutBaseURL(t *testing.T) {`
			html := `
			`<html>`
			`<head>`
			`<title>Test</title>`
			`</head>`
			`<body>`
			`<article>`
			`Some content`
			`</article>`
			`</body>`
			</html>`

			`baseURL, _, err := ExtractContent(strings.NewReader(html))`
			`if err != nil {`
			`t.Fatal(err)`
			`}`

			`if baseURL != "" {`
			t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
			`}`
			`}`
refactor(readability): various improvements and optimizations - Replace a completely overkill regex - Use `.Remove()` instead of a hand-rolled loop - Use a strings.Builder instead of a bytes.NewBufferString - Replace a call to Fprintf with string concatenation, as the latter are much faster - Remove a superfluous cast - Delay some computations - Add some tests 2024-12-13 04:41:56 +00:00
			`func TestRemoveStyleScript(t *testing.T) {`
			html := `
			`<html>`
			`<head>`
			`<title>Test</title>`
			`<script src="tololo.js"></script>`
			`</head>`
			`<body>`
			`<script src="tololo.js"></script>`
			`<style>`
			`h1 {color:red;}`
			`p {color:blue;}`
			`</style>`
			`<article>Some content</article>`
			`</body>`
			</html>`
			want := `<div><div><article>Somecontent</article></div></div>`

			`_, content, err := ExtractContent(strings.NewReader(html))`
			`if err != nil {`
			`t.Fatal(err)`
			`}`

			`content = strings.ReplaceAll(content, "\n", "")`
			`content = strings.ReplaceAll(content, " ", "")`
			`content = strings.ReplaceAll(content, "\t", "")`

			`if content != want {`
			t.Errorf(`Invalid content, got %s instead of %s`, content, want)
			`}`
			`}`

			`func TestRemoveBlacklist(t *testing.T) {`
			html := `
			`<html>`
			`<head>`
			`<title>Test</title>`
			`</head>`
			`<body>`
			`<article class="super-ad">Some content</article>`
			`<article class="g-plus-crap">Some other thing</article>`
			`<article class="stuff popupbody">And more</article>`
			`<article class="legit">Valid!</article>`
			`</body>`
			</html>`
			want := `<div><div><articleclass="legit">Valid!</article></div></div>`

			`_, content, err := ExtractContent(strings.NewReader(html))`
			`if err != nil {`
			`t.Fatal(err)`
			`}`

			`content = strings.ReplaceAll(content, "\n", "")`
			`content = strings.ReplaceAll(content, " ", "")`
			`content = strings.ReplaceAll(content, "\t", "")`

			`if content != want {`
			t.Errorf(`Invalid content, got %s instead of %s`, content, want)
			`}`
			`}`
Refactor internal/reader/readability/testdata - Use chained strings.Contains instead of a regex for blacklistCandidatesRegexp, as this is a bit faster - Simplify a Find.Each.Remove to Find.Remove - Don't concatenate id and class for removeUnlikelyCandidates, as it makes no sense to match on overlaps. It might also marginally improve performances, as regex now have to run on two strings separately, instead of both. - Add a small benchmark 2024-12-13 23:43:07 +01:00
fix(readability): do not remove elements within code blocks `<span class="hljs-comment"># exit 1</span>` will match the `unlikelyCandidatesRegexp` because it contains the `comment` string. 2025-06-19 21:03:53 -07:00			`func TestNestedSpanInCodeBlock(t *testing.T) {`
			html := `
			`<html>`
			`<head>`
			`<title>Test</title>`
			`</head>`
			`<body>`
			`<article><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></article>`
			`</body>`
			</html>`
			want := `<div><div><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></div></div>`

			`_, result, err := ExtractContent(strings.NewReader(html))`
			`if err != nil {`
			`t.Fatal(err)`
			`}`

			`if result != want {`
			t.Errorf(`Invalid content, got %s instead of %s`, result, want)
			`}`
			`}`

Refactor internal/reader/readability/testdata - Use chained strings.Contains instead of a regex for blacklistCandidatesRegexp, as this is a bit faster - Simplify a Find.Each.Remove to Find.Remove - Don't concatenate id and class for removeUnlikelyCandidates, as it makes no sense to match on overlaps. It might also marginally improve performances, as regex now have to run on two strings separately, instead of both. - Add a small benchmark 2024-12-13 23:43:07 +01:00			`func BenchmarkExtractContent(b *testing.B) {`
			`var testCases = map[string][]byte{`
			`"miniflux_github.html": {},`
			`"miniflux_wikipedia.html": {},`
			`}`
			`for filename := range testCases {`
			`data, err := os.ReadFile("testdata/" + filename)`
			`if err != nil {`
			b.Fatalf(`Unable to read file %q: %v`, filename, err)
			`}`
			`testCases[filename] = data`
			`}`
			`for range b.N {`
			`for _, v := range testCases {`
			`ExtractContent(bytes.NewReader(v))`
			`}`
			`}`
			`}`