refactor(readability): various improvements and optimizations

- Replace a completely overkill regex
- Use `.Remove()` instead of a hand-rolled loop
- Use a strings.Builder instead of a bytes.NewBufferString
- Replace a call to Fprintf with string concatenation, as the latter is much faster
- Remove a superfluous cast
- Delay some computations
- Add some tests
2025-09-05 18:41:01 +00:00 · 2024-12-13 04:41:56 +00:00 · 2024-12-13 04:41:56 +00:00 · 6ad5ad0bb2
commit 6ad5ad0bb2
parent 113abeea59
2 changed files with 84 additions and 29 deletions
--- a/internal/reader/readability/readability_test.go
+++ b/internal/reader/readability/readability_test.go
@@ -100,3 +100,64 @@ func TestWithoutBaseURL(t *testing.T) {
 		t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
 	}
 }
+
+// TestRemoveStyleScript expects the extracted content to keep only the <article>, without <script> or <style>.
+func TestRemoveStyleScript(t *testing.T) {
+	const input = `
+		<html>
+			<head>
+				<title>Test</title>
+				    <script src="tololo.js"></script>
+			</head>
+			<body>
+				<script src="tololo.js"></script>
+				<style>
+			  		h1 {color:red;}
+			  		p {color:blue;}
+				</style>
+				<article>Some content</article>
+			</body>
+		</html>`
+	const expected = `<div><div><article>Somecontent</article></div></div>`
+
+	_, got, err := ExtractContent(strings.NewReader(input))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Drop newlines, spaces and tabs so the comparison ignores how the output is laid out.
+	got = strings.NewReplacer("\n", "", " ", "", "\t", "").Replace(got)
+
+	if got != expected {
+		t.Errorf(`Invalid content, got %s instead of %s`, got, expected)
+	}
+}
+
+// TestRemoveBlacklist expects only the article with class "legit" to remain in the extracted content.
+func TestRemoveBlacklist(t *testing.T) {
+	const input = `
+		<html>
+			<head>
+				<title>Test</title>
+			</head>
+			<body>
+				<article class="super-ad">Some content</article>
+				<article class="g-plus-crap">Some other thing</article>
+				<article class="stuff popupbody">And more</article>
+				<article class="legit">Valid!</article>
+			</body>
+		</html>`
+	const expected = `<div><div><articleclass="legit">Valid!</article></div></div>`
+
+	_, got, err := ExtractContent(strings.NewReader(input))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Drop newlines, spaces and tabs so the comparison ignores how the output is laid out.
+	got = strings.NewReplacer("\n", "", " ", "", "\t", "").Replace(got)
+
+	if got != expected {
+		t.Errorf(`Invalid content, got %s instead of %s`, got, expected)
+	}
+}