feat: implement base element handling in content scraper

2025-09-15 18:57:04 +00:00 · 2024-07-24 21:41:09 -07:00 · 2024-07-24 21:41:09 -07:00 · 29387f2d60
commit 29387f2d60
parent c0f6e32a99
5 changed files with 224 additions and 38 deletions
--- a/internal/reader/readability/readability_test.go
+++ b/internal/reader/readability/readability_test.go
@@ -0,0 +1,102 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package readability // import "miniflux.app/v2/internal/reader/readability"
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestBaseURL expects the absolute <base> href to be returned without its trailing space.
+func TestBaseURL(t *testing.T) {
+	const page = `
+		<html>
+			<head>
+				<base href="https://example.org/ ">
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+	got, _, err := ExtractContent(strings.NewReader(page))
+	switch {
+	case err != nil:
+		t.Fatal(err)
+	case got != "https://example.org/":
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, got)
+	}
+}
+
+// TestMultipleBaseURL expects the first of two <base> elements to supply the base URL.
+func TestMultipleBaseURL(t *testing.T) {
+	const page = `
+		<html>
+			<head>
+				<base href="https://example.org/ ">
+				<base href="https://example.com/ ">
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+	got, _, err := ExtractContent(strings.NewReader(page))
+	switch {
+	case err != nil:
+		t.Fatal(err)
+	case got != "https://example.org/":
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, got)
+	}
+}
+
+// TestRelativeBaseURL expects an empty base URL when the <base> href is a relative path.
+func TestRelativeBaseURL(t *testing.T) {
+	const page = `
+		<html>
+			<head>
+				<base href="/test/ ">
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+	got, _, err := ExtractContent(strings.NewReader(page))
+	switch {
+	case err != nil:
+		t.Fatal(err)
+	case got != "":
+		t.Errorf(`Unexpected base URL, got %q instead of ""`, got)
+	}
+}
+
+// TestWithoutBaseURL expects an empty base URL when the document has no <base> element.
+func TestWithoutBaseURL(t *testing.T) {
+	const page = `
+		<html>
+			<head>
+				<title>Test</title>
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+	got, _, err := ExtractContent(strings.NewReader(page))
+	switch {
+	case err != nil:
+		t.Fatal(err)
+	case got != "":
+		t.Errorf(`Unexpected base URL, got %q instead of ""`, got)
+	}
+}