feat: implement base element handling in content scraper

2025-09-15 18:57:04 +00:00 · 2024-07-24 21:41:09 -07:00 · 2024-07-24 21:41:09 -07:00 · 29387f2d60
commit 29387f2d60
parent c0f6e32a99
5 changed files with 224 additions and 38 deletions
--- a/internal/reader/readability/readability.go
+++ b/internal/reader/readability/readability.go
@@ -12,6 +12,8 @@ import (
 	"regexp"
 	"strings"

+	"miniflux.app/v2/internal/urllib"
+
 	"github.com/PuerkitoBio/goquery"
 	"golang.org/x/net/html"
 )
@@ -69,10 +71,17 @@ func (c candidateList) String() string {
 }

 // ExtractContent returns relevant content.
-func ExtractContent(page io.Reader) (string, error) {
+func ExtractContent(page io.Reader) (baseURL string, extractedContent string, err error) {
 	document, err := goquery.NewDocumentFromReader(page)
 	if err != nil {
-		return "", err
+		return "", "", err
+	}
+
+	if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
+		hrefValue = strings.TrimSpace(hrefValue)
+		if urllib.IsAbsoluteURL(hrefValue) {
+			baseURL = hrefValue
+		}
 	}

 	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
@@ -86,12 +95,13 @@ func ExtractContent(page io.Reader) (string, error) {
 	topCandidate := getTopCandidate(document, candidates)

 	slog.Debug("Readability parsing",
+		slog.String("base_url", baseURL),
 		slog.Any("candidates", candidates),
 		slog.Any("topCandidate", topCandidate),
 	)

-	output := getArticle(topCandidate, candidates)
-	return output, nil
+	extractedContent = getArticle(topCandidate, candidates)
+	return baseURL, extractedContent, nil
 }

 // Now that we have the top candidate, look through its siblings for content that might also be related.
--- a/internal/reader/readability/readability_test.go
+++ b/internal/reader/readability/readability_test.go
@@ -0,0 +1,102 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package readability // import "miniflux.app/v2/internal/reader/readability"
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestBaseURL(t *testing.T) { // A lone absolute <base href> must be reported as the base URL.
+	html := `
+		<html>
+			<head>
+				<base href="https://example.org/ ">
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html)) // the extracted content is deliberately ignored
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "https://example.org/" { // the fixture href ends with a space; the expected value has none
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
+	}
+}
+
+func TestMultipleBaseURL(t *testing.T) { // With two <base> elements, the first one's href must be reported.
+	html := `
+		<html>
+			<head>
+				<base href="https://example.org/ ">
+				<base href="https://example.com/ ">
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html)) // the extracted content is deliberately ignored
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "https://example.org/" { // example.org comes first in the fixture; example.com must not win
+		t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
+	}
+}
+
+func TestRelativeBaseURL(t *testing.T) { // A relative <base href> ("/test/") must not be reported as a base URL.
+	html := `
+		<html>
+			<head>
+				<base href="/test/ ">
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html)) // the extracted content is deliberately ignored
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "" { // an empty string is the expected result for a non-absolute href
+		t.Errorf(`Unexpected base URL, got %q`, baseURL)
+	}
+}
+
+func TestWithoutBaseURL(t *testing.T) { // A document with no <base> element must yield an empty base URL.
+	html := `
+		<html>
+			<head>
+				<title>Test</title>
+			</head>
+			<body>
+				<article>
+					Some content
+				</article>
+			</body>
+		</html>`
+
+	baseURL, _, err := ExtractContent(strings.NewReader(html)) // the extracted content is deliberately ignored
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if baseURL != "" { // the fixture's <head> only contains a <title>
+		t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
+	}
+}