1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-11 17:51:01 +00:00

feat: implement base element handling in content scraper

This commit is contained in:
Frédéric Guillot 2024-07-24 21:41:09 -07:00
parent c0f6e32a99
commit 29387f2d60
5 changed files with 224 additions and 38 deletions

View file

@ -0,0 +1,102 @@
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package readability // import "miniflux.app/v2/internal/reader/readability"
import (
"strings"
"testing"
)
func TestBaseURL(t *testing.T) {
html := `
<html>
<head>
<base href="https://example.org/ ">
</head>
<body>
<article>
Some content
</article>
</body>
</html>`
baseURL, _, err := ExtractContent(strings.NewReader(html))
if err != nil {
t.Fatal(err)
}
if baseURL != "https://example.org/" {
t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
}
}
func TestMultipleBaseURL(t *testing.T) {
html := `
<html>
<head>
<base href="https://example.org/ ">
<base href="https://example.com/ ">
</head>
<body>
<article>
Some content
</article>
</body>
</html>`
baseURL, _, err := ExtractContent(strings.NewReader(html))
if err != nil {
t.Fatal(err)
}
if baseURL != "https://example.org/" {
t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
}
}
func TestRelativeBaseURL(t *testing.T) {
html := `
<html>
<head>
<base href="/test/ ">
</head>
<body>
<article>
Some content
</article>
</body>
</html>`
baseURL, _, err := ExtractContent(strings.NewReader(html))
if err != nil {
t.Fatal(err)
}
if baseURL != "" {
t.Errorf(`Unexpected base URL, got %q`, baseURL)
}
}
func TestWithoutBaseURL(t *testing.T) {
html := `
<html>
<head>
<title>Test</title>
</head>
<body>
<article>
Some content
</article>
</body>
</html>`
baseURL, _, err := ExtractContent(strings.NewReader(html))
if err != nil {
t.Fatal(err)
}
if baseURL != "" {
t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
}
}