mirror of
https://github.com/miniflux/v2.git
synced 2025-08-11 17:51:01 +00:00
feat: implement base element handling in content scraper
This commit is contained in:
parent
c0f6e32a99
commit
29387f2d60
5 changed files with 224 additions and 38 deletions
102
internal/reader/readability/readability_test.go
Normal file
102
internal/reader/readability/readability_test.go
Normal file
|
@ -0,0 +1,102 @@
|
|||
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
package readability // import "miniflux.app/v2/internal/reader/readability"
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestBaseURL(t *testing.T) {
|
||||
html := `
|
||||
<html>
|
||||
<head>
|
||||
<base href="https://example.org/ ">
|
||||
</head>
|
||||
<body>
|
||||
<article>
|
||||
Some content
|
||||
</article>
|
||||
</body>
|
||||
</html>`
|
||||
|
||||
baseURL, _, err := ExtractContent(strings.NewReader(html))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if baseURL != "https://example.org/" {
|
||||
t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMultipleBaseURL(t *testing.T) {
|
||||
html := `
|
||||
<html>
|
||||
<head>
|
||||
<base href="https://example.org/ ">
|
||||
<base href="https://example.com/ ">
|
||||
</head>
|
||||
<body>
|
||||
<article>
|
||||
Some content
|
||||
</article>
|
||||
</body>
|
||||
</html>`
|
||||
|
||||
baseURL, _, err := ExtractContent(strings.NewReader(html))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if baseURL != "https://example.org/" {
|
||||
t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRelativeBaseURL(t *testing.T) {
|
||||
html := `
|
||||
<html>
|
||||
<head>
|
||||
<base href="/test/ ">
|
||||
</head>
|
||||
<body>
|
||||
<article>
|
||||
Some content
|
||||
</article>
|
||||
</body>
|
||||
</html>`
|
||||
|
||||
baseURL, _, err := ExtractContent(strings.NewReader(html))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if baseURL != "" {
|
||||
t.Errorf(`Unexpected base URL, got %q`, baseURL)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWithoutBaseURL(t *testing.T) {
|
||||
html := `
|
||||
<html>
|
||||
<head>
|
||||
<title>Test</title>
|
||||
</head>
|
||||
<body>
|
||||
<article>
|
||||
Some content
|
||||
</article>
|
||||
</body>
|
||||
</html>`
|
||||
|
||||
baseURL, _, err := ExtractContent(strings.NewReader(html))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if baseURL != "" {
|
||||
t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue