1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-11 17:51:01 +00:00

feat: implement base element handling in content scraper

This commit is contained in:
Frédéric Guillot 2024-07-24 21:41:09 -07:00
parent c0f6e32a99
commit 29387f2d60
5 changed files with 224 additions and 38 deletions

View file

@ -62,7 +62,7 @@ func TestSelectorRules(t *testing.T) {
t.Fatalf(`Unable to read file %q: %v`, filename, err)
}
actualResult, err := findContentUsingCustomRules(bytes.NewReader(html), rule)
_, actualResult, err := findContentUsingCustomRules(bytes.NewReader(html), rule)
if err != nil {
t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err)
}
@ -73,7 +73,67 @@ func TestSelectorRules(t *testing.T) {
}
if actualResult != strings.TrimSpace(string(expectedResult)) {
t.Errorf(`Unexpected result for %q, got "%s" instead of "%s"`, rule, actualResult, expectedResult)
t.Errorf(`Unexpected result for %q, got %q instead of %q`, rule, actualResult, expectedResult)
}
}
}
// TestParseBaseURLWithCustomRules feeds a document whose <head> holds a single
// <base> element with an absolute href and checks that the first value
// returned by findContentUsingCustomRules is that href.
func TestParseBaseURLWithCustomRules(t *testing.T) {
	const page = `<html><head><base href="https://example.com/"></head><body><img src="image.jpg"></body></html>`

	got, _, err := findContentUsingCustomRules(strings.NewReader(page), "img")
	if err != nil {
		t.Fatalf(`Scraping error: %v`, err)
	}

	if want := "https://example.com/"; got != want {
		t.Errorf(`Unexpected base URL, got %q instead of "https://example.com/"`, got)
	}
}
// TestParseMultipleBaseURLWithCustomRules uses a document that declares two
// <base> elements and expects the href of the first one to be reported as the
// base URL.
func TestParseMultipleBaseURLWithCustomRules(t *testing.T) {
	const page = `<html><head><base href="https://example.com/"><base href="https://example.org/"/></head><body><img src="image.jpg"></body></html>`
	const firstBase = "https://example.com/"

	input := strings.NewReader(page)
	got, _, err := findContentUsingCustomRules(input, "img")
	if err != nil {
		t.Fatalf(`Scraping error: %v`, err)
	}

	if got == firstBase {
		return
	}
	t.Errorf(`Unexpected base URL, got %q instead of "https://example.com/"`, got)
}
// TestParseRelativeBaseURLWithCustomRules uses a document whose <base> element
// carries a relative href ("/test") and expects findContentUsingCustomRules to
// report an empty base URL.
func TestParseRelativeBaseURLWithCustomRules(t *testing.T) {
	html := `<html><head><base href="/test"></head><body><img src="image.jpg"></body></html>`
	baseURL, _, err := findContentUsingCustomRules(strings.NewReader(html), "img")
	if err != nil {
		t.Fatalf(`Scraping error: %v`, err)
	}

	if baseURL != "" {
		// State the expected value as well, matching the sibling base URL tests.
		t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
	}
}
// TestParseEmptyBaseURLWithCustomRules uses a document whose <base> href is a
// single space and expects an empty base URL to be reported.
func TestParseEmptyBaseURLWithCustomRules(t *testing.T) {
	const page = `<html><head><base href=" "></head><body><img src="image.jpg"></body></html>`

	got, _, err := findContentUsingCustomRules(strings.NewReader(page), "img")
	if err != nil {
		t.Fatalf(`Scraping error: %v`, err)
	}

	if len(got) != 0 {
		t.Errorf(`Unexpected base URL, got %q instead of ""`, got)
	}
}
// TestParseMissingBaseURLWithCustomRules uses a document with no <base>
// element at all and expects an empty base URL to be reported.
func TestParseMissingBaseURLWithCustomRules(t *testing.T) {
	const page = `<html><head></head><body><img src="image.jpg"></body></html>`

	input := strings.NewReader(page)
	got, _, err := findContentUsingCustomRules(input, "img")
	if err != nil {
		t.Fatalf(`Scraping error: %v`, err)
	}

	if got == "" {
		return
	}
	t.Errorf(`Unexpected base URL, got %q instead of ""`, got)
}