// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. // SPDX-License-Identifier: Apache-2.0 package readability // import "miniflux.app/v2/internal/reader/readability" import ( "bytes" "os" "strings" "testing" "github.com/PuerkitoBio/goquery" ) func TestBaseURL(t *testing.T) { html := `
Some content
` baseURL, _, err := ExtractContent(strings.NewReader(html)) if err != nil { t.Fatal(err) } if baseURL != "https://example.org/" { t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL) } } func TestMultipleBaseURL(t *testing.T) { html := `
Some content
` baseURL, _, err := ExtractContent(strings.NewReader(html)) if err != nil { t.Fatal(err) } if baseURL != "https://example.org/" { t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL) } } func TestRelativeBaseURL(t *testing.T) { html := `
Some content
` baseURL, _, err := ExtractContent(strings.NewReader(html)) if err != nil { t.Fatal(err) } if baseURL != "" { t.Errorf(`Unexpected base URL, got %q`, baseURL) } } func TestWithoutBaseURL(t *testing.T) { html := ` Test
Some content
` baseURL, _, err := ExtractContent(strings.NewReader(html)) if err != nil { t.Fatal(err) } if baseURL != "" { t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL) } } func TestRemoveStyleScript(t *testing.T) { html := ` Test
Some content
` want := `
Somecontent
` _, content, err := ExtractContent(strings.NewReader(html)) if err != nil { t.Fatal(err) } content = strings.ReplaceAll(content, "\n", "") content = strings.ReplaceAll(content, " ", "") content = strings.ReplaceAll(content, "\t", "") if content != want { t.Errorf(`Invalid content, got %s instead of %s`, content, want) } } func TestRemoveBlacklist(t *testing.T) { html := ` Test
Some content
Some other thing
And more
Valid!
` want := `
Valid!
` _, content, err := ExtractContent(strings.NewReader(html)) if err != nil { t.Fatal(err) } content = strings.ReplaceAll(content, "\n", "") content = strings.ReplaceAll(content, " ", "") content = strings.ReplaceAll(content, "\t", "") if content != want { t.Errorf(`Invalid content, got %s instead of %s`, content, want) } } func TestNestedSpanInCodeBlock(t *testing.T) { html := ` Test

Some content

Code block with nested span # exit 1
` want := `

Some content

Code block with nested span # exit 1
` _, result, err := ExtractContent(strings.NewReader(html)) if err != nil { t.Fatal(err) } if result != want { t.Errorf(`Invalid content, got %s instead of %s`, result, want) } } func BenchmarkExtractContent(b *testing.B) { var testCases = map[string][]byte{ "miniflux_github.html": {}, "miniflux_wikipedia.html": {}, } for filename := range testCases { data, err := os.ReadFile("testdata/" + filename) if err != nil { b.Fatalf(`Unable to read file %q: %v`, filename, err) } testCases[filename] = data } for range b.N { for _, v := range testCases { ExtractContent(bytes.NewReader(v)) } } } func TestGetClassWeight(t *testing.T) { testCases := []struct { name string html string expected float32 }{ { name: "no class or id", html: `
content
`, expected: 0, }, { name: "positive class only", html: `
content
`, expected: 25, }, { name: "negative class only", html: `
content
`, expected: -25, }, { name: "positive id only", html: `
content
`, expected: 25, }, { name: "negative id only", html: ``, expected: -25, }, { name: "positive class and positive id", html: `
content
`, expected: 50, }, { name: "negative class and negative id", html: ``, expected: -50, }, { name: "positive class and negative id", html: `
content
`, expected: 0, }, { name: "negative class and positive id", html: ``, expected: 0, }, { name: "multiple positive classes", html: `
content
`, expected: 25, }, { name: "multiple negative classes", html: ``, expected: -25, }, { name: "mixed positive and negative classes", html: `
content
`, expected: -25, // negative takes precedence since it's checked first }, { name: "case insensitive class", html: `
content
`, expected: 25, }, { name: "case insensitive id", html: `
content
`, expected: 25, }, { name: "non-matching class and id", html: `
content
`, expected: 0, }, { name: "empty class and id", html: `
content
`, expected: 0, }, { name: "class with special characters", html: `
content
`, expected: -25, // matches com- in negative regex }, { name: "id with special characters", html: `
content
`, expected: 25, // matches h-entry in positive regex }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html)) if err != nil { t.Fatalf("Failed to parse HTML: %v", err) } selection := doc.Find("div").First() if selection.Length() == 0 { t.Fatal("No div element found in HTML") } result := getClassWeight(selection) if result != tc.expected { t.Errorf("Expected weight %f, got %f", tc.expected, result) } }) } } func TestGetClassWeightRegexPatterns(t *testing.T) { // Test specific regex patterns used in getClassWeight positiveWords := []string{"article", "body", "content", "entry", "hentry", "h-entry", "main", "page", "pagination", "post", "text", "blog", "story"} negativeWords := []string{"hid", "banner", "combx", "comment", "com-", "contact", "foot", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "shopping", "tags", "tool", "widget", "byline", "author", "dateline", "writtenby"} for _, word := range positiveWords { t.Run("positive_"+word, func(t *testing.T) { html := `
content
` doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { t.Fatalf("Failed to parse HTML: %v", err) } selection := doc.Find("div").First() result := getClassWeight(selection) if result != 25 { t.Errorf("Expected positive weight 25 for word '%s', got %f", word, result) } }) } for _, word := range negativeWords { t.Run("negative_"+word, func(t *testing.T) { html := `
content
` doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { t.Fatalf("Failed to parse HTML: %v", err) } selection := doc.Find("div").First() result := getClassWeight(selection) if result != -25 { t.Errorf("Expected negative weight -25 for word '%s', got %f", word, result) } }) } } func TestRemoveUnlikelyCandidates(t *testing.T) { testCases := []struct { name string html string expected string }{ { name: "removes elements with popupbody class", html: `
popup content
good content
`, expected: `
good content
`, }, { name: "removes elements with -ad in class", html: `
ad content
good content
`, expected: `
good content
`, }, { name: "removes elements with g-plus in class", html: `
social content
good content
`, expected: `
good content
`, }, { name: "removes elements with unlikely candidates in class", html: `
good content
`, expected: `
good content
`, }, { name: "preserves elements with unlikely candidates but also good candidates in class", html: `
good content
`, expected: `
good content
`, }, { name: "removes elements with unlikely candidates in id", html: `
good content
`, expected: `
good content
`, }, { name: "preserves elements with unlikely candidates but also good candidates in id", html: `
mixed content
good content
`, expected: `
mixed content
good content
`, }, { name: "preserves html and body tags", html: ``, expected: ``, }, { name: "preserves elements within code blocks", html: `
`, expected: `
`, }, { name: "preserves elements within pre tags", html: `
`, expected: `
`, }, { name: "case insensitive matching", html: `
good content
`, expected: `
good content
`, }, { name: "multiple unlikely patterns in single class", html: `
good content
`, expected: `
good content
`, }, { name: "elements without class or id are preserved", html: `
no attributes

paragraph

`, expected: `
no attributes

paragraph

`, }, { name: "removes nested unlikely elements", html: `

good content

`, expected: `

good content

`, }, { name: "comprehensive unlikely candidates test", html: `
combx
comment
community
cover-wrap
disqus
extra
foot
header
legends
remark
replies
rss
shoutbox
skyscraper
social
supplemental
ad-break
agegate
pager
yom-remote
good content
`, expected: `
good content
`, }, { name: "preserves good candidates that contain unlikely words", html: `
should be preserved
should be preserved
should be removed
`, expected: `
should be preserved
should be preserved
`, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html)) if err != nil { t.Fatalf("Failed to parse HTML: %v", err) } removeUnlikelyCandidates(doc) result, err := doc.Html() if err != nil { t.Fatalf("Failed to get HTML: %v", err) } // Normalize whitespace for comparison result = strings.TrimSpace(result) expected := strings.TrimSpace(tc.expected) if result != expected { t.Errorf("\nExpected:\n%s\n\nGot:\n%s", expected, result) } }) } } func TestRemoveUnlikelyCandidatesShouldRemoveFunction(t *testing.T) { // Test the internal shouldRemove function behavior through the public interface testCases := []struct { name string attr string attrType string // "class" or "id" expected bool // true if should be removed }{ // Special hardcoded cases {"popupbody in class", "popupbody", "class", true}, {"contains popupbody in class", "main-popupbody-content", "class", true}, {"ad suffix in class", "super-ad", "class", true}, {"ad in middle of class", "pre-ad-post", "class", true}, {"g-plus in class", "g-plus-share", "class", true}, {"contains g-plus in class", "social-g-plus-button", "class", true}, // Unlikely candidates regexp {"banner class", "banner", "class", true}, {"breadcrumbs class", "breadcrumbs", "class", true}, {"comment class", "comment", "class", true}, {"sidebar class", "sidebar", "class", true}, {"footer class", "footer", "class", true}, // Unlikely candidates with good candidates (should not be removed) {"banner with article", "banner article", "class", false}, {"comment with main", "comment main", "class", false}, {"sidebar with body", "sidebar body", "class", false}, {"footer with column", "footer column", "class", false}, {"menu with shadow", "menu shadow", "class", false}, // Case insensitive {"uppercase banner", "BANNER", "class", true}, {"mixed case comment", "Comment", "class", true}, {"uppercase with good", "BANNER ARTICLE", "class", false}, // ID attributes {"banner id", "banner", "id", true}, {"comment id", "comment", "id", true}, {"banner with article id", "banner article", "id", false}, // Good candidates only {"article class", "article", "class", false}, {"main class", "main", "class", false}, {"content class", "content", "class", false}, {"body class", "body", "class", false}, // No matches {"random class", "random-class", "class", false}, {"normal content", "normal-content", "class", false}, {"empty string", "", "class", false}, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { var html string if tc.attrType == "class" { html = `
content
` } else { html = `
content
` } doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { t.Fatalf("Failed to parse HTML: %v", err) } // Count elements before removal beforeCount := doc.Find("div").Length() removeUnlikelyCandidates(doc) // Count elements after removal afterCount := doc.Find("div").Length() wasRemoved := beforeCount > afterCount if wasRemoved != tc.expected { t.Errorf("Expected element to be removed: %v, but was removed: %v", tc.expected, wasRemoved) } }) } } func TestRemoveUnlikelyCandidatesPreservation(t *testing.T) { testCases := []struct { name string html string description string }{ { name: "preserves html tag", html: `
content
`, description: "HTML tag should never be removed regardless of class", }, { name: "preserves body tag", html: `
content
`, description: "Body tag should never be removed regardless of class", }, { name: "preserves elements in pre tags", html: `
`, description: "Elements within pre tags should be preserved", }, { name: "preserves elements in code tags", html: `code`, description: "Elements within code tags should be preserved", }, { name: "preserves nested elements in code blocks", html: `
`, description: "Deeply nested elements in code blocks should be preserved", }, { name: "preserves elements in mixed code scenarios", html: `
code
`, description: "Multiple code block scenarios should work correctly", }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html)) if err != nil { t.Fatalf("Failed to parse HTML: %v", err) } // Count specific elements before removal beforeHtml := doc.Find("html").Length() beforeBody := doc.Find("body").Length() beforePre := doc.Find("pre").Length() beforeCode := doc.Find("code").Length() removeUnlikelyCandidates(doc) // Count specific elements after removal afterHtml := doc.Find("html").Length() afterBody := doc.Find("body").Length() afterPre := doc.Find("pre").Length() afterCode := doc.Find("code").Length() // These elements should always be preserved if beforeHtml != afterHtml { t.Errorf("HTML elements were removed: before=%d, after=%d", beforeHtml, afterHtml) } if beforeBody != afterBody { t.Errorf("Body elements were removed: before=%d, after=%d", beforeBody, afterBody) } if beforePre != afterPre { t.Errorf("Pre elements were removed: before=%d, after=%d", beforePre, afterPre) } if beforeCode != afterCode { t.Errorf("Code elements were removed: before=%d, after=%d", beforeCode, afterCode) } // Verify that elements within code blocks are preserved if tc.name == "preserves elements in pre tags" || tc.name == "preserves elements in code tags" || tc.name == "preserves nested elements in code blocks" { spanInCode := doc.Find("pre span, code span, pre div, code div").Length() if spanInCode == 0 { t.Error("Elements within code blocks were incorrectly removed") } } }) } }