// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package readability // import "miniflux.app/v2/internal/reader/readability"
import (
"bytes"
"os"
"strings"
"testing"
"github.com/PuerkitoBio/goquery"
)
// TestBaseURL expects ExtractContent to return, as its first result, the
// absolute URL declared by the document's <base href> element.
func TestBaseURL(t *testing.T) {
	// The fixture has to declare a <base> element: a document without one
	// carries no base URL, so the assertion below could never hold.
	html := `<html><head><base href="https://example.org/"></head><body><article>Some content</article></body></html>`

	baseURL, _, err := ExtractContent(strings.NewReader(html))
	if err != nil {
		t.Fatal(err)
	}

	if baseURL != "https://example.org/" {
		t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
	}
}
// TestMultipleBaseURL expects ExtractContent to return the first declared
// base URL when the document contains more than one <base href> element.
func TestMultipleBaseURL(t *testing.T) {
	// Two <base> elements with different targets: only the first one
	// ("https://example.org/") is the expected result.
	html := `<html><head><base href="https://example.org/"><base href="https://example.com/"></head><body><article>Some content</article></body></html>`

	baseURL, _, err := ExtractContent(strings.NewReader(html))
	if err != nil {
		t.Fatal(err)
	}

	if baseURL != "https://example.org/" {
		t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
	}
}
// TestRelativeBaseURL expects ExtractContent to return an empty base URL when
// the document's <base href> is a relative path rather than an absolute URL.
func TestRelativeBaseURL(t *testing.T) {
	// The href is deliberately relative; a fixture without any <base> element
	// would only repeat what TestWithoutBaseURL already covers.
	html := `<html><head><base href="/test/"></head><body><article>Some content</article></body></html>`

	baseURL, _, err := ExtractContent(strings.NewReader(html))
	if err != nil {
		t.Fatal(err)
	}

	if baseURL != "" {
		t.Errorf(`Unexpected base URL, got %q`, baseURL)
	}
}
// TestWithoutBaseURL passes ExtractContent a document that declares no base
// element and requires the returned base URL to be the empty string.
func TestWithoutBaseURL(t *testing.T) {
	const document = `
Test
Some content
`

	got, _, extractErr := ExtractContent(strings.NewReader(document))
	switch {
	case extractErr != nil:
		t.Fatal(extractErr)
	case got != "":
		t.Errorf(`Unexpected base URL, got %q instead of ""`, got)
	}
}
// TestRemoveStyleScript runs ExtractContent on the fixture, drops every
// newline, space and tab from the extracted content, and compares what is
// left with the expected string.
//
// NOTE(review): the fixture contains no style or script markup and the
// expected value is empty — confirm these literals were not stripped of
// their HTML.
func TestRemoveStyleScript(t *testing.T) {
	const document = `
Test
Some content
`
	const expected = ``

	_, extracted, extractErr := ExtractContent(strings.NewReader(document))
	if extractErr != nil {
		t.Fatal(extractErr)
	}

	// Whitespace is irrelevant to the comparison, so remove it entirely.
	for _, blank := range []string{"\n", " ", "\t"} {
		extracted = strings.ReplaceAll(extracted, blank, "")
	}

	if extracted == expected {
		return
	}
	t.Errorf(`Invalid content, got %s instead of %s`, extracted, expected)
}
// TestRemoveBlacklist runs ExtractContent on the fixture and compares the
// extracted content, with all newlines, spaces and tabs removed, against the
// expected string.
//
// NOTE(review): the fixture carries no element markup and the expected value
// is empty — confirm these literals were not stripped of their HTML.
func TestRemoveBlacklist(t *testing.T) {
	const document = `
Test
Some content
Some other thing
And more
Valid!
`
	const expected = ``

	_, extracted, extractErr := ExtractContent(strings.NewReader(document))
	if extractErr != nil {
		t.Fatal(extractErr)
	}

	// One pass that deletes the three whitespace characters the test ignores.
	compact := strings.NewReplacer("\n", "", " ", "", "\t", "").Replace(extracted)

	if compact != expected {
		t.Errorf(`Invalid content, got %s instead of %s`, compact, expected)
	}
}
// TestNestedSpanInCodeBlock runs ExtractContent on the fixture and requires
// the extracted content to match the expected string exactly, whitespace
// included.
//
// NOTE(review): neither literal contains code-block or span markup — confirm
// they were not stripped of their HTML.
func TestNestedSpanInCodeBlock(t *testing.T) {
	const document = `
Test
Some content
Code block with nested span
`
	const expected = `Some content
Code block with nested span
`

	_, extracted, extractErr := ExtractContent(strings.NewReader(document))
	switch {
	case extractErr != nil:
		t.Fatal(extractErr)
	case extracted != expected:
		t.Errorf(`Invalid content, got %s instead of %s`, extracted, expected)
	}
}
func BenchmarkExtractContent(b *testing.B) {
var testCases = map[string][]byte{
"miniflux_github.html": {},
"miniflux_wikipedia.html": {},
}
for filename := range testCases {
data, err := os.ReadFile("testdata/" + filename)
if err != nil {
b.Fatalf(`Unable to read file %q: %v`, filename, err)
}
testCases[filename] = data
}
for range b.N {
for _, v := range testCases {
ExtractContent(bytes.NewReader(v))
}
}
}
// TestGetClassWeight checks the weight getClassWeight assigns to a div based
// on its class and id attributes.
//
// Every fixture is a single div whose class and/or id uses words taken from
// the positive and negative lists in TestGetClassWeightRegexPatterns. The
// expectations encode 25 per positive attribute and -25 per negative one.
func TestGetClassWeight(t *testing.T) {
	testCases := []struct {
		name     string
		html     string
		expected float32
	}{
		{
			name:     "no class or id",
			html:     `<div>content</div>`,
			expected: 0,
		},
		{
			name:     "positive class only",
			html:     `<div class="article">content</div>`,
			expected: 25,
		},
		{
			name:     "negative class only",
			html:     `<div class="comment">content</div>`,
			expected: -25,
		},
		{
			name:     "positive id only",
			html:     `<div id="main">content</div>`,
			expected: 25,
		},
		{
			name:     "negative id only",
			html:     `<div id="sidebar">content</div>`,
			expected: -25,
		},
		{
			name:     "positive class and positive id",
			html:     `<div class="article" id="main">content</div>`,
			expected: 50,
		},
		{
			name:     "negative class and negative id",
			html:     `<div class="comment" id="sidebar">content</div>`,
			expected: -50,
		},
		{
			name:     "positive class and negative id",
			html:     `<div class="article" id="comment">content</div>`,
			expected: 0,
		},
		{
			name:     "negative class and positive id",
			html:     `<div class="banner" id="content">content</div>`,
			expected: 0,
		},
		{
			name:     "multiple positive classes",
			html:     `<div class="article content">content</div>`,
			expected: 25,
		},
		{
			name:     "multiple negative classes",
			html:     `<div class="comment sidebar">content</div>`,
			expected: -25,
		},
		{
			name:     "mixed positive and negative classes",
			html:     `<div class="article comment">content</div>`,
			expected: -25, // negative takes precedence since it's checked first
		},
		{
			name:     "case insensitive class",
			html:     `<div class="ARTICLE">content</div>`,
			expected: 25,
		},
		{
			name:     "case insensitive id",
			html:     `<div id="MAIN">content</div>`,
			expected: 25,
		},
		{
			name:     "non-matching class and id",
			html:     `<div class="random" id="unique">content</div>`,
			expected: 0,
		},
		{
			name:     "empty class and id",
			html:     `<div class="" id="">content</div>`,
			expected: 0,
		},
		{
			name:     "class with special characters",
			html:     `<div class="com-section">content</div>`,
			expected: -25, // matches com- in negative regex
		},
		{
			name:     "id with special characters",
			html:     `<div id="h-entry-123">content</div>`,
			expected: 25, // matches h-entry in positive regex
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
			if err != nil {
				t.Fatalf("Failed to parse HTML: %v", err)
			}

			// The weight is computed for the first div of the fixture.
			selection := doc.Find("div").First()
			if selection.Length() == 0 {
				t.Fatal("No div element found in HTML")
			}

			result := getClassWeight(selection)
			if result != tc.expected {
				t.Errorf("Expected weight %f, got %f", tc.expected, result)
			}
		})
	}
}
// TestGetClassWeightRegexPatterns checks, one word at a time, that a div whose
// class is a listed positive word weighs 25 and that a div whose class is a
// listed negative word weighs -25.
func TestGetClassWeightRegexPatterns(t *testing.T) {
	// Test specific regex patterns used in getClassWeight
	positiveWords := []string{"article", "body", "content", "entry", "hentry", "h-entry", "main", "page", "pagination", "post", "text", "blog", "story"}
	negativeWords := []string{"hid", "banner", "combx", "comment", "com-", "contact", "foot", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "shopping", "tags", "tool", "widget", "byline", "author", "dateline", "writtenby"}

	for _, word := range positiveWords {
		t.Run("positive_"+word, func(t *testing.T) {
			// The word under test must be the div's class; a fixed fixture
			// would make every subtest evaluate the same input.
			html := `<div class="` + word + `">content</div>`
			doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
			if err != nil {
				t.Fatalf("Failed to parse HTML: %v", err)
			}

			selection := doc.Find("div").First()
			if selection.Length() == 0 {
				t.Fatal("No div element found in HTML")
			}

			result := getClassWeight(selection)
			if result != 25 {
				t.Errorf("Expected positive weight 25 for word '%s', got %f", word, result)
			}
		})
	}

	for _, word := range negativeWords {
		t.Run("negative_"+word, func(t *testing.T) {
			html := `<div class="` + word + `">content</div>`
			doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
			if err != nil {
				t.Fatalf("Failed to parse HTML: %v", err)
			}

			selection := doc.Find("div").First()
			if selection.Length() == 0 {
				t.Fatal("No div element found in HTML")
			}

			result := getClassWeight(selection)
			if result != -25 {
				t.Errorf("Expected negative weight -25 for word '%s', got %f", word, result)
			}
		})
	}
}
// TestRemoveUnlikelyCandidates parses each fixture with goquery, applies
// removeUnlikelyCandidates to the document, serializes it with doc.Html() and
// compares the result with the expected string after trimming surrounding
// whitespace on both sides.
//
// NOTE(review): the fixtures and expected values below contain no element
// markup or class/id attributes even though the case names refer to them —
// confirm the literals were not stripped of their HTML.
func TestRemoveUnlikelyCandidates(t *testing.T) {
	scenarios := []struct {
		name     string
		html     string
		expected string
	}{
		{
			name: "removes elements with popupbody class",
			html: `popup content
good content
`,
			expected: `good content
`,
		},
		{
			name: "removes elements with -ad in class",
			html: `ad content
good content
`,
			expected: `good content
`,
		},
		{
			name: "removes elements with g-plus in class",
			html: `social content
good content
`,
			expected: `good content
`,
		},
		{
			name: "removes elements with unlikely candidates in class",
			html: `banner
good content
`,
			expected: `good content
`,
		},
		{
			name: "preserves elements with unlikely candidates but also good candidates in class",
			html: `mixed content
good content
`,
			expected: `mixed content
good content
`,
		},
		{
			name: "removes elements with unlikely candidates in id",
			html: `banner
good content
`,
			expected: `good content
`,
		},
		{
			name: "preserves elements with unlikely candidates but also good candidates in id",
			html: `mixed content
good content
`,
			expected: `mixed content
good content
`,
		},
		{
			name: "preserves html and body tags",
			html: `content
`,
			expected: ``,
		},
		{
			name: "preserves elements within code blocks",
			html: `code content
remove this
`,
			expected: `code content
`,
		},
		{
			name:     "preserves elements within pre tags",
			html:     ``,
			expected: ``,
		},
		{
			name: "case insensitive matching",
			html: `uppercase banner
mixed case banner
good content
`,
			expected: `good content
`,
		},
		{
			name: "multiple unlikely patterns in single class",
			html: `good content
`,
			expected: `good content
`,
		},
		{
			name: "elements without class or id are preserved",
			html: `no attributes
paragraph
`,
			expected: `no attributes
paragraph
`,
		},
		{
			name: "removes nested unlikely elements",
			html: `nested banner
good content
`,
			expected: ``,
		},
		{
			name: "comprehensive unlikely candidates test",
			html: `breadcrumbs
cover-wrap
legends
modal
related
replies
skyscraper
social
supplemental
yom-remote
good content
`,
			expected: `good content
`,
		},
		{
			name: "preserves good candidates that contain unlikely words",
			html: `should be preserved
should be preserved
should be preserved
should be preserved
should be preserved
should be preserved
should be removed
`,
			expected: `should be preserved
should be preserved
should be preserved
should be preserved
should be preserved
should be preserved
`,
		},
	}

	for _, scenario := range scenarios {
		t.Run(scenario.name, func(t *testing.T) {
			doc, parseErr := goquery.NewDocumentFromReader(strings.NewReader(scenario.html))
			if parseErr != nil {
				t.Fatalf("Failed to parse HTML: %v", parseErr)
			}

			removeUnlikelyCandidates(doc)

			serialized, htmlErr := doc.Html()
			if htmlErr != nil {
				t.Fatalf("Failed to get HTML: %v", htmlErr)
			}

			// Leading and trailing whitespace is not part of the comparison.
			got, want := strings.TrimSpace(serialized), strings.TrimSpace(scenario.expected)
			if got == want {
				return
			}
			t.Errorf("\nExpected:\n%s\n\nGot:\n%s", want, got)
		})
	}
}
// TestRemoveUnlikelyCandidatesShouldRemoveFunction exercises the removal
// decision of removeUnlikelyCandidates one attribute value at a time: each
// case builds a document with a single div carrying tc.attr as its class or
// id, runs removeUnlikelyCandidates, and checks whether the div is gone.
func TestRemoveUnlikelyCandidatesShouldRemoveFunction(t *testing.T) {
	testCases := []struct {
		name     string
		attr     string
		attrType string // "class" or "id"
		expected bool   // true if should be removed
	}{
		// Special hardcoded cases
		{"popupbody in class", "popupbody", "class", true},
		{"contains popupbody in class", "main-popupbody-content", "class", true},
		{"ad suffix in class", "super-ad", "class", true},
		{"ad in middle of class", "pre-ad-post", "class", true},
		{"g-plus in class", "g-plus-share", "class", true},
		{"contains g-plus in class", "social-g-plus-button", "class", true},
		// Unlikely candidates regexp
		{"banner class", "banner", "class", true},
		{"breadcrumbs class", "breadcrumbs", "class", true},
		{"comment class", "comment", "class", true},
		{"sidebar class", "sidebar", "class", true},
		{"footer class", "footer", "class", true},
		// Unlikely candidates with good candidates (should not be removed)
		{"banner with article", "banner article", "class", false},
		{"comment with main", "comment main", "class", false},
		{"sidebar with body", "sidebar body", "class", false},
		{"footer with column", "footer column", "class", false},
		{"menu with shadow", "menu shadow", "class", false},
		// Case insensitive
		{"uppercase banner", "BANNER", "class", true},
		{"mixed case comment", "Comment", "class", true},
		{"uppercase with good", "BANNER ARTICLE", "class", false},
		// ID attributes
		{"banner id", "banner", "id", true},
		{"comment id", "comment", "id", true},
		{"banner with article id", "banner article", "id", false},
		// Good candidates only
		{"article class", "article", "class", false},
		{"main class", "main", "class", false},
		{"content class", "content", "class", false},
		{"body class", "body", "class", false},
		// No matches
		{"random class", "random-class", "class", false},
		{"normal content", "normal-content", "class", false},
		{"empty string", "", "class", false},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// The attribute under test has to be placed on the div; otherwise
			// every case would evaluate the same document.
			var html string
			switch tc.attrType {
			case "class":
				html = `<html><body><div class="` + tc.attr + `">content</div></body></html>`
			case "id":
				html = `<html><body><div id="` + tc.attr + `">content</div></body></html>`
			default:
				t.Fatalf("Unsupported attribute type %q", tc.attrType)
			}

			doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
			if err != nil {
				t.Fatalf("Failed to parse HTML: %v", err)
			}

			// Count elements before removal
			beforeCount := doc.Find("div").Length()
			if beforeCount == 0 {
				t.Fatal("No div element found in HTML")
			}

			removeUnlikelyCandidates(doc)

			// Count elements after removal
			afterCount := doc.Find("div").Length()

			wasRemoved := beforeCount > afterCount
			if wasRemoved != tc.expected {
				t.Errorf("Expected element to be removed: %v, but was removed: %v", tc.expected, wasRemoved)
			}
		})
	}
}
// TestRemoveUnlikelyCandidatesPreservation checks that removeUnlikelyCandidates
// leaves the html, body, pre and code elements in place, and that elements
// nested inside pre/code survive even when their class would otherwise mark
// them as unlikely candidates.
func TestRemoveUnlikelyCandidatesPreservation(t *testing.T) {
	testCases := []struct {
		name        string
		html        string
		description string
		// nested is true when the fixture puts flagged span/div elements
		// inside a pre or code element that must still be present afterwards.
		nested bool
	}{
		{
			name:        "preserves html tag",
			html:        `<html class="banner"><body><div>content</div></body></html>`,
			description: "HTML tag should never be removed regardless of class",
		},
		{
			name:        "preserves body tag",
			html:        `<html><body class="banner"><div>content</div></body></html>`,
			description: "Body tag should never be removed regardless of class",
		},
		{
			name:        "preserves elements in pre tags",
			html:        `<html><body><pre><span class="banner">code</span></pre></body></html>`,
			description: "Elements within pre tags should be preserved",
			nested:      true,
		},
		{
			name:        "preserves elements in code tags",
			html:        `<html><body><code><span class="sidebar">code</span></code></body></html>`,
			description: "Elements within code tags should be preserved",
			nested:      true,
		},
		{
			name:        "preserves nested elements in code blocks",
			html:        `<html><body><pre><code><div class="comment"><span class="banner">nested</span></div></code></pre></body></html>`,
			description: "Deeply nested elements in code blocks should be preserved",
			nested:      true,
		},
		{
			name:        "preserves elements in mixed code scenarios",
			html:        `<html><body><div class="main"><pre><span class="sidebar">code1</span></pre><code><div class="banner">code2</div></code></div></body></html>`,
			description: "Multiple code block scenarios should work correctly",
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Shown with -v or on failure to explain what the case protects.
			t.Log(tc.description)

			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
			if err != nil {
				t.Fatalf("Failed to parse HTML: %v", err)
			}

			// Count specific elements before removal
			beforeHtml := doc.Find("html").Length()
			beforeBody := doc.Find("body").Length()
			beforePre := doc.Find("pre").Length()
			beforeCode := doc.Find("code").Length()

			removeUnlikelyCandidates(doc)

			// Count specific elements after removal
			afterHtml := doc.Find("html").Length()
			afterBody := doc.Find("body").Length()
			afterPre := doc.Find("pre").Length()
			afterCode := doc.Find("code").Length()

			// These elements should always be preserved
			if beforeHtml != afterHtml {
				t.Errorf("HTML elements were removed: before=%d, after=%d", beforeHtml, afterHtml)
			}
			if beforeBody != afterBody {
				t.Errorf("Body elements were removed: before=%d, after=%d", beforeBody, afterBody)
			}
			if beforePre != afterPre {
				t.Errorf("Pre elements were removed: before=%d, after=%d", beforePre, afterPre)
			}
			if beforeCode != afterCode {
				t.Errorf("Code elements were removed: before=%d, after=%d", beforeCode, afterCode)
			}

			// Verify that elements within code blocks are preserved
			if tc.nested {
				spanInCode := doc.Find("pre span, code span, pre div, code div").Length()
				if spanInCode == 0 {
					t.Error("Elements within code blocks were incorrectly removed")
				}
			}
		})
	}
}