// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. // SPDX-License-Identifier: Apache-2.0 package readability // import "miniflux.app/v2/internal/reader/readability" import ( "bytes" "os" "strings" "testing" "github.com/PuerkitoBio/goquery" ) func TestBaseURL(t *testing.T) { html := `
Some content
` baseURL, _, err := ExtractContent(strings.NewReader(html)) if err != nil { t.Fatal(err) } if baseURL != "https://example.org/" { t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL) } } func TestMultipleBaseURL(t *testing.T) { html := `
Some content
` baseURL, _, err := ExtractContent(strings.NewReader(html)) if err != nil { t.Fatal(err) } if baseURL != "https://example.org/" { t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL) } } func TestRelativeBaseURL(t *testing.T) { html := `
Some content
` baseURL, _, err := ExtractContent(strings.NewReader(html)) if err != nil { t.Fatal(err) } if baseURL != "" { t.Errorf(`Unexpected base URL, got %q`, baseURL) } } func TestWithoutBaseURL(t *testing.T) { html := ` Test
Some content
` baseURL, _, err := ExtractContent(strings.NewReader(html)) if err != nil { t.Fatal(err) } if baseURL != "" { t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL) } } func TestRemoveStyleScript(t *testing.T) { html := ` Test
Some content
` want := `
Somecontent
` _, content, err := ExtractContent(strings.NewReader(html)) if err != nil { t.Fatal(err) } content = strings.ReplaceAll(content, "\n", "") content = strings.ReplaceAll(content, " ", "") content = strings.ReplaceAll(content, "\t", "") if content != want { t.Errorf(`Invalid content, got %s instead of %s`, content, want) } } func TestRemoveBlacklist(t *testing.T) { html := ` Test
Some content
Some other thing
And more
Valid!
` want := `
Valid!
` _, content, err := ExtractContent(strings.NewReader(html)) if err != nil { t.Fatal(err) } content = strings.ReplaceAll(content, "\n", "") content = strings.ReplaceAll(content, " ", "") content = strings.ReplaceAll(content, "\t", "") if content != want { t.Errorf(`Invalid content, got %s instead of %s`, content, want) } } func TestNestedSpanInCodeBlock(t *testing.T) { html := ` Test

Some content

Code block with nested span # exit 1
` want := `

Some content

Code block with nested span # exit 1
` _, result, err := ExtractContent(strings.NewReader(html)) if err != nil { t.Fatal(err) } if result != want { t.Errorf(`Invalid content, got %s instead of %s`, result, want) } } func BenchmarkExtractContent(b *testing.B) { var testCases = map[string][]byte{ "miniflux_github.html": {}, "miniflux_wikipedia.html": {}, } for filename := range testCases { data, err := os.ReadFile("testdata/" + filename) if err != nil { b.Fatalf(`Unable to read file %q: %v`, filename, err) } testCases[filename] = data } for range b.N { for _, v := range testCases { ExtractContent(bytes.NewReader(v)) } } } func TestGetClassWeight(t *testing.T) { testCases := []struct { name string html string expected float32 }{ { name: "no class or id", html: `
content
`, expected: 0, }, { name: "positive class only", html: `
content
`, expected: 25, }, { name: "negative class only", html: `
content
`, expected: -25, }, { name: "positive id only", html: `
content
`, expected: 25, }, { name: "negative id only", html: ``, expected: -25, }, { name: "positive class and positive id", html: `
content
`, expected: 50, }, { name: "negative class and negative id", html: ``, expected: -50, }, { name: "positive class and negative id", html: `
content
`, expected: 0, }, { name: "negative class and positive id", html: ``, expected: 0, }, { name: "multiple positive classes", html: `
content
`, expected: 25, }, { name: "multiple negative classes", html: ``, expected: -25, }, { name: "mixed positive and negative classes", html: `
content
`, expected: -25, // negative takes precedence since it's checked first }, { name: "case insensitive class", html: `
content
`, expected: 25, }, { name: "case insensitive id", html: `
content
`, expected: 25, }, { name: "non-matching class and id", html: `
content
`, expected: 0, }, { name: "empty class and id", html: `
content
`, expected: 0, }, { name: "class with special characters", html: `
content
`, expected: -25, // matches com- in negative regex }, { name: "id with special characters", html: `
content
`, expected: 25, // matches h-entry in positive regex }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html)) if err != nil { t.Fatalf("Failed to parse HTML: %v", err) } selection := doc.Find("div").First() if selection.Length() == 0 { t.Fatal("No div element found in HTML") } result := getClassWeight(selection) if result != tc.expected { t.Errorf("Expected weight %f, got %f", tc.expected, result) } }) } } func TestGetClassWeightRegexPatterns(t *testing.T) { // Test specific regex patterns used in getClassWeight positiveWords := []string{"article", "body", "content", "entry", "hentry", "h-entry", "main", "page", "pagination", "post", "text", "blog", "story"} negativeWords := []string{"hid", "banner", "combx", "comment", "com-", "contact", "foot", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "shopping", "tags", "tool", "widget", "byline", "author", "dateline", "writtenby"} for _, word := range positiveWords { t.Run("positive_"+word, func(t *testing.T) { html := `
content
` doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { t.Fatalf("Failed to parse HTML: %v", err) } selection := doc.Find("div").First() result := getClassWeight(selection) if result != 25 { t.Errorf("Expected positive weight 25 for word '%s', got %f", word, result) } }) } for _, word := range negativeWords { t.Run("negative_"+word, func(t *testing.T) { html := `
content
` doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { t.Fatalf("Failed to parse HTML: %v", err) } selection := doc.Find("div").First() result := getClassWeight(selection) if result != -25 { t.Errorf("Expected negative weight -25 for word '%s', got %f", word, result) } }) } }