diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go
index 50c7bb30..c423f1a7 100644
--- a/internal/reader/readability/readability.go
+++ b/internal/reader/readability/readability.go
@@ -27,8 +27,8 @@ var (
maybeCandidate = [...]string{"and", "article", "body", "column", "main", "shadow"}
unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"}
- positive = [...]string{"article", "blog", "body", "content", "entry", "h-entry", "hentry", "main", "page", "pagination", "post", "story", "text"}
- negative = [...]string{"author", "banner", "byline", "com-", "combx", "comment", "contact", "dateline", "foot", "hid", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shopping", "shoutbox", "sidebar", "skyscraper", "sponsor", "tags", "tool", "widget", "writtenby"}
+ positiveKeywords = [...]string{"article", "blog", "body", "content", "entry", "h-entry", "hentry", "main", "page", "pagination", "post", "story", "text"}
+ negativeKeywords = [...]string{"author", "banner", "byline", "com-", "combx", "comment", "contact", "dateline", "foot", "hid", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shopping", "shoutbox", "sidebar", "skyscraper", "sponsor", "tags", "tool", "widget", "writtenby"}
)
type candidate struct {
@@ -37,23 +37,31 @@ type candidate struct {
}
+// Node returns the first HTML node of the candidate's selection, or nil when
+// the selection is empty.
func (c *candidate) Node() *html.Node {
+ if c.selection.Length() == 0 {
+ return nil
+ }
return c.selection.Get(0)
}
+// String formats the candidate as "tag#id.class => score", leaving out the
+// "#id" and/or ".class" part when that attribute value is empty. A candidate
+// without a node is formatted as "empty => score".
func (c *candidate) String() string {
+ node := c.Node()
+ if node == nil {
+ return fmt.Sprintf("empty => %f", c.score)
+ }
+
id, _ := c.selection.Attr("id")
class, _ := c.selection.Attr("class")
switch {
case id != "" && class != "":
- return fmt.Sprintf("%s#%s.%s => %f", c.Node().DataAtom, id, class, c.score)
+ return fmt.Sprintf("%s#%s.%s => %f", node.DataAtom, id, class, c.score)
case id != "":
- return fmt.Sprintf("%s#%s => %f", c.Node().DataAtom, id, c.score)
+ return fmt.Sprintf("%s#%s => %f", node.DataAtom, id, c.score)
case class != "":
- return fmt.Sprintf("%s.%s => %f", c.Node().DataAtom, class, c.score)
+ return fmt.Sprintf("%s.%s => %f", node.DataAtom, class, c.score)
}
- return fmt.Sprintf("%s => %f", c.Node().DataAtom, c.score)
+ return fmt.Sprintf("%s => %f", node.DataAtom, c.score)
}
type candidateList map[*html.Node]*candidate
@@ -111,7 +119,8 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
tag := "div"
node := s.Get(0)
- if node == topCandidate.Node() {
+ topNode := topCandidate.Node()
+ if topNode != nil && node == topNode {
append = true
} else if c, ok := candidates[node]; ok && c.score >= siblingScoreThreshold {
append = true
@@ -147,14 +156,14 @@ func shouldRemoveCandidate(str string) bool {
str = strings.ToLower(str)
// Those candidates have no false-positives, no need to check against `maybeCandidate`
- for _, strong := range strongCandidates {
- if strings.Contains(str, strong) {
+ for _, strongCandidate := range strongCandidates {
+ if strings.Contains(str, strongCandidate) {
return true
}
}
- for _, unlikely := range unlikelyCandidate {
- if strings.Contains(str, unlikely) {
+ for _, unlikelyKeyword := range unlikelyCandidate {
+ if strings.Contains(str, unlikelyKeyword) {
// Do we have a false positive?
for _, maybe := range maybeCandidate {
if strings.Contains(str, maybe) {
@@ -268,6 +277,11 @@ func getCandidates(document *goquery.Document) candidateList {
func scoreNode(s *goquery.Selection) *candidate {
c := &candidate{selection: s, score: 0}
+ // Check if selection is empty to avoid panic
+ if s.Length() == 0 {
+ return c
+ }
+
switch s.Get(0).DataAtom.String() {
case "div":
c.score += 5
@@ -314,13 +328,13 @@ func getClassWeight(s *goquery.Selection) float32 {
func getWeight(s string) int {
s = strings.ToLower(s)
- for _, pos := range negative {
- if strings.Contains(s, pos) {
+ for _, keyword := range negativeKeywords {
+ if strings.Contains(s, keyword) {
return -25
}
}
- for _, pos := range positive {
- if strings.Contains(s, pos) {
+ for _, keyword := range positiveKeywords {
+ if strings.Contains(s, keyword) {
return +25
}
}
diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go
index cf3188e6..96813d42 100644
--- a/internal/reader/readability/readability_test.go
+++ b/internal/reader/readability/readability_test.go
@@ -11,8 +11,60 @@ import (
"testing"
"github.com/PuerkitoBio/goquery"
+ "golang.org/x/net/html"
)
+// BenchmarkExtractContent measures ExtractContent against the HTML fixtures
+// stored under testdata/.
+func BenchmarkExtractContent(b *testing.B) {
+	var testCases = map[string][]byte{
+		"miniflux_github.html":    {},
+		"miniflux_wikipedia.html": {},
+	}
+	for filename := range testCases {
+		data, err := os.ReadFile("testdata/" + filename)
+		if err != nil {
+			b.Fatalf(`Unable to read file %q: %v`, filename, err)
+		}
+		testCases[filename] = data
+	}
+
+	// Loading the fixtures is setup; keep it out of the measured region.
+	b.ResetTimer()
+	for range b.N {
+		for _, v := range testCases {
+			ExtractContent(bytes.NewReader(v))
+		}
+	}
+}
+
+// BenchmarkGetWeight calls getWeight on a fixed set of sample attribute
+// strings during every benchmark iteration.
+func BenchmarkGetWeight(b *testing.B) {
+	samples := [...]string{
+		"p-3 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content",
+		"d-flex flex-column mb-3",
+		"AppHeader-search-control AppHeader-search-control-overflow",
+		"Button Button--iconOnly Button--invisible Button--medium mr-1 px-2 py-0 d-flex flex-items-center rounded-1 color-fg-muted",
+		"sr-only",
+		"validation-12753bbc-b4d1-4e10-bec6-92e585d1699d",
+	}
+	for range b.N {
+		for i := range samples {
+			getWeight(samples[i])
+		}
+	}
+}
+
+// BenchmarkTransformMisusedDivsIntoParagraphs measures parsing a small
+// document and running transformMisusedDivsIntoParagraphs on it. The document
+// is parsed again on every iteration so each run starts from an untransformed
+// tree.
+func BenchmarkTransformMisusedDivsIntoParagraphs(b *testing.B) {
+	// Named "input" rather than "html" so the golang.org/x/net/html import is
+	// not shadowed inside this function.
+	input := `
+ Simple text content
+ More inline content
+
+
+ Another simple text
+ `
+
+	b.ResetTimer()
+	for range b.N {
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(input))
+		if err != nil {
+			b.Fatalf("Failed to parse HTML: %v", err)
+		}
+		transformMisusedDivsIntoParagraphs(doc)
+	}
+}
+
func TestBaseURL(t *testing.T) {
html := `
@@ -189,25 +241,6 @@ func TestNestedSpanInCodeBlock(t *testing.T) {
}
}
-func BenchmarkExtractContent(b *testing.B) {
- var testCases = map[string][]byte{
- "miniflux_github.html": {},
- "miniflux_wikipedia.html": {},
- }
- for filename := range testCases {
- data, err := os.ReadFile("testdata/" + filename)
- if err != nil {
- b.Fatalf(`Unable to read file %q: %v`, filename, err)
- }
- testCases[filename] = data
- }
- for range b.N {
- for _, v := range testCases {
- ExtractContent(bytes.NewReader(v))
- }
- }
-}
-
func TestGetClassWeight(t *testing.T) {
testCases := []struct {
name string
@@ -1315,18 +1348,1134 @@ func TestContainsSentence(t *testing.T) {
}
}
-func BenchmarkGetWeight(b *testing.B) {
- testCases := []string{
- "p-3 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content",
- "d-flex flex-column mb-3",
- "AppHeader-search-control AppHeader-search-control-overflow",
- "Button Button--iconOnly Button--invisible Button--medium mr-1 px-2 py-0 d-flex flex-items-center rounded-1 color-fg-muted",
- "sr-only",
- "validation-12753bbc-b4d1-4e10-bec6-92e585d1699d",
+func TestScoreNode(t *testing.T) {
+ testCases := []struct {
+ name string
+ html string
+ expectedScore float32
+ expectedTag string
+ }{
+ {
+ name: "div element with no class or id",
+ html: `Some content
`,
+ expectedScore: 5,
+ expectedTag: "div",
+ },
+ {
+ name: "pre element with no class or id",
+ html: `Some code
`,
+ expectedScore: 3,
+ expectedTag: "pre",
+ },
+ {
+ name: "td element with no class or id",
+ html: ``,
+ expectedScore: 3,
+ expectedTag: "td",
+ },
+ {
+ name: "blockquote element with no class or id",
+ html: `Quote
`,
+ expectedScore: 3,
+ expectedTag: "blockquote",
+ },
+ {
+ name: "img element with no class or id",
+ html: `
`,
+ expectedScore: 3,
+ expectedTag: "img",
+ },
+ {
+ name: "ol element with no class or id",
+ html: `- Item
`,
+ expectedScore: -3,
+ expectedTag: "ol",
+ },
+ {
+ name: "ul element with no class or id",
+ html: ``,
+ expectedScore: -3,
+ expectedTag: "ul",
+ },
+ {
+ name: "address element with no class or id",
+ html: `Contact info`,
+ expectedScore: -3,
+ expectedTag: "address",
+ },
+ {
+ name: "dl element with no class or id",
+ html: `- Term
- Definition
`,
+ expectedScore: -3,
+ expectedTag: "dl",
+ },
+ {
+ name: "dd element with no class or id",
+ html: `Definition`,
+ expectedScore: -3,
+ expectedTag: "dd",
+ },
+ {
+ name: "dt element with no class or id",
+ html: `Term`,
+ expectedScore: -3,
+ expectedTag: "dt",
+ },
+ {
+ name: "li element with no class or id",
+ html: `List item`,
+ expectedScore: -3,
+ expectedTag: "li",
+ },
+ {
+ name: "form element with no class or id",
+ html: ``,
+ expectedScore: -3,
+ expectedTag: "form",
+ },
+ {
+ name: "h1 element with no class or id",
+ html: `Heading
`,
+ expectedScore: -5,
+ expectedTag: "h1",
+ },
+ {
+ name: "h2 element with no class or id",
+ html: `Heading
`,
+ expectedScore: -5,
+ expectedTag: "h2",
+ },
+ {
+ name: "h3 element with no class or id",
+ html: `Heading
`,
+ expectedScore: -5,
+ expectedTag: "h3",
+ },
+ {
+ name: "h4 element with no class or id",
+ html: `Heading
`,
+ expectedScore: -5,
+ expectedTag: "h4",
+ },
+ {
+ name: "h5 element with no class or id",
+ html: `Heading
`,
+ expectedScore: -5,
+ expectedTag: "h5",
+ },
+ {
+ name: "h6 element with no class or id",
+ html: `Heading
`,
+ expectedScore: -5,
+ expectedTag: "h6",
+ },
+ {
+ name: "th element with no class or id",
+ html: ``,
+ expectedScore: -5,
+ expectedTag: "th",
+ },
+ {
+ name: "p element with no class or id (default case)",
+ html: `Paragraph content
`,
+ expectedScore: 0,
+ expectedTag: "p",
+ },
+ {
+ name: "span element with no class or id (default case)",
+ html: `Span content`,
+ expectedScore: 0,
+ expectedTag: "span",
+ },
}
- for range b.N {
- for _, v := range testCases {
- getWeight(v)
- }
+
+ for _, tc := range testCases {
+ t.Run(tc.name, func(t *testing.T) {
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ selection := doc.Find(tc.expectedTag)
+ if selection.Length() == 0 {
+ t.Fatalf("Could not find element with tag %s", tc.expectedTag)
+ }
+
+ candidate := scoreNode(selection)
+
+ if candidate.score != tc.expectedScore {
+ t.Errorf("Expected score %f, got %f", tc.expectedScore, candidate.score)
+ }
+
+ if candidate.selection != selection {
+ t.Error("Expected selection to be preserved in candidate")
+ }
+
+ if candidate.Node() == nil {
+ t.Errorf("Expected valid node, got nil")
+ } else if candidate.Node().Data != tc.expectedTag {
+ t.Errorf("Expected node tag %s, got %s", tc.expectedTag, candidate.Node().Data)
+ }
+ })
}
}
+
+// TestScoreNodeWithClassWeights checks that scoreNode adds the class and id
+// keyword weights on top of the tag's base score. Each expectedScore comment
+// spells out the individual terms of the sum.
+func TestScoreNodeWithClassWeights(t *testing.T) {
+	testCases := []struct {
+		name          string
+		html          string
+		expectedScore float32
+		description   string
+	}{
+		{
+			name: "div with positive class",
+			html: `Content
`,
+			expectedScore: 30, // 5 (div) + 25 (positive class)
+			description:   "div base score + positive class weight",
+		},
+		{
+			name:          "div with negative class",
+			html:          ``,
+			expectedScore: -20, // 5 (div) + (-25) (negative class)
+			description:   "div base score + negative class weight",
+		},
+		{
+			name: "div with positive id",
+			html: `Content
`,
+			expectedScore: 30, // 5 (div) + 25 (positive id)
+			description:   "div base score + positive id weight",
+		},
+		{
+			name:          "div with negative id",
+			html:          ``,
+			expectedScore: -20, // 5 (div) + (-25) (negative id)
+			description:   "div base score + negative id weight",
+		},
+		{
+			name: "div with both positive class and id",
+			html: `Content
`,
+			expectedScore: 55, // 5 (div) + 25 (positive class) + 25 (positive id)
+			description:   "div base score + positive class weight + positive id weight",
+		},
+		{
+			name:          "div with both negative class and id",
+			html:          ``,
+			expectedScore: -45, // 5 (div) + (-25) (negative class) + (-25) (negative id)
+			description:   "div base score + negative class weight + negative id weight",
+		},
+		{
+			name:          "div with mixed class and id weights",
+			html:          ``,
+			expectedScore: 5, // 5 (div) + 25 (positive class) + (-25) (negative id)
+			description:   "div base score + positive class weight + negative id weight",
+		},
+		{
+			// The +25 class weight outweighs the -5 heading penalty, so the
+			// total is positive.
+			name: "h1 with positive class (positive weight outweighs heading penalty)",
+			html: `Heading
`,
+			expectedScore: 20, // -5 (h1) + 25 (positive class)
+			description:   "h1 base score + positive class weight",
+		},
+		{
+			name:          "ul with negative class (more negative)",
+			html:          ``,
+			expectedScore: -28, // -3 (ul) + (-25) (negative class)
+			description:   "ul base score + negative class weight",
+		},
+		{
+			name: "p with neutral class/id (no weight change)",
+			html: `Paragraph
`,
+			expectedScore: 0, // 0 (p) + 0 (neutral class) + 0 (neutral id)
+			description:   "p base score with neutral class and id",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			// Find the first non-html/body element
+			selection := doc.Find("div, h1, h2, h3, h4, h5, h6, ul, ol, p, pre, blockquote, img, td, th, address, dl, dd, dt, li, form, span").First()
+			if selection.Length() == 0 {
+				t.Fatal("Could not find element")
+			}
+
+			// Named "c" so the candidate type is not shadowed.
+			c := scoreNode(selection)
+
+			if c.score != tc.expectedScore {
+				t.Errorf("%s: Expected score %f, got %f", tc.description, tc.expectedScore, c.score)
+			}
+		})
+	}
+}
+
+// TestScoreNodeEdgeCases covers scoreNode inputs outside the single-element
+// case: an empty selection, a selection holding several elements, and
+// selections taken at different nesting levels of one document.
+func TestScoreNodeEdgeCases(t *testing.T) {
+	t.Run("empty selection", func(t *testing.T) {
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(``))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Create empty selection
+		emptySelection := doc.Find("nonexistent")
+		if emptySelection.Length() != 0 {
+			t.Fatal("Expected empty selection")
+		}
+
+		// scoreNode should handle empty selection gracefully
+		c := scoreNode(emptySelection)
+		if c == nil {
+			// Fatal rather than Error: every check below dereferences c.
+			t.Fatal("Expected non-nil candidate even for empty selection")
+		}
+
+		// Should have score 0 and empty selection
+		if c.score != 0 {
+			t.Errorf("Expected score 0 for empty selection, got %f", c.score)
+		}
+
+		if c.selection.Length() != 0 {
+			t.Error("Expected candidate to preserve empty selection")
+		}
+
+		// Node() should return nil for empty selection
+		if c.Node() != nil {
+			t.Error("Expected Node() to return nil for empty selection")
+		}
+
+		// String() should handle empty selection gracefully
+		str := c.String()
+		expected := "empty => 0.000000"
+		if str != expected {
+			t.Errorf("Expected String() to return %q, got %q", expected, str)
+		}
+	})
+
+	t.Run("multiple elements in selection", func(t *testing.T) {
+		input := ``
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(input))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Select all p elements
+		selection := doc.Find("p")
+		if selection.Length() != 2 {
+			t.Fatalf("Expected 2 p elements, got %d", selection.Length())
+		}
+
+		// scoreNode should only consider the first element in the selection
+		c := scoreNode(selection)
+
+		// Should score based on first p element (class="article")
+		expectedScore := float32(25) // 0 (p) + 25 (positive class)
+		if c.score != expectedScore {
+			t.Errorf("Expected score %f, got %f", expectedScore, c.score)
+		}
+
+		if node := c.Node(); node == nil {
+			t.Error("Expected valid node, got nil")
+		} else if node.Data != "p" {
+			t.Errorf("Expected node tag p, got %s", node.Data)
+		}
+	})
+
+	t.Run("nested elements", func(t *testing.T) {
+		input := ``
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(input))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Test scoring each level
+		divSelection := doc.Find("div")
+		divCandidate := scoreNode(divSelection)
+		expectedDivScore := float32(30) // 5 (div) + 25 (positive class)
+		if divCandidate.score != expectedDivScore {
+			t.Errorf("Div score: expected %f, got %f", expectedDivScore, divCandidate.score)
+		}
+
+		pSelection := doc.Find("p")
+		pCandidate := scoreNode(pSelection)
+		expectedPScore := float32(25) // 0 (p) + 25 (positive class)
+		if pCandidate.score != expectedPScore {
+			t.Errorf("P score: expected %f, got %f", expectedPScore, pCandidate.score)
+		}
+
+		spanSelection := doc.Find("span")
+		spanCandidate := scoreNode(spanSelection)
+		expectedSpanScore := float32(0) // 0 (span) + 0 (neutral class)
+		if spanCandidate.score != expectedSpanScore {
+			t.Errorf("Span score: expected %f, got %f", expectedSpanScore, spanCandidate.score)
+		}
+	})
+}
+
+func TestTransformMisusedDivsIntoParagraphs(t *testing.T) {
+ testCases := []struct {
+ name string
+ input string
+ expected string
+ description string
+ }{
+ {
+ name: "div with only text should become paragraph",
+ input: `Simple text content
`,
+ expected: `Simple text content
`,
+ description: "div containing only text should be converted to p",
+ },
+ {
+ name: "div with inline elements should become paragraph",
+ input: `Text with inline and emphasis
`,
+ expected: `Text with inline and emphasis
`,
+ description: "div with inline elements should be converted to p",
+ },
+ {
+ name: "div with strong and other inline elements",
+ input: `Some bold and italic text
`,
+ expected: `Some bold and italic text
`,
+ description: "div with inline formatting should be converted to p",
+ },
+ {
+ name: "div with anchor tag should NOT become paragraph",
+ input: ``,
+ expected: ``,
+ description: "div containing anchor tag should remain div (matches regex)",
+ },
+ {
+ name: "div with paragraph should NOT become paragraph",
+ input: ``,
+ expected: ``,
+ description: "div containing p tag should remain div",
+ },
+ {
+ name: "div with blockquote should NOT become paragraph",
+ input: ``,
+ expected: ``,
+ description: "div containing blockquote should remain div",
+ },
+ {
+ name: "div with nested div should NOT become paragraph",
+ input: ``,
+ expected: ``,
+ description: "outer div has nested div (matches regex), inner div has text only (gets converted)",
+ },
+ {
+ name: "div with img should NOT become paragraph",
+ input: ``,
+ expected: ``,
+ description: "div containing img should remain div",
+ },
+ {
+ name: "div with ol should NOT become paragraph",
+ input: ``,
+ expected: ``,
+ description: "div containing ol should remain div",
+ },
+ {
+ name: "div with ul should NOT become paragraph",
+ input: ``,
+ expected: ``,
+ description: "div containing ul should remain div",
+ },
+ {
+ name: "div with pre should NOT become paragraph",
+ input: ``,
+ expected: ``,
+ description: "div containing pre should remain div",
+ },
+ {
+ name: "div with table should NOT become paragraph",
+ input: ``,
+ expected: ``,
+ description: "div containing table should remain div (note: GoQuery adds tbody)",
+ },
+ {
+ name: "div with dl should NOT become paragraph",
+ input: ``,
+ expected: ``,
+ description: "div containing dl should remain div",
+ },
+ {
+ name: "empty div should become paragraph",
+ input: ``,
+ expected: ``,
+ description: "empty div should be converted to p",
+ },
+ {
+ name: "div with only whitespace should become paragraph",
+ input: `
`,
+ expected: `
`,
+ description: "div with only whitespace should be converted to p",
+ },
+ {
+ name: "div with self-closing anchor tag should NOT become paragraph",
+ input: ``,
+ expected: ``,
+ description: "div with self-closing anchor should remain div (note: GoQuery normalizes self-closing tags)",
+ },
+ {
+ name: "case insensitive matching - uppercase A",
+ input: ``,
+ expected: ``,
+ description: "regex should be case insensitive (note: GoQuery normalizes case)",
+ },
+ {
+ name: "case insensitive matching - uppercase IMG",
+ input: ``,
+ expected: ``,
+ description: "regex should be case insensitive (note: GoQuery normalizes case)",
+ },
+ {
+ name: "multiple divs transformation",
+ input: `Text only
More text
`,
+ expected: `Text only
More text
`,
+ description: "should transform multiple divs appropriately",
+ },
+ {
+ name: "nested divs where inner gets transformed",
+ input: ``,
+ expected: ``,
+ description: "inner div should be transformed even if outer div isn't",
+ },
+ }
+
+ for _, tc := range testCases {
+ t.Run(tc.name, func(t *testing.T) {
+ // Wrap input in a basic HTML structure
+ html := fmt.Sprintf(`%s`, tc.input)
+
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+ if err != nil {
+ t.Fatalf("Failed to parse HTML: %v", err)
+ }
+
+ // Apply the transformation
+ transformMisusedDivsIntoParagraphs(doc)
+
+ // Extract the body content
+ bodyHtml, err := doc.Find("body").Html()
+ if err != nil {
+ t.Fatalf("Failed to get body HTML: %v", err)
+ }
+
+ // Clean up whitespace for comparison
+ result := strings.TrimSpace(bodyHtml)
+ expected := strings.TrimSpace(tc.expected)
+
+ if result != expected {
+ t.Errorf("%s\nExpected: %s\nGot: %s", tc.description, expected, result)
+ }
+ })
+ }
+}
+
+func TestTransformMisusedDivsIntoParagraphsRegexPattern(t *testing.T) {
+ // Test the regex pattern directly to ensure it matches the expected elements
+ testCases := []struct {
+ name string
+ html string
+ shouldMatch bool
+ description string
+ }{
+ {
+ name: "anchor tag",
+ html: `link`,
+ shouldMatch: true,
+ description: "should match anchor tags",
+ },
+ {
+ name: "blockquote tag",
+ html: `quote
`,
+ shouldMatch: true,
+ description: "should match blockquote tags",
+ },
+ {
+ name: "dl tag",
+ html: `- term
`,
+ shouldMatch: true,
+ description: "should match dl tags",
+ },
+ {
+ name: "div tag",
+ html: `content
`,
+ shouldMatch: true,
+ description: "should match div tags",
+ },
+ {
+ name: "img tag",
+ html: `
`,
+ shouldMatch: true,
+ description: "should match img tags",
+ },
+ {
+ name: "ol tag",
+ html: `- item
`,
+ shouldMatch: true,
+ description: "should match ol tags",
+ },
+ {
+ name: "p tag",
+ html: `paragraph
`,
+ shouldMatch: true,
+ description: "should match p tags",
+ },
+ {
+ name: "pre tag",
+ html: `code
`,
+ shouldMatch: true,
+ description: "should match pre tags",
+ },
+ {
+ name: "table tag",
+ html: ``,
+ shouldMatch: true,
+ description: "should match table tags",
+ },
+ {
+ name: "ul tag",
+ html: ``,
+ shouldMatch: true,
+ description: "should match ul tags",
+ },
+ {
+ name: "self-closing anchor",
+ html: ``,
+ shouldMatch: true,
+ description: "should match self-closing anchor tags",
+ },
+ {
+ name: "tag with attributes",
+ html: `text`,
+ shouldMatch: true,
+ description: "should match tags with attributes",
+ },
+ {
+ name: "uppercase tags",
+ html: `link`,
+ shouldMatch: true,
+ description: "should be case insensitive",
+ },
+ {
+ name: "mixed case tags",
+ html: `
`,
+ shouldMatch: true,
+ description: "should match mixed case tags",
+ },
+ {
+ name: "span tag",
+ html: `text`,
+ shouldMatch: false,
+ description: "should NOT match span tags",
+ },
+ {
+ name: "em tag",
+ html: `emphasis`,
+ shouldMatch: false,
+ description: "should NOT match em tags",
+ },
+ {
+ name: "strong tag",
+ html: `bold`,
+ shouldMatch: false,
+ description: "should NOT match strong tags",
+ },
+ {
+ name: "i tag",
+ html: `italic`,
+ shouldMatch: false,
+ description: "should NOT match i tags",
+ },
+ {
+ name: "b tag",
+ html: `bold`,
+ shouldMatch: false,
+ description: "should NOT match b tags",
+ },
+ {
+ name: "plain text",
+ html: `just plain text`,
+ shouldMatch: false,
+ description: "should NOT match plain text",
+ },
+ {
+ name: "empty string",
+ html: ``,
+ shouldMatch: false,
+ description: "should NOT match empty string",
+ },
+ }
+
+ for _, tc := range testCases {
+ t.Run(tc.name, func(t *testing.T) {
+ result := divToPElementsRegexp.MatchString(tc.html)
+ if result != tc.shouldMatch {
+ t.Errorf("%s\nHTML: %s\nExpected match: %v, Got: %v", tc.description, tc.html, tc.shouldMatch, result)
+ }
+ })
+ }
+}
+
+func TestTransformMisusedDivsIntoParagraphsEdgeCases(t *testing.T) {
+ t.Run("document with no divs", func(t *testing.T) {
+ html := `No divs here
Just other elements`
+
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // Should not panic or cause issues
+ transformMisusedDivsIntoParagraphs(doc)
+
+ bodyHtml, _ := doc.Find("body").Html()
+ expected := `No divs here
Just other elements`
+
+ if strings.TrimSpace(bodyHtml) != expected {
+ t.Errorf("Expected no changes to document without divs")
+ }
+ })
+
+ t.Run("empty document", func(t *testing.T) {
+ html := ``
+
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // Should not panic with empty document
+ transformMisusedDivsIntoParagraphs(doc)
+
+ bodyHtml, _ := doc.Find("body").Html()
+ if strings.TrimSpace(bodyHtml) != "" {
+ t.Errorf("Expected empty body to remain empty")
+ }
+ })
+
+ t.Run("deeply nested divs", func(t *testing.T) {
+ html := ``
+
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ transformMisusedDivsIntoParagraphs(doc)
+
+ bodyHtml, _ := doc.Find("body").Html()
+ // The outer divs contain other divs (matches regex), so they remain divs
+ // Only the innermost div with just text gets converted to p
+ expected := ``
+
+ if strings.TrimSpace(bodyHtml) != expected {
+ t.Errorf("Expected nested div transformation\nGot: %s\nExpected: %s", strings.TrimSpace(bodyHtml), expected)
+ }
+ })
+
+ t.Run("complex mixed content", func(t *testing.T) {
+ html := `
+ Text only div
+
+ Inline text
+
+ `
+
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ transformMisusedDivsIntoParagraphs(doc)
+
+ // Count paragraphs and divs
+ pCount := doc.Find("p").Length()
+ divCount := doc.Find("div").Length()
+
+ // Should have 3 paragraphs (original p + 2 converted divs) and 2 divs (link div + block element div)
+ expectedPCount := 3
+ expectedDivCount := 2
+
+ if pCount != expectedPCount {
+ t.Errorf("Expected %d paragraphs, got %d", expectedPCount, pCount)
+ }
+ if divCount != expectedDivCount {
+ t.Errorf("Expected %d divs, got %d", expectedDivCount, divCount)
+ }
+ })
+}
+
+func TestCandidateString(t *testing.T) {
+ testCases := []struct {
+ name string
+ html string
+ expected string
+ setup func(*goquery.Document) *candidate
+ }{
+ {
+ name: "empty candidate",
+ html: ``,
+ expected: "empty => 0.000000",
+ setup: func(doc *goquery.Document) *candidate {
+ emptySelection := doc.Find("nonexistent")
+ return &candidate{selection: emptySelection, score: 0}
+ },
+ },
+ {
+ name: "candidate with no class or id",
+ html: `Content
`,
+ expected: "div => 5.000000",
+ setup: func(doc *goquery.Document) *candidate {
+ selection := doc.Find("div")
+ return scoreNode(selection)
+ },
+ },
+ {
+ name: "candidate with class only",
+ html: `Content
`,
+ expected: "div.content => 30.000000",
+ setup: func(doc *goquery.Document) *candidate {
+ selection := doc.Find("div")
+ return scoreNode(selection)
+ },
+ },
+ {
+ name: "candidate with id only",
+ html: `Content
`,
+ expected: "div#main => 30.000000",
+ setup: func(doc *goquery.Document) *candidate {
+ selection := doc.Find("div")
+ return scoreNode(selection)
+ },
+ },
+ {
+ name: "candidate with both class and id",
+ html: `Content
`,
+ expected: "div#main.content => 55.000000",
+ setup: func(doc *goquery.Document) *candidate {
+ selection := doc.Find("div")
+ return scoreNode(selection)
+ },
+ },
+ {
+ name: "candidate with multiple classes",
+ html: `Content
`,
+ expected: "div.article main content => 30.000000",
+ setup: func(doc *goquery.Document) *candidate {
+ selection := doc.Find("div")
+ return scoreNode(selection)
+ },
+ },
+ {
+ name: "paragraph candidate with negative class",
+ html: ``,
+ expected: "p.comment => -25.000000",
+ setup: func(doc *goquery.Document) *candidate {
+ selection := doc.Find("p")
+ return scoreNode(selection)
+ },
+ },
+ {
+ name: "heading candidate with positive id",
+ html: `Heading
`,
+ expected: "h1#main => 20.000000",
+ setup: func(doc *goquery.Document) *candidate {
+ selection := doc.Find("h1")
+ return scoreNode(selection)
+ },
+ },
+ {
+ name: "candidate with special characters in class",
+ html: `Content
`,
+ expected: "div.my-class_name => 5.000000",
+ setup: func(doc *goquery.Document) *candidate {
+ selection := doc.Find("div")
+ return scoreNode(selection)
+ },
+ },
+ {
+ name: "candidate with empty class attribute",
+ html: `Content
`,
+ expected: "div => 5.000000",
+ setup: func(doc *goquery.Document) *candidate {
+ selection := doc.Find("div")
+ return scoreNode(selection)
+ },
+ },
+ {
+ name: "candidate with empty id attribute",
+ html: `Content
`,
+ expected: "div => 5.000000",
+ setup: func(doc *goquery.Document) *candidate {
+ selection := doc.Find("div")
+ return scoreNode(selection)
+ },
+ },
+ {
+ name: "custom score candidate",
+ html: `Content`,
+ expected: "span => 42.500000",
+ setup: func(doc *goquery.Document) *candidate {
+ selection := doc.Find("span")
+ c := scoreNode(selection)
+ c.score = 42.5 // Override score for testing
+ return c
+ },
+ },
+ }
+
+ for _, tc := range testCases {
+ t.Run(tc.name, func(t *testing.T) {
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
+ if err != nil {
+ t.Fatalf("Failed to parse HTML: %v", err)
+ }
+
+ candidate := tc.setup(doc)
+ result := candidate.String()
+
+ if result != tc.expected {
+ t.Errorf("Expected: %s, Got: %s", tc.expected, result)
+ }
+ })
+ }
+}
+
+// TestCandidateListString checks candidateList.String. Map iteration order is
+// not guaranteed, so lists holding several candidates are verified by looking
+// for every fragment in expectedParts plus the ", " separator. Cases without
+// expectedParts are compared against expected verbatim.
+func TestCandidateListString(t *testing.T) {
+	testCases := []struct {
+		name          string
+		html          string
+		expected      string
+		expectedParts []string
+		setup         func(*goquery.Document) candidateList
+	}{
+		{
+			name:     "empty candidate list",
+			html:     ``,
+			expected: "",
+			setup: func(doc *goquery.Document) candidateList {
+				return make(candidateList)
+			},
+		},
+		{
+			name: "single candidate",
+			html: `Content
`,
+			expected: "div.content => 30.000000",
+			setup: func(doc *goquery.Document) candidateList {
+				candidates := make(candidateList)
+				selection := doc.Find("div")
+				candidates[selection.Get(0)] = scoreNode(selection)
+				return candidates
+			},
+		},
+		{
+			name: "multiple candidates",
+			html: `Content
Paragraph
Title
`,
+			expectedParts: []string{"div.content => 30.000000", "p.text => 25.000000", "h1#main => 20.000000"},
+			setup: func(doc *goquery.Document) candidateList {
+				candidates := make(candidateList)
+
+				divSelection := doc.Find("div")
+				divCandidate := scoreNode(divSelection)
+				candidates[divSelection.Get(0)] = divCandidate
+
+				pSelection := doc.Find("p")
+				pCandidate := scoreNode(pSelection)
+				candidates[pSelection.Get(0)] = pCandidate
+
+				h1Selection := doc.Find("h1")
+				h1Candidate := scoreNode(h1Selection)
+				candidates[h1Selection.Get(0)] = h1Candidate
+
+				return candidates
+			},
+		},
+		{
+			name: "candidates with mixed scores",
+			html: `Good content
`,
+			expectedParts: []string{"div.comment => -20.000000", "p.content => 25.000000"},
+			setup: func(doc *goquery.Document) candidateList {
+				candidates := make(candidateList)
+
+				divSelection := doc.Find("div")
+				divCandidate := scoreNode(divSelection)
+				candidates[divSelection.Get(0)] = divCandidate
+
+				pSelection := doc.Find("p")
+				pCandidate := scoreNode(pSelection)
+				candidates[pSelection.Get(0)] = pCandidate
+
+				return candidates
+			},
+		},
+		{
+			name: "candidate with empty selection",
+			html: `Test
`,
+			// Should contain both the regular candidate and the empty one
+			expectedParts: []string{"div => 5.000000", "empty => 0.000000"},
+			setup: func(doc *goquery.Document) candidateList {
+				candidates := make(candidateList)
+
+				// Add a regular candidate
+				divSelection := doc.Find("div")
+				divCandidate := scoreNode(divSelection)
+				candidates[divSelection.Get(0)] = divCandidate
+
+				// Add a candidate with empty selection (this is artificial but tests the edge case)
+				emptySelection := doc.Find("nonexistent")
+				emptyCandidate := &candidate{selection: emptySelection, score: 0}
+				// We can't use emptySelection.Get(0) as key since it would panic,
+				// so we'll create a dummy node for this test
+				dummyNode := &html.Node{Type: html.ElementNode, Data: "dummy"}
+				candidates[dummyNode] = emptyCandidate
+
+				return candidates
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
+			if err != nil {
+				t.Fatalf("Failed to parse HTML: %v", err)
+			}
+
+			result := tc.setup(doc).String()
+
+			// With at most one candidate the output is deterministic, so it
+			// can be compared verbatim.
+			if len(tc.expectedParts) == 0 {
+				if result != tc.expected {
+					t.Errorf("Expected: %s, Got: %s", tc.expected, result)
+				}
+				return
+			}
+
+			// Several candidates: check that every expected fragment is
+			// present, independent of map iteration order.
+			for _, part := range tc.expectedParts {
+				if !strings.Contains(result, part) {
+					t.Errorf("Expected result to contain: %s, Got: %s", part, result)
+				}
+			}
+			// Check that it's comma-separated
+			if !strings.Contains(result, ", ") {
+				t.Errorf("Expected comma-separated format, Got: %s", result)
+			}
+		})
+	}
+}
+
+// TestCandidateStringEdgeCases checks candidate.String for candidates built
+// by hand (empty selection, zero score, negative score) and for an element
+// with long id and class values.
+func TestCandidateStringEdgeCases(t *testing.T) {
+	t.Run("candidate with empty selection and non-zero score", func(t *testing.T) {
+		// The selection matches nothing, so Node() returns nil and String()
+		// must fall back to the "empty" form while still printing the score.
+		input := `Test
`
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(input))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		emptySelection := doc.Find("nonexistent")
+		c := &candidate{
+			selection: emptySelection,
+			score:     10.5,
+		}
+
+		result := c.String()
+		expected := "empty => 10.500000"
+
+		if result != expected {
+			t.Errorf("Expected: %s, Got: %s", expected, result)
+		}
+	})
+
+	t.Run("candidate with zero score", func(t *testing.T) {
+		input := `Test
`
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(input))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		selection := doc.Find("div")
+		c := &candidate{
+			selection: selection,
+			score:     0,
+		}
+
+		result := c.String()
+		expected := "div => 0.000000"
+
+		if result != expected {
+			t.Errorf("Expected: %s, Got: %s", expected, result)
+		}
+	})
+
+	t.Run("candidate with negative score", func(t *testing.T) {
+		input := `Test
`
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(input))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		selection := doc.Find("h1")
+		c := &candidate{
+			selection: selection,
+			score:     -10.5,
+		}
+
+		result := c.String()
+		expected := "h1 => -10.500000"
+
+		if result != expected {
+			t.Errorf("Expected: %s, Got: %s", expected, result)
+		}
+	})
+
+	t.Run("candidate with very long class and id", func(t *testing.T) {
+		input := `Test
`
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(input))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		selection := doc.Find("div")
+		c := scoreNode(selection)
+
+		result := c.String()
+		expected := "div#very-long-id-name-that-might-also-cause-formatting-issues.very-long-class-name-that-might-cause-issues => 5.000000"
+
+		if result != expected {
+			t.Errorf("Expected: %s, Got: %s", expected, result)
+		}
+	})
+}