From 6eeccae7cd3b798a94360b192ada636f1b9eaa0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Mon, 30 Jun 2025 21:18:39 -0700 Subject: [PATCH] test(readability): increase test coverage --- internal/reader/readability/readability.go | 44 +- .../reader/readability/readability_test.go | 1211 ++++++++++++++++- 2 files changed, 1209 insertions(+), 46 deletions(-) diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go index 50c7bb30..c423f1a7 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -27,8 +27,8 @@ var ( maybeCandidate = [...]string{"and", "article", "body", "column", "main", "shadow"} unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"} - positive = [...]string{"article", "blog", "body", "content", "entry", "h-entry", "hentry", "main", "page", "pagination", "post", "story", "text"} - negative = [...]string{"author", "banner", "byline", "com-", "combx", "comment", "contact", "dateline", "foot", "hid", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shopping", "shoutbox", "sidebar", "skyscraper", "sponsor", "tags", "tool", "widget", "writtenby"} + positiveKeywords = [...]string{"article", "blog", "body", "content", "entry", "h-entry", "hentry", "main", "page", "pagination", "post", "story", "text"} + negativeKeywords = [...]string{"author", "banner", "byline", "com-", "combx", "comment", "contact", "dateline", "foot", "hid", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shopping", "shoutbox", "sidebar", "skyscraper", "sponsor", "tags", "tool", "widget", "writtenby"} ) type candidate struct { @@ -37,23 +37,31 @@ type candidate struct { } func (c *candidate) Node() *html.Node { + if c.selection.Length() == 0 { + return nil + } return c.selection.Get(0) } func (c *candidate) String() string { + node := c.Node() + if node == nil { + return fmt.Sprintf("empty => %f", c.score) + } + id, _ := c.selection.Attr("id") class, _ := c.selection.Attr("class") switch { case id != "" && class != "": - return fmt.Sprintf("%s#%s.%s => %f", c.Node().DataAtom, id, class, c.score) + return fmt.Sprintf("%s#%s.%s => %f", node.DataAtom, id, class, c.score) case id != "": - return fmt.Sprintf("%s#%s => %f", c.Node().DataAtom, id, c.score) + return fmt.Sprintf("%s#%s => %f", node.DataAtom, id, c.score) case class != "": - return fmt.Sprintf("%s.%s => %f", c.Node().DataAtom, class, c.score) + return fmt.Sprintf("%s.%s => %f", node.DataAtom, class, c.score) } - return fmt.Sprintf("%s => %f", c.Node().DataAtom, c.score) + return fmt.Sprintf("%s => %f", node.DataAtom, c.score) } type candidateList map[*html.Node]*candidate @@ -111,7 +119,8 @@ func getArticle(topCandidate *candidate, candidates candidateList) string { tag := "div" node := s.Get(0) - if node == topCandidate.Node() { + topNode := topCandidate.Node() + if topNode != nil && node == topNode { append = true } else if c, ok := candidates[node]; ok && c.score >= siblingScoreThreshold { append = true @@ -147,14 +156,14 @@ func shouldRemoveCandidate(str string) bool { str = strings.ToLower(str) // Those candidates have no false-positives, no need to check against `maybeCandidate` - for _, strong := range strongCandidates { - if strings.Contains(str, strong) { + for _, strongCandidate := range strongCandidates { + if strings.Contains(str, strongCandidate) { return true } } - for _, unlikely := range unlikelyCandidate { - if strings.Contains(str, unlikely) { + for _, unlikelyCandidate := range unlikelyCandidate { + if strings.Contains(str, unlikelyCandidate) { // Do we have a false positive? for _, maybe := range maybeCandidate { if strings.Contains(str, maybe) { @@ -268,6 +277,11 @@ func getCandidates(document *goquery.Document) candidateList { func scoreNode(s *goquery.Selection) *candidate { c := &candidate{selection: s, score: 0} + // Check if selection is empty to avoid panic + if s.Length() == 0 { + return c + } + switch s.Get(0).DataAtom.String() { case "div": c.score += 5 @@ -314,13 +328,13 @@ func getClassWeight(s *goquery.Selection) float32 { func getWeight(s string) int { s = strings.ToLower(s) - for _, pos := range negative { - if strings.Contains(s, pos) { + for _, keyword := range negativeKeywords { + if strings.Contains(s, keyword) { return -25 } } - for _, pos := range positive { - if strings.Contains(s, pos) { + for _, keyword := range positiveKeywords { + if strings.Contains(s, keyword) { return +25 } } diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go index cf3188e6..96813d42 100644 --- a/internal/reader/readability/readability_test.go +++ b/internal/reader/readability/readability_test.go @@ -11,8 +11,60 @@ import ( "testing" "github.com/PuerkitoBio/goquery" + "golang.org/x/net/html" ) +func BenchmarkExtractContent(b *testing.B) { + var testCases = map[string][]byte{ + "miniflux_github.html": {}, + "miniflux_wikipedia.html": {}, + } + for filename := range testCases { + data, err := os.ReadFile("testdata/" + filename) + if err != nil { + b.Fatalf(`Unable to read file %q: %v`, filename, err) + } + testCases[filename] = data + } + for range b.N { + for _, v := range testCases { + ExtractContent(bytes.NewReader(v)) + } + } +} + +func BenchmarkGetWeight(b *testing.B) { + testCases := []string{ + "p-3 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content", + "d-flex flex-column mb-3", + "AppHeader-search-control AppHeader-search-control-overflow", + "Button Button--iconOnly Button--invisible Button--medium mr-1 px-2 py-0 d-flex flex-items-center rounded-1 color-fg-muted", + "sr-only", + "validation-12753bbc-b4d1-4e10-bec6-92e585d1699d", + } + for range b.N { + for _, v := range testCases { + getWeight(v) + } + } +} + +func BenchmarkTransformMisusedDivsIntoParagraphs(b *testing.B) { + html := ` +
Simple text content
+
More inline content
+
Link content
+

Paragraph content

+
Another simple text
+ ` + + b.ResetTimer() + for i := 0; i < b.N; i++ { + doc, _ := goquery.NewDocumentFromReader(strings.NewReader(html)) + transformMisusedDivsIntoParagraphs(doc) + } +} + func TestBaseURL(t *testing.T) { html := ` @@ -189,25 +241,6 @@ func TestNestedSpanInCodeBlock(t *testing.T) { } } -func BenchmarkExtractContent(b *testing.B) { - var testCases = map[string][]byte{ - "miniflux_github.html": {}, - "miniflux_wikipedia.html": {}, - } - for filename := range testCases { - data, err := os.ReadFile("testdata/" + filename) - if err != nil { - b.Fatalf(`Unable to read file %q: %v`, filename, err) - } - testCases[filename] = data - } - for range b.N { - for _, v := range testCases { - ExtractContent(bytes.NewReader(v)) - } - } -} - func TestGetClassWeight(t *testing.T) { testCases := []struct { name string @@ -1315,18 +1348,1134 @@ func TestContainsSentence(t *testing.T) { } } -func BenchmarkGetWeight(b *testing.B) { - testCases := []string{ - "p-3 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content", - "d-flex flex-column mb-3", - "AppHeader-search-control AppHeader-search-control-overflow", - "Button Button--iconOnly Button--invisible Button--medium mr-1 px-2 py-0 d-flex flex-items-center rounded-1 color-fg-muted", - "sr-only", - "validation-12753bbc-b4d1-4e10-bec6-92e585d1699d", +func TestScoreNode(t *testing.T) { + testCases := []struct { + name string + html string + expectedScore float32 + expectedTag string + }{ + { + name: "div element with no class or id", + html: `
Some content
`, + expectedScore: 5, + expectedTag: "div", + }, + { + name: "pre element with no class or id", + html: `
Some code
`, + expectedScore: 3, + expectedTag: "pre", + }, + { + name: "td element with no class or id", + html: `
Table cell
`, + expectedScore: 3, + expectedTag: "td", + }, + { + name: "blockquote element with no class or id", + html: `
Quote
`, + expectedScore: 3, + expectedTag: "blockquote", + }, + { + name: "img element with no class or id", + html: `test`, + expectedScore: 3, + expectedTag: "img", + }, + { + name: "ol element with no class or id", + html: `
  1. Item
`, + expectedScore: -3, + expectedTag: "ol", + }, + { + name: "ul element with no class or id", + html: ``, + expectedScore: -3, + expectedTag: "ul", + }, + { + name: "address element with no class or id", + html: `
Contact info
`, + expectedScore: -3, + expectedTag: "address", + }, + { + name: "dl element with no class or id", + html: `
Term
Definition
`, + expectedScore: -3, + expectedTag: "dl", + }, + { + name: "dd element with no class or id", + html: `
Definition
`, + expectedScore: -3, + expectedTag: "dd", + }, + { + name: "dt element with no class or id", + html: `
Term
`, + expectedScore: -3, + expectedTag: "dt", + }, + { + name: "li element with no class or id", + html: `
  • List item
  • `, + expectedScore: -3, + expectedTag: "li", + }, + { + name: "form element with no class or id", + html: `
    Form content
    `, + expectedScore: -3, + expectedTag: "form", + }, + { + name: "h1 element with no class or id", + html: `

    Heading

    `, + expectedScore: -5, + expectedTag: "h1", + }, + { + name: "h2 element with no class or id", + html: `

    Heading

    `, + expectedScore: -5, + expectedTag: "h2", + }, + { + name: "h3 element with no class or id", + html: `

    Heading

    `, + expectedScore: -5, + expectedTag: "h3", + }, + { + name: "h4 element with no class or id", + html: `

    Heading

    `, + expectedScore: -5, + expectedTag: "h4", + }, + { + name: "h5 element with no class or id", + html: `
    Heading
    `, + expectedScore: -5, + expectedTag: "h5", + }, + { + name: "h6 element with no class or id", + html: `
    Heading
    `, + expectedScore: -5, + expectedTag: "h6", + }, + { + name: "th element with no class or id", + html: `
    Header cell
    `, + expectedScore: -5, + expectedTag: "th", + }, + { + name: "p element with no class or id (default case)", + html: `

    Paragraph content

    `, + expectedScore: 0, + expectedTag: "p", + }, + { + name: "span element with no class or id (default case)", + html: `Span content`, + expectedScore: 0, + expectedTag: "span", + }, } - for range b.N { - for _, v := range testCases { - getWeight(v) - } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html)) + if err != nil { + t.Fatal(err) + } + + selection := doc.Find(tc.expectedTag) + if selection.Length() == 0 { + t.Fatalf("Could not find element with tag %s", tc.expectedTag) + } + + candidate := scoreNode(selection) + + if candidate.score != tc.expectedScore { + t.Errorf("Expected score %f, got %f", tc.expectedScore, candidate.score) + } + + if candidate.selection != selection { + t.Error("Expected selection to be preserved in candidate") + } + + if candidate.Node() == nil { + t.Errorf("Expected valid node, got nil") + } else if candidate.Node().Data != tc.expectedTag { + t.Errorf("Expected node tag %s, got %s", tc.expectedTag, candidate.Node().Data) + } + }) } } + +func TestScoreNodeWithClassWeights(t *testing.T) { + testCases := []struct { + name string + html string + expectedScore float32 + description string + }{ + { + name: "div with positive class", + html: `
    Content
    `, + expectedScore: 30, // 5 (div) + 25 (positive class) + description: "div base score + positive class weight", + }, + { + name: "div with negative class", + html: `
    Content
    `, + expectedScore: -20, // 5 (div) + (-25) (negative class) + description: "div base score + negative class weight", + }, + { + name: "div with positive id", + html: `
    Content
    `, + expectedScore: 30, // 5 (div) + 25 (positive id) + description: "div base score + positive id weight", + }, + { + name: "div with negative id", + html: ``, + expectedScore: -20, // 5 (div) + (-25) (negative id) + description: "div base score + negative id weight", + }, + { + name: "div with both positive class and id", + html: `
    Content
    `, + expectedScore: 55, // 5 (div) + 25 (positive class) + 25 (positive id) + description: "div base score + positive class weight + positive id weight", + }, + { + name: "div with both negative class and id", + html: ``, + expectedScore: -45, // 5 (div) + (-25) (negative class) + (-25) (negative id) + description: "div base score + negative class weight + negative id weight", + }, + { + name: "div with mixed class and id weights", + html: ``, + expectedScore: 5, // 5 (div) + 25 (positive class) + (-25) (negative id) + description: "div base score + positive class weight + negative id weight", + }, + { + name: "h1 with positive class (should still be negative overall)", + html: `

    Heading

    `, + expectedScore: 20, // -5 (h1) + 25 (positive class) + description: "h1 base score + positive class weight", + }, + { + name: "ul with negative class (more negative)", + html: ``, + expectedScore: -28, // -3 (ul) + (-25) (negative class) + description: "ul base score + negative class weight", + }, + { + name: "p with neutral class/id (no weight change)", + html: `

    Paragraph

    `, + expectedScore: 0, // 0 (p) + 0 (neutral class) + 0 (neutral id) + description: "p base score with neutral class and id", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html)) + if err != nil { + t.Fatal(err) + } + + // Find the first non-html/body element + selection := doc.Find("div, h1, h2, h3, h4, h5, h6, ul, ol, p, pre, blockquote, img, td, th, address, dl, dd, dt, li, form, span").First() + if selection.Length() == 0 { + t.Fatal("Could not find element") + } + + candidate := scoreNode(selection) + + if candidate.score != tc.expectedScore { + t.Errorf("%s: Expected score %f, got %f", tc.description, tc.expectedScore, candidate.score) + } + }) + } +} + +func TestScoreNodeEdgeCases(t *testing.T) { + t.Run("empty selection", func(t *testing.T) { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(`
    `)) + if err != nil { + t.Fatal(err) + } + + // Create empty selection + emptySelection := doc.Find("nonexistent") + if emptySelection.Length() != 0 { + t.Fatal("Expected empty selection") + } + + // scoreNode should handle empty selection gracefully + candidate := scoreNode(emptySelection) + if candidate == nil { + t.Error("Expected non-nil candidate even for empty selection") + } + + // Should have score 0 and empty selection + if candidate != nil && candidate.score != 0 { + t.Errorf("Expected score 0 for empty selection, got %f", candidate.score) + } + + if candidate.selection.Length() != 0 { + t.Error("Expected candidate to preserve empty selection") + } + + // Node() should return nil for empty selection + if candidate.Node() != nil { + t.Error("Expected Node() to return nil for empty selection") + } + + // String() should handle empty selection gracefully + str := candidate.String() + expected := "empty => 0.000000" + if str != expected { + t.Errorf("Expected String() to return %q, got %q", expected, str) + } + }) + + t.Run("multiple elements in selection", func(t *testing.T) { + html := `
    +

    First paragraph

    + +
    ` + + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + t.Fatal(err) + } + + // Select all p elements + selection := doc.Find("p") + if selection.Length() != 2 { + t.Fatalf("Expected 2 p elements, got %d", selection.Length()) + } + + // scoreNode should only consider the first element in the selection + candidate := scoreNode(selection) + + // Should score based on first p element (class="article") + expectedScore := float32(25) // 0 (p) + 25 (positive class) + if candidate.score != expectedScore { + t.Errorf("Expected score %f, got %f", expectedScore, candidate.score) + } + + if candidate.Node() == nil { + t.Error("Expected valid node, got nil") + } else if candidate.Node().Data != "p" { + t.Errorf("Expected node tag p, got %s", candidate.Node().Data) + } + }) + + t.Run("nested elements", func(t *testing.T) { + html := `
    +

    + Text +

    +
    ` + + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + t.Fatal(err) + } + + // Test scoring each level + divSelection := doc.Find("div") + divCandidate := scoreNode(divSelection) + expectedDivScore := float32(30) // 5 (div) + 25 (positive class) + if divCandidate.score != expectedDivScore { + t.Errorf("Div score: expected %f, got %f", expectedDivScore, divCandidate.score) + } + + pSelection := doc.Find("p") + pCandidate := scoreNode(pSelection) + expectedPScore := float32(25) // 0 (p) + 25 (positive class) + if pCandidate.score != expectedPScore { + t.Errorf("P score: expected %f, got %f", expectedPScore, pCandidate.score) + } + + spanSelection := doc.Find("span") + spanCandidate := scoreNode(spanSelection) + expectedSpanScore := float32(0) // 0 (span) + 0 (neutral class) + if spanCandidate.score != expectedSpanScore { + t.Errorf("Span score: expected %f, got %f", expectedSpanScore, spanCandidate.score) + } + }) +} + +func TestTransformMisusedDivsIntoParagraphs(t *testing.T) { + testCases := []struct { + name string + input string + expected string + description string + }{ + { + name: "div with only text should become paragraph", + input: `
    Simple text content
    `, + expected: `

    Simple text content

    `, + description: "div containing only text should be converted to p", + }, + { + name: "div with inline elements should become paragraph", + input: `
    Text with inline and emphasis
    `, + expected: `

    Text with inline and emphasis

    `, + description: "div with inline elements should be converted to p", + }, + { + name: "div with strong and other inline elements", + input: `
    Some bold and italic text
    `, + expected: `

    Some bold and italic text

    `, + description: "div with inline formatting should be converted to p", + }, + { + name: "div with anchor tag should NOT become paragraph", + input: `
    Text with link
    `, + expected: `
    Text with link
    `, + description: "div containing anchor tag should remain div (matches regex)", + }, + { + name: "div with paragraph should NOT become paragraph", + input: `

    Nested paragraph

    `, + expected: `

    Nested paragraph

    `, + description: "div containing p tag should remain div", + }, + { + name: "div with blockquote should NOT become paragraph", + input: `
    Quote
    `, + expected: `
    Quote
    `, + description: "div containing blockquote should remain div", + }, + { + name: "div with nested div should NOT become paragraph", + input: `
    Nested div
    `, + expected: `

    Nested div

    `, + description: "outer div has nested div (matches regex), inner div has text only (gets converted)", + }, + { + name: "div with img should NOT become paragraph", + input: `
    test
    `, + expected: `
    test
    `, + description: "div containing img should remain div", + }, + { + name: "div with ol should NOT become paragraph", + input: `
    1. Item
    `, + expected: `
    1. Item
    `, + description: "div containing ol should remain div", + }, + { + name: "div with ul should NOT become paragraph", + input: `
    `, + expected: `
    `, + description: "div containing ul should remain div", + }, + { + name: "div with pre should NOT become paragraph", + input: `
    Code block
    `, + expected: `
    Code block
    `, + description: "div containing pre should remain div", + }, + { + name: "div with table should NOT become paragraph", + input: `
    Cell
    `, + expected: `
    Cell
    `, + description: "div containing table should remain div (note: GoQuery adds tbody)", + }, + { + name: "div with dl should NOT become paragraph", + input: `
    Term
    Definition
    `, + expected: `
    Term
    Definition
    `, + description: "div containing dl should remain div", + }, + { + name: "empty div should become paragraph", + input: `
    `, + expected: `

    `, + description: "empty div should be converted to p", + }, + { + name: "div with only whitespace should become paragraph", + input: `
    `, + expected: `

    `, + description: "div with only whitespace should be converted to p", + }, + { + name: "div with self-closing anchor tag should NOT become paragraph", + input: `
    Text more text
    `, + expected: `
    Text more text
    `, + description: "div with self-closing anchor should remain div (note: GoQuery normalizes self-closing tags)", + }, + { + name: "case insensitive matching - uppercase A", + input: `
    Text with link
    `, + expected: `
    Text with link
    `, + description: "regex should be case insensitive (note: GoQuery normalizes case)", + }, + { + name: "case insensitive matching - uppercase IMG", + input: `
    `, + expected: `
    `, + description: "regex should be case insensitive (note: GoQuery normalizes case)", + }, + { + name: "multiple divs transformation", + input: `
    Text only

    Has paragraph

    More text
    `, + expected: `

    Text only

    Has paragraph

    More text

    `, + description: "should transform multiple divs appropriately", + }, + { + name: "nested divs where inner gets transformed", + input: `
    Inner text only

    Paragraph

    `, + expected: `

    Inner text only

    Paragraph

    `, + description: "inner div should be transformed even if outer div isn't", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Wrap input in a basic HTML structure + html := fmt.Sprintf(`%s`, tc.input) + + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + t.Fatalf("Failed to parse HTML: %v", err) + } + + // Apply the transformation + transformMisusedDivsIntoParagraphs(doc) + + // Extract the body content + bodyHtml, err := doc.Find("body").Html() + if err != nil { + t.Fatalf("Failed to get body HTML: %v", err) + } + + // Clean up whitespace for comparison + result := strings.TrimSpace(bodyHtml) + expected := strings.TrimSpace(tc.expected) + + if result != expected { + t.Errorf("%s\nExpected: %s\nGot: %s", tc.description, expected, result) + } + }) + } +} + +func TestTransformMisusedDivsIntoParagraphsRegexPattern(t *testing.T) { + // Test the regex pattern directly to ensure it matches the expected elements + testCases := []struct { + name string + html string + shouldMatch bool + description string + }{ + { + name: "anchor tag", + html: `link`, + shouldMatch: true, + description: "should match anchor tags", + }, + { + name: "blockquote tag", + html: `
    quote
    `, + shouldMatch: true, + description: "should match blockquote tags", + }, + { + name: "dl tag", + html: `
    term
    `, + shouldMatch: true, + description: "should match dl tags", + }, + { + name: "div tag", + html: `
    content
    `, + shouldMatch: true, + description: "should match div tags", + }, + { + name: "img tag", + html: ``, + shouldMatch: true, + description: "should match img tags", + }, + { + name: "ol tag", + html: `
    1. item
    `, + shouldMatch: true, + description: "should match ol tags", + }, + { + name: "p tag", + html: `

    paragraph

    `, + shouldMatch: true, + description: "should match p tags", + }, + { + name: "pre tag", + html: `
    code
    `, + shouldMatch: true, + description: "should match pre tags", + }, + { + name: "table tag", + html: `
    `, + shouldMatch: true, + description: "should match table tags", + }, + { + name: "ul tag", + html: ``, + shouldMatch: true, + description: "should match ul tags", + }, + { + name: "self-closing anchor", + html: ``, + shouldMatch: true, + description: "should match self-closing anchor tags", + }, + { + name: "tag with attributes", + html: `text`, + shouldMatch: true, + description: "should match tags with attributes", + }, + { + name: "uppercase tags", + html: `link`, + shouldMatch: true, + description: "should be case insensitive", + }, + { + name: "mixed case tags", + html: ``, + shouldMatch: true, + description: "should match mixed case tags", + }, + { + name: "span tag", + html: `text`, + shouldMatch: false, + description: "should NOT match span tags", + }, + { + name: "em tag", + html: `emphasis`, + shouldMatch: false, + description: "should NOT match em tags", + }, + { + name: "strong tag", + html: `bold`, + shouldMatch: false, + description: "should NOT match strong tags", + }, + { + name: "i tag", + html: `italic`, + shouldMatch: false, + description: "should NOT match i tags", + }, + { + name: "b tag", + html: `bold`, + shouldMatch: false, + description: "should NOT match b tags", + }, + { + name: "plain text", + html: `just plain text`, + shouldMatch: false, + description: "should NOT match plain text", + }, + { + name: "empty string", + html: ``, + shouldMatch: false, + description: "should NOT match empty string", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + result := divToPElementsRegexp.MatchString(tc.html) + if result != tc.shouldMatch { + t.Errorf("%s\nHTML: %s\nExpected match: %v, Got: %v", tc.description, tc.html, tc.shouldMatch, result) + } + }) + } +} + +func TestTransformMisusedDivsIntoParagraphsEdgeCases(t *testing.T) { + t.Run("document with no divs", func(t *testing.T) { + html := `

    No divs here

    Just other elements` + + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + t.Fatal(err) + } + + // Should not panic or cause issues + transformMisusedDivsIntoParagraphs(doc) + + bodyHtml, _ := doc.Find("body").Html() + expected := `

    No divs here

    Just other elements` + + if strings.TrimSpace(bodyHtml) != expected { + t.Errorf("Expected no changes to document without divs") + } + }) + + t.Run("empty document", func(t *testing.T) { + html := `` + + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + t.Fatal(err) + } + + // Should not panic with empty document + transformMisusedDivsIntoParagraphs(doc) + + bodyHtml, _ := doc.Find("body").Html() + if strings.TrimSpace(bodyHtml) != "" { + t.Errorf("Expected empty body to remain empty") + } + }) + + t.Run("deeply nested divs", func(t *testing.T) { + html := `
    Deep text
    ` + + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + t.Fatal(err) + } + + transformMisusedDivsIntoParagraphs(doc) + + bodyHtml, _ := doc.Find("body").Html() + // The outer divs contain other divs (matches regex), so they remain divs + // Only the innermost div with just text gets converted to p + expected := `

    Deep text

    ` + + if strings.TrimSpace(bodyHtml) != expected { + t.Errorf("Expected nested div transformation\nGot: %s\nExpected: %s", strings.TrimSpace(bodyHtml), expected) + } + }) + + t.Run("complex mixed content", func(t *testing.T) { + html := ` +
    Text only div
    +
    Link div
    +
    Inline text
    +

    Block element

    + ` + + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + t.Fatal(err) + } + + transformMisusedDivsIntoParagraphs(doc) + + // Count paragraphs and divs + pCount := doc.Find("p").Length() + divCount := doc.Find("div").Length() + + // Should have 3 paragraphs (original p + 2 converted divs) and 2 divs (link div + block element div) + expectedPCount := 3 + expectedDivCount := 2 + + if pCount != expectedPCount { + t.Errorf("Expected %d paragraphs, got %d", expectedPCount, pCount) + } + if divCount != expectedDivCount { + t.Errorf("Expected %d divs, got %d", expectedDivCount, divCount) + } + }) +} + +func TestCandidateString(t *testing.T) { + testCases := []struct { + name string + html string + expected string + setup func(*goquery.Document) *candidate + }{ + { + name: "empty candidate", + html: `
    `, + expected: "empty => 0.000000", + setup: func(doc *goquery.Document) *candidate { + emptySelection := doc.Find("nonexistent") + return &candidate{selection: emptySelection, score: 0} + }, + }, + { + name: "candidate with no class or id", + html: `
    Content
    `, + expected: "div => 5.000000", + setup: func(doc *goquery.Document) *candidate { + selection := doc.Find("div") + return scoreNode(selection) + }, + }, + { + name: "candidate with class only", + html: `
    Content
    `, + expected: "div.content => 30.000000", + setup: func(doc *goquery.Document) *candidate { + selection := doc.Find("div") + return scoreNode(selection) + }, + }, + { + name: "candidate with id only", + html: `
    Content
    `, + expected: "div#main => 30.000000", + setup: func(doc *goquery.Document) *candidate { + selection := doc.Find("div") + return scoreNode(selection) + }, + }, + { + name: "candidate with both class and id", + html: `
    Content
    `, + expected: "div#main.content => 55.000000", + setup: func(doc *goquery.Document) *candidate { + selection := doc.Find("div") + return scoreNode(selection) + }, + }, + { + name: "candidate with multiple classes", + html: `
    Content
    `, + expected: "div.article main content => 30.000000", + setup: func(doc *goquery.Document) *candidate { + selection := doc.Find("div") + return scoreNode(selection) + }, + }, + { + name: "paragraph candidate with negative class", + html: `

    Comment text

    `, + expected: "p.comment => -25.000000", + setup: func(doc *goquery.Document) *candidate { + selection := doc.Find("p") + return scoreNode(selection) + }, + }, + { + name: "heading candidate with positive id", + html: `

    Heading

    `, + expected: "h1#main => 20.000000", + setup: func(doc *goquery.Document) *candidate { + selection := doc.Find("h1") + return scoreNode(selection) + }, + }, + { + name: "candidate with special characters in class", + html: `
    Content
    `, + expected: "div.my-class_name => 5.000000", + setup: func(doc *goquery.Document) *candidate { + selection := doc.Find("div") + return scoreNode(selection) + }, + }, + { + name: "candidate with empty class attribute", + html: `
    Content
    `, + expected: "div => 5.000000", + setup: func(doc *goquery.Document) *candidate { + selection := doc.Find("div") + return scoreNode(selection) + }, + }, + { + name: "candidate with empty id attribute", + html: `
    Content
    `, + expected: "div => 5.000000", + setup: func(doc *goquery.Document) *candidate { + selection := doc.Find("div") + return scoreNode(selection) + }, + }, + { + name: "custom score candidate", + html: `Content`, + expected: "span => 42.500000", + setup: func(doc *goquery.Document) *candidate { + selection := doc.Find("span") + c := scoreNode(selection) + c.score = 42.5 // Override score for testing + return c + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html)) + if err != nil { + t.Fatalf("Failed to parse HTML: %v", err) + } + + candidate := tc.setup(doc) + result := candidate.String() + + if result != tc.expected { + t.Errorf("Expected: %s, Got: %s", tc.expected, result) + } + }) + } +} + +func TestCandidateListString(t *testing.T) { + testCases := []struct { + name string + html string + expected string + setup func(*goquery.Document) candidateList + }{ + { + name: "empty candidate list", + html: `
    `, + expected: "", + setup: func(doc *goquery.Document) candidateList { + return make(candidateList) + }, + }, + { + name: "single candidate", + html: `
    Content
    `, + expected: "div.content => 30.000000", + setup: func(doc *goquery.Document) candidateList { + candidates := make(candidateList) + selection := doc.Find("div") + candidate := scoreNode(selection) + candidates[selection.Get(0)] = candidate + return candidates + }, + }, + { + name: "multiple candidates", + html: `
    Content

    Paragraph

    Title

    `, + setup: func(doc *goquery.Document) candidateList { + candidates := make(candidateList) + + divSelection := doc.Find("div") + divCandidate := scoreNode(divSelection) + candidates[divSelection.Get(0)] = divCandidate + + pSelection := doc.Find("p") + pCandidate := scoreNode(pSelection) + candidates[pSelection.Get(0)] = pCandidate + + h1Selection := doc.Find("h1") + h1Candidate := scoreNode(h1Selection) + candidates[h1Selection.Get(0)] = h1Candidate + + return candidates + }, + }, + { + name: "candidates with mixed scores", + html: `
    Comment

    Good content

    `, + setup: func(doc *goquery.Document) candidateList { + candidates := make(candidateList) + + divSelection := doc.Find("div") + divCandidate := scoreNode(divSelection) + candidates[divSelection.Get(0)] = divCandidate + + pSelection := doc.Find("p") + pCandidate := scoreNode(pSelection) + candidates[pSelection.Get(0)] = pCandidate + + return candidates + }, + }, + { + name: "candidate with empty selection", + html: `
    Test
    `, + setup: func(doc *goquery.Document) candidateList { + candidates := make(candidateList) + + // Add a regular candidate + divSelection := doc.Find("div") + divCandidate := scoreNode(divSelection) + candidates[divSelection.Get(0)] = divCandidate + + // Add a candidate with empty selection (this is artificial but tests the edge case) + emptySelection := doc.Find("nonexistent") + emptyCandidate := &candidate{selection: emptySelection, score: 0} + // We can't use emptySelection.Get(0) as key since it would panic, + // so we'll create a dummy node for this test + dummyNode := &html.Node{Type: html.ElementNode, Data: "dummy"} + candidates[dummyNode] = emptyCandidate + + return candidates + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html)) + if err != nil { + t.Fatalf("Failed to parse HTML: %v", err) + } + + candidates := tc.setup(doc) + result := candidates.String() + + if tc.name == "empty candidate list" { + if result != tc.expected { + t.Errorf("Expected: %s, Got: %s", tc.expected, result) + } + return + } + + // For multiple candidates, we need to check that all expected parts are present + // since map iteration order is not guaranteed + switch tc.name { + case "multiple candidates": + expectedParts := []string{"div.content => 30.000000", "p.text => 25.000000", "h1#main => 20.000000"} + for _, part := range expectedParts { + if !strings.Contains(result, part) { + t.Errorf("Expected result to contain: %s, Got: %s", part, result) + } + } + // Check that it's comma-separated + if !strings.Contains(result, ", ") { + t.Errorf("Expected comma-separated format, Got: %s", result) + } + case "candidates with mixed scores": + expectedParts := []string{"div.comment => -20.000000", "p.content => 25.000000"} + for _, part := range expectedParts { + if !strings.Contains(result, part) { + t.Errorf("Expected result to contain: %s, Got: %s", part, result) + } + } + case "candidate with empty selection": + // Should contain both the regular candidate and the empty one + if !strings.Contains(result, "div => 5.000000") { + t.Errorf("Expected result to contain div candidate, Got: %s", result) + } + if !strings.Contains(result, "empty => 0.000000") { + t.Errorf("Expected result to contain empty candidate, Got: %s", result) + } + default: + // Single candidate test cases + if result != tc.expected { + t.Errorf("Expected: %s, Got: %s", tc.expected, result) + } + } + }) + } +} + +func TestCandidateStringEdgeCases(t *testing.T) { + t.Run("candidate with nil node but valid selection", func(t *testing.T) { + // This tests the case where Node() returns nil but selection exists + html := `
    Test
    ` + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + t.Fatal(err) + } + + emptySelection := doc.Find("nonexistent") + candidate := &candidate{ + selection: emptySelection, + score: 10.5, + } + + result := candidate.String() + expected := "empty => 10.500000" + + if result != expected { + t.Errorf("Expected: %s, Got: %s", expected, result) + } + }) + + t.Run("candidate with zero score", func(t *testing.T) { + html := `
    Test
    ` + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + t.Fatal(err) + } + + selection := doc.Find("div") + candidate := &candidate{ + selection: selection, + score: 0, + } + + result := candidate.String() + expected := "div => 0.000000" + + if result != expected { + t.Errorf("Expected: %s, Got: %s", expected, result) + } + }) + + t.Run("candidate with negative score", func(t *testing.T) { + html := `

    Test

    ` + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + t.Fatal(err) + } + + selection := doc.Find("h1") + candidate := &candidate{ + selection: selection, + score: -10.5, + } + + result := candidate.String() + expected := "h1 => -10.500000" + + if result != expected { + t.Errorf("Expected: %s, Got: %s", expected, result) + } + }) + + t.Run("candidate with very long class and id", func(t *testing.T) { + html := `
    Test
    ` + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + t.Fatal(err) + } + + selection := doc.Find("div") + candidate := scoreNode(selection) + + result := candidate.String() + expected := "div#very-long-id-name-that-might-also-cause-formatting-issues.very-long-class-name-that-might-cause-issues => 5.000000" + + if result != expected { + t.Errorf("Expected: %s, Got: %s", expected, result) + } + }) +}