diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go index 42560886..83d227aa 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -7,7 +7,6 @@ import ( "fmt" "io" "log/slog" - "regexp" "strings" "miniflux.app/v2/internal/urllib" @@ -16,13 +15,9 @@ import ( "golang.org/x/net/html" ) -const ( - defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div" -) +const defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div" var ( - divToPElementsRegexp = regexp.MustCompile(`(?i)<(?:a|blockquote|dl|div|img|ol|p|pre|table|ul)[ />]`) - strongCandidates = [...]string{"popupbody", "-ad", "g-plus"} maybeCandidate = [...]string{"and", "article", "body", "column", "main", "shadow"} unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"} diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go index 14e099cf..6c111f08 100644 --- a/internal/reader/readability/readability_test.go +++ b/internal/reader/readability/readability_test.go @@ -1880,152 +1880,6 @@ func TestTransformMisusedDivsIntoParagraphs(t *testing.T) { } } -func TestTransformMisusedDivsIntoParagraphsRegexPattern(t *testing.T) { - // Test the regex pattern directly to ensure it matches the expected elements - testCases := []struct { - name string - html string - shouldMatch bool - description string - }{ - { - name: "anchor tag", - html: `link`, - shouldMatch: true, - description: "should match anchor tags", - }, - { - name: "blockquote tag", - html: `
quote
`, - shouldMatch: true, - description: "should match blockquote tags", - }, - { - name: "dl tag", - html: `
term
`, - shouldMatch: true, - description: "should match dl tags", - }, - { - name: "div tag", - html: `
content
`, - shouldMatch: true, - description: "should match div tags", - }, - { - name: "img tag", - html: ``, - shouldMatch: true, - description: "should match img tags", - }, - { - name: "ol tag", - html: `
  1. item
`, - shouldMatch: true, - description: "should match ol tags", - }, - { - name: "p tag", - html: `

paragraph

`, - shouldMatch: true, - description: "should match p tags", - }, - { - name: "pre tag", - html: `
code
`, - shouldMatch: true, - description: "should match pre tags", - }, - { - name: "table tag", - html: `
`, - shouldMatch: true, - description: "should match table tags", - }, - { - name: "ul tag", - html: ``, - shouldMatch: true, - description: "should match ul tags", - }, - { - name: "self-closing anchor", - html: ``, - shouldMatch: true, - description: "should match self-closing anchor tags", - }, - { - name: "tag with attributes", - html: `text`, - shouldMatch: true, - description: "should match tags with attributes", - }, - { - name: "uppercase tags", - html: `link`, - shouldMatch: true, - description: "should be case insensitive", - }, - { - name: "mixed case tags", - html: ``, - shouldMatch: true, - description: "should match mixed case tags", - }, - { - name: "span tag", - html: `text`, - shouldMatch: false, - description: "should NOT match span tags", - }, - { - name: "em tag", - html: `emphasis`, - shouldMatch: false, - description: "should NOT match em tags", - }, - { - name: "strong tag", - html: `bold`, - shouldMatch: false, - description: "should NOT match strong tags", - }, - { - name: "i tag", - html: `italic`, - shouldMatch: false, - description: "should NOT match i tags", - }, - { - name: "b tag", - html: `bold`, - shouldMatch: false, - description: "should NOT match b tags", - }, - { - name: "plain text", - html: `just plain text`, - shouldMatch: false, - description: "should NOT match plain text", - }, - { - name: "empty string", - html: ``, - shouldMatch: false, - description: "should NOT match empty string", - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - result := divToPElementsRegexp.MatchString(tc.html) - if result != tc.shouldMatch { - t.Errorf("%s\nHTML: %s\nExpected match: %v, Got: %v", tc.description, tc.html, tc.shouldMatch, result) - } - }) - } -} - func TestTransformMisusedDivsIntoParagraphsEdgeCases(t *testing.T) { t.Run("document with no divs", func(t *testing.T) { html := `

No divs here

Just other elements`