1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-06 17:41:00 +00:00

refactor(readability): minor clean up

Remove a now-useless regex and its associated test.
This commit is contained in:
jvoisin 2025-07-02 14:42:57 +02:00 committed by Frédéric Guillot
parent 766d4ab834
commit 69a74c4abf
2 changed files with 1 addition and 152 deletions

View file

@ -7,7 +7,6 @@ import (
"fmt"
"io"
"log/slog"
"regexp"
"strings"
"miniflux.app/v2/internal/urllib"
@ -16,13 +15,9 @@ import (
"golang.org/x/net/html"
)
const (
defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
)
const defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
var (
divToPElementsRegexp = regexp.MustCompile(`(?i)<(?:a|blockquote|dl|div|img|ol|p|pre|table|ul)[ />]`)
strongCandidates = [...]string{"popupbody", "-ad", "g-plus"}
maybeCandidate = [...]string{"and", "article", "body", "column", "main", "shadow"}
unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"}

View file

@ -1880,152 +1880,6 @@ func TestTransformMisusedDivsIntoParagraphs(t *testing.T) {
}
}
// TestTransformMisusedDivsIntoParagraphsRegexPattern runs divToPElementsRegexp
// directly against small HTML snippets, one subtest per snippet, and checks
// whether the pattern reports a match.
//
// The table expects a match for a, blockquote, dl, div, img, ol, p, pre, table
// and ul opening tags (including self-closing, attribute-bearing, and
// upper/mixed-case spellings), and no match for span, em, strong, i, b, plain
// text, or the empty string.
func TestTransformMisusedDivsIntoParagraphsRegexPattern(t *testing.T) {
	type patternCase struct {
		name        string // subtest name
		html        string // snippet handed to the pattern
		shouldMatch bool   // expected MatchString result
		description string // human-readable expectation, printed on failure
	}

	cases := []patternCase{
		// Opening tags the pattern is expected to recognise.
		{"anchor tag", `<a href="#">link</a>`, true, "should match anchor tags"},
		{"blockquote tag", `<blockquote>quote</blockquote>`, true, "should match blockquote tags"},
		{"dl tag", `<dl><dt>term</dt></dl>`, true, "should match dl tags"},
		{"div tag", `<div>content</div>`, true, "should match div tags"},
		{"img tag", `<img src="test.jpg">`, true, "should match img tags"},
		{"ol tag", `<ol><li>item</li></ol>`, true, "should match ol tags"},
		{"p tag", `<p>paragraph</p>`, true, "should match p tags"},
		{"pre tag", `<pre>code</pre>`, true, "should match pre tags"},
		{"table tag", `<table><tr></tr></table>`, true, "should match table tags"},
		{"ul tag", `<ul><li>item</li></ul>`, true, "should match ul tags"},

		// Syntactic variations of a recognised tag.
		{"self-closing anchor", `<a/>`, true, "should match self-closing anchor tags"},
		{"tag with attributes", `<a href="#" class="link">text</a>`, true, "should match tags with attributes"},
		{"uppercase tags", `<A href="#">link</A>`, true, "should be case insensitive"},
		{"mixed case tags", `<Img src="test.jpg">`, true, "should match mixed case tags"},

		// Inputs the pattern is expected to ignore. Several of these tag names
		// start with the same letter as a recognised tag ("b", "i", "span").
		{"span tag", `<span>text</span>`, false, "should NOT match span tags"},
		{"em tag", `<em>emphasis</em>`, false, "should NOT match em tags"},
		{"strong tag", `<strong>bold</strong>`, false, "should NOT match strong tags"},
		{"i tag", `<i>italic</i>`, false, "should NOT match i tags"},
		{"b tag", `<b>bold</b>`, false, "should NOT match b tags"},
		{"plain text", `just plain text`, false, "should NOT match plain text"},
		{"empty string", ``, false, "should NOT match empty string"},
	}

	for i := range cases {
		c := cases[i] // per-iteration copy for the closure below
		t.Run(c.name, func(t *testing.T) {
			if got := divToPElementsRegexp.MatchString(c.html); got != c.shouldMatch {
				t.Errorf("%s\nHTML: %s\nExpected match: %v, Got: %v", c.description, c.html, c.shouldMatch, got)
			}
		})
	}
}
func TestTransformMisusedDivsIntoParagraphsEdgeCases(t *testing.T) {
t.Run("document with no divs", func(t *testing.T) {
html := `<html><body><p>No divs here</p><span>Just other elements</span></body></html>`