mirror of
https://github.com/miniflux/v2.git
synced 2025-08-06 17:41:00 +00:00
refactor(readability): minor clean up
Remove a now-useless regex and its associated test.
This commit is contained in:
parent
766d4ab834
commit
69a74c4abf
2 changed files with 1 addition and 152 deletions
|
@@ -7,7 +7,6 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"regexp"
|
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"miniflux.app/v2/internal/urllib"
|
"miniflux.app/v2/internal/urllib"
|
||||||
|
@@ -16,13 +15,9 @@ import (
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
|
||||||
defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
|
|
||||||
)
|
|
||||||
|
|
||||||
var (
|
var (
|
||||||
divToPElementsRegexp = regexp.MustCompile(`(?i)<(?:a|blockquote|dl|div|img|ol|p|pre|table|ul)[ />]`)
|
|
||||||
|
|
||||||
strongCandidates = [...]string{"popupbody", "-ad", "g-plus"}
|
strongCandidates = [...]string{"popupbody", "-ad", "g-plus"}
|
||||||
maybeCandidate = [...]string{"and", "article", "body", "column", "main", "shadow"}
|
maybeCandidate = [...]string{"and", "article", "body", "column", "main", "shadow"}
|
||||||
unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"}
|
unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"}
|
||||||
|
|
|
@@ -1880,152 +1880,6 @@ func TestTransformMisusedDivsIntoParagraphs(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestTransformMisusedDivsIntoParagraphsRegexPattern(t *testing.T) {
|
|
||||||
// Test the regex pattern directly to ensure it matches the expected elements
|
|
||||||
testCases := []struct {
|
|
||||||
name string
|
|
||||||
html string
|
|
||||||
shouldMatch bool
|
|
||||||
description string
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
name: "anchor tag",
|
|
||||||
html: `<a href="#">link</a>`,
|
|
||||||
shouldMatch: true,
|
|
||||||
description: "should match anchor tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "blockquote tag",
|
|
||||||
html: `<blockquote>quote</blockquote>`,
|
|
||||||
shouldMatch: true,
|
|
||||||
description: "should match blockquote tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "dl tag",
|
|
||||||
html: `<dl><dt>term</dt></dl>`,
|
|
||||||
shouldMatch: true,
|
|
||||||
description: "should match dl tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "div tag",
|
|
||||||
html: `<div>content</div>`,
|
|
||||||
shouldMatch: true,
|
|
||||||
description: "should match div tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "img tag",
|
|
||||||
html: `<img src="test.jpg">`,
|
|
||||||
shouldMatch: true,
|
|
||||||
description: "should match img tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "ol tag",
|
|
||||||
html: `<ol><li>item</li></ol>`,
|
|
||||||
shouldMatch: true,
|
|
||||||
description: "should match ol tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "p tag",
|
|
||||||
html: `<p>paragraph</p>`,
|
|
||||||
shouldMatch: true,
|
|
||||||
description: "should match p tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "pre tag",
|
|
||||||
html: `<pre>code</pre>`,
|
|
||||||
shouldMatch: true,
|
|
||||||
description: "should match pre tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "table tag",
|
|
||||||
html: `<table><tr></tr></table>`,
|
|
||||||
shouldMatch: true,
|
|
||||||
description: "should match table tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "ul tag",
|
|
||||||
html: `<ul><li>item</li></ul>`,
|
|
||||||
shouldMatch: true,
|
|
||||||
description: "should match ul tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "self-closing anchor",
|
|
||||||
html: `<a/>`,
|
|
||||||
shouldMatch: true,
|
|
||||||
description: "should match self-closing anchor tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "tag with attributes",
|
|
||||||
html: `<a href="#" class="link">text</a>`,
|
|
||||||
shouldMatch: true,
|
|
||||||
description: "should match tags with attributes",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "uppercase tags",
|
|
||||||
html: `<A href="#">link</A>`,
|
|
||||||
shouldMatch: true,
|
|
||||||
description: "should be case insensitive",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "mixed case tags",
|
|
||||||
html: `<Img src="test.jpg">`,
|
|
||||||
shouldMatch: true,
|
|
||||||
description: "should match mixed case tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "span tag",
|
|
||||||
html: `<span>text</span>`,
|
|
||||||
shouldMatch: false,
|
|
||||||
description: "should NOT match span tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "em tag",
|
|
||||||
html: `<em>emphasis</em>`,
|
|
||||||
shouldMatch: false,
|
|
||||||
description: "should NOT match em tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "strong tag",
|
|
||||||
html: `<strong>bold</strong>`,
|
|
||||||
shouldMatch: false,
|
|
||||||
description: "should NOT match strong tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "i tag",
|
|
||||||
html: `<i>italic</i>`,
|
|
||||||
shouldMatch: false,
|
|
||||||
description: "should NOT match i tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "b tag",
|
|
||||||
html: `<b>bold</b>`,
|
|
||||||
shouldMatch: false,
|
|
||||||
description: "should NOT match b tags",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "plain text",
|
|
||||||
html: `just plain text`,
|
|
||||||
shouldMatch: false,
|
|
||||||
description: "should NOT match plain text",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "empty string",
|
|
||||||
html: ``,
|
|
||||||
shouldMatch: false,
|
|
||||||
description: "should NOT match empty string",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tc := range testCases {
|
|
||||||
t.Run(tc.name, func(t *testing.T) {
|
|
||||||
result := divToPElementsRegexp.MatchString(tc.html)
|
|
||||||
if result != tc.shouldMatch {
|
|
||||||
t.Errorf("%s\nHTML: %s\nExpected match: %v, Got: %v", tc.description, tc.html, tc.shouldMatch, result)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestTransformMisusedDivsIntoParagraphsEdgeCases(t *testing.T) {
|
func TestTransformMisusedDivsIntoParagraphsEdgeCases(t *testing.T) {
|
||||||
t.Run("document with no divs", func(t *testing.T) {
|
t.Run("document with no divs", func(t *testing.T) {
|
||||||
html := `<html><body><p>No divs here</p><span>Just other elements</span></body></html>`
|
html := `<html><body><p>No divs here</p><span>Just other elements</span></body></html>`
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue