mirror of
https://github.com/miniflux/v2.git
synced 2025-08-06 17:41:00 +00:00
refactor(readability): minor clean up
Remove a now-useless regex and its associated test.
This commit is contained in:
parent
766d4ab834
commit
69a74c4abf
2 changed files with 1 addition and 152 deletions
|
@@ -7,7 +7,6 @@ import (
|
|||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"miniflux.app/v2/internal/urllib"
|
||||
|
@@ -16,13 +15,9 @@ import (
|
|||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
const (
|
||||
defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
|
||||
)
|
||||
const defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
|
||||
|
||||
var (
|
||||
divToPElementsRegexp = regexp.MustCompile(`(?i)<(?:a|blockquote|dl|div|img|ol|p|pre|table|ul)[ />]`)
|
||||
|
||||
strongCandidates = [...]string{"popupbody", "-ad", "g-plus"}
|
||||
maybeCandidate = [...]string{"and", "article", "body", "column", "main", "shadow"}
|
||||
unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"}
|
||||
|
|
|
@@ -1880,152 +1880,6 @@ func TestTransformMisusedDivsIntoParagraphs(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestTransformMisusedDivsIntoParagraphsRegexPattern(t *testing.T) {
|
||||
// Test the regex pattern directly to ensure it matches the expected elements
|
||||
testCases := []struct {
|
||||
name string
|
||||
html string
|
||||
shouldMatch bool
|
||||
description string
|
||||
}{
|
||||
{
|
||||
name: "anchor tag",
|
||||
html: `<a href="#">link</a>`,
|
||||
shouldMatch: true,
|
||||
description: "should match anchor tags",
|
||||
},
|
||||
{
|
||||
name: "blockquote tag",
|
||||
html: `<blockquote>quote</blockquote>`,
|
||||
shouldMatch: true,
|
||||
description: "should match blockquote tags",
|
||||
},
|
||||
{
|
||||
name: "dl tag",
|
||||
html: `<dl><dt>term</dt></dl>`,
|
||||
shouldMatch: true,
|
||||
description: "should match dl tags",
|
||||
},
|
||||
{
|
||||
name: "div tag",
|
||||
html: `<div>content</div>`,
|
||||
shouldMatch: true,
|
||||
description: "should match div tags",
|
||||
},
|
||||
{
|
||||
name: "img tag",
|
||||
html: `<img src="test.jpg">`,
|
||||
shouldMatch: true,
|
||||
description: "should match img tags",
|
||||
},
|
||||
{
|
||||
name: "ol tag",
|
||||
html: `<ol><li>item</li></ol>`,
|
||||
shouldMatch: true,
|
||||
description: "should match ol tags",
|
||||
},
|
||||
{
|
||||
name: "p tag",
|
||||
html: `<p>paragraph</p>`,
|
||||
shouldMatch: true,
|
||||
description: "should match p tags",
|
||||
},
|
||||
{
|
||||
name: "pre tag",
|
||||
html: `<pre>code</pre>`,
|
||||
shouldMatch: true,
|
||||
description: "should match pre tags",
|
||||
},
|
||||
{
|
||||
name: "table tag",
|
||||
html: `<table><tr></tr></table>`,
|
||||
shouldMatch: true,
|
||||
description: "should match table tags",
|
||||
},
|
||||
{
|
||||
name: "ul tag",
|
||||
html: `<ul><li>item</li></ul>`,
|
||||
shouldMatch: true,
|
||||
description: "should match ul tags",
|
||||
},
|
||||
{
|
||||
name: "self-closing anchor",
|
||||
html: `<a/>`,
|
||||
shouldMatch: true,
|
||||
description: "should match self-closing anchor tags",
|
||||
},
|
||||
{
|
||||
name: "tag with attributes",
|
||||
html: `<a href="#" class="link">text</a>`,
|
||||
shouldMatch: true,
|
||||
description: "should match tags with attributes",
|
||||
},
|
||||
{
|
||||
name: "uppercase tags",
|
||||
html: `<A href="#">link</A>`,
|
||||
shouldMatch: true,
|
||||
description: "should be case insensitive",
|
||||
},
|
||||
{
|
||||
name: "mixed case tags",
|
||||
html: `<Img src="test.jpg">`,
|
||||
shouldMatch: true,
|
||||
description: "should match mixed case tags",
|
||||
},
|
||||
{
|
||||
name: "span tag",
|
||||
html: `<span>text</span>`,
|
||||
shouldMatch: false,
|
||||
description: "should NOT match span tags",
|
||||
},
|
||||
{
|
||||
name: "em tag",
|
||||
html: `<em>emphasis</em>`,
|
||||
shouldMatch: false,
|
||||
description: "should NOT match em tags",
|
||||
},
|
||||
{
|
||||
name: "strong tag",
|
||||
html: `<strong>bold</strong>`,
|
||||
shouldMatch: false,
|
||||
description: "should NOT match strong tags",
|
||||
},
|
||||
{
|
||||
name: "i tag",
|
||||
html: `<i>italic</i>`,
|
||||
shouldMatch: false,
|
||||
description: "should NOT match i tags",
|
||||
},
|
||||
{
|
||||
name: "b tag",
|
||||
html: `<b>bold</b>`,
|
||||
shouldMatch: false,
|
||||
description: "should NOT match b tags",
|
||||
},
|
||||
{
|
||||
name: "plain text",
|
||||
html: `just plain text`,
|
||||
shouldMatch: false,
|
||||
description: "should NOT match plain text",
|
||||
},
|
||||
{
|
||||
name: "empty string",
|
||||
html: ``,
|
||||
shouldMatch: false,
|
||||
description: "should NOT match empty string",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
result := divToPElementsRegexp.MatchString(tc.html)
|
||||
if result != tc.shouldMatch {
|
||||
t.Errorf("%s\nHTML: %s\nExpected match: %v, Got: %v", tc.description, tc.html, tc.shouldMatch, result)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestTransformMisusedDivsIntoParagraphsEdgeCases(t *testing.T) {
|
||||
t.Run("document with no divs", func(t *testing.T) {
|
||||
html := `<html><body><p>No divs here</p><span>Just other elements</span></body></html>`
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue