mirror of
https://github.com/miniflux/v2.git
synced 2025-06-27 16:36:00 +00:00
refactor(readability): various improvements and optimizations
- Replace a completely overkill regex with plain string checks
- Use `.Remove()` instead of a hand-rolled loop
- Use a `strings.Builder` instead of `bytes.NewBufferString`
- Replace a call to `Fprintf` with string concatenation, as the latter is much faster
- Remove a superfluous cast
- Delay some computations until they are actually needed
- Add some tests
This commit is contained in:
parent
113abeea59
commit
6ad5ad0bb2
2 changed files with 84 additions and 29 deletions
|
@ -4,7 +4,6 @@
|
||||||
package readability // import "miniflux.app/v2/internal/reader/readability"
|
package readability // import "miniflux.app/v2/internal/reader/readability"
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
@ -23,7 +22,6 @@ const (
|
||||||
|
|
||||||
var (
|
var (
|
||||||
divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
|
divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
|
||||||
sentenceRegexp = regexp.MustCompile(`\.( |$)`)
|
|
||||||
|
|
||||||
blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`)
|
blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`)
|
||||||
okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
|
okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
|
||||||
|
@ -84,7 +82,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
|
||||||
}
|
}
|
||||||
|
|
||||||
document.Find("script,style").Each(func(i int, s *goquery.Selection) {
|
document.Find("script,style").Each(func(i int, s *goquery.Selection) {
|
||||||
removeNodes(s)
|
s.Remove()
|
||||||
})
|
})
|
||||||
|
|
||||||
transformMisusedDivsIntoParagraphs(document)
|
transformMisusedDivsIntoParagraphs(document)
|
||||||
|
@ -106,7 +104,8 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
|
||||||
// Now that we have the top candidate, look through its siblings for content that might also be related.
|
// Now that we have the top candidate, look through its siblings for content that might also be related.
|
||||||
// Things like preambles, content split by ads that we removed, etc.
|
// Things like preambles, content split by ads that we removed, etc.
|
||||||
func getArticle(topCandidate *candidate, candidates candidateList) string {
|
func getArticle(topCandidate *candidate, candidates candidateList) string {
|
||||||
output := bytes.NewBufferString("<div>")
|
var output strings.Builder
|
||||||
|
output.WriteString("<div>")
|
||||||
siblingScoreThreshold := max(10, topCandidate.score*.2)
|
siblingScoreThreshold := max(10, topCandidate.score*.2)
|
||||||
|
|
||||||
topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) {
|
topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) {
|
||||||
|
@ -124,10 +123,14 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
|
||||||
content := s.Text()
|
content := s.Text()
|
||||||
contentLength := len(content)
|
contentLength := len(content)
|
||||||
|
|
||||||
if contentLength >= 80 && linkDensity < .25 {
|
if contentLength >= 80 {
|
||||||
append = true
|
if linkDensity < .25 {
|
||||||
} else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
|
append = true
|
||||||
append = true
|
}
|
||||||
|
} else {
|
||||||
|
if linkDensity == 0 && containsSentence(content) {
|
||||||
|
append = true
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -138,7 +141,7 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
|
||||||
}
|
}
|
||||||
|
|
||||||
html, _ := s.Html()
|
html, _ := s.Html()
|
||||||
fmt.Fprintf(output, "<%s>%s</%s>", tag, html, tag)
|
output.WriteString("<" + tag + ">" + html + "</" + tag + ">")
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
@ -156,9 +159,9 @@ func removeUnlikelyCandidates(document *goquery.Document) {
|
||||||
str := strings.ToLower(class + id)
|
str := strings.ToLower(class + id)
|
||||||
|
|
||||||
if blacklistCandidatesRegexp.MatchString(str) {
|
if blacklistCandidatesRegexp.MatchString(str) {
|
||||||
removeNodes(s)
|
s.Remove()
|
||||||
} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
|
} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
|
||||||
removeNodes(s)
|
s.Remove()
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -222,7 +225,7 @@ func getCandidates(document *goquery.Document) candidateList {
|
||||||
contentScore += float32(strings.Count(text, ",") + 1)
|
contentScore += float32(strings.Count(text, ",") + 1)
|
||||||
|
|
||||||
// For every 100 characters in this paragraph, add another point. Up to 3 points.
|
// For every 100 characters in this paragraph, add another point. Up to 3 points.
|
||||||
contentScore += float32(min(int(len(text)/100.0), 3))
|
contentScore += float32(min(len(text)/100.0, 3))
|
||||||
|
|
||||||
candidates[parentNode].score += contentScore
|
candidates[parentNode].score += contentScore
|
||||||
if grandParentNode != nil {
|
if grandParentNode != nil {
|
||||||
|
@ -261,13 +264,14 @@ func scoreNode(s *goquery.Selection) *candidate {
|
||||||
// Get the density of links as a percentage of the content
|
// Get the density of links as a percentage of the content
|
||||||
// This is the amount of text that is inside a link divided by the total text in the node.
|
// This is the amount of text that is inside a link divided by the total text in the node.
|
||||||
func getLinkDensity(s *goquery.Selection) float32 {
|
func getLinkDensity(s *goquery.Selection) float32 {
|
||||||
linkLength := len(s.Find("a").Text())
|
|
||||||
textLength := len(s.Text())
|
textLength := len(s.Text())
|
||||||
|
|
||||||
if textLength == 0 {
|
if textLength == 0 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
linkLength := len(s.Find("a").Text())
|
||||||
|
|
||||||
return float32(linkLength) / float32(textLength)
|
return float32(linkLength) / float32(textLength)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -278,25 +282,20 @@ func getClassWeight(s *goquery.Selection) float32 {
|
||||||
class, _ := s.Attr("class")
|
class, _ := s.Attr("class")
|
||||||
id, _ := s.Attr("id")
|
id, _ := s.Attr("id")
|
||||||
|
|
||||||
class = strings.ToLower(class)
|
|
||||||
id = strings.ToLower(id)
|
|
||||||
|
|
||||||
if class != "" {
|
if class != "" {
|
||||||
|
class = strings.ToLower(class)
|
||||||
if negativeRegexp.MatchString(class) {
|
if negativeRegexp.MatchString(class) {
|
||||||
weight -= 25
|
weight -= 25
|
||||||
}
|
} else if positiveRegexp.MatchString(class) {
|
||||||
|
|
||||||
if positiveRegexp.MatchString(class) {
|
|
||||||
weight += 25
|
weight += 25
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if id != "" {
|
if id != "" {
|
||||||
|
id = strings.ToLower(id)
|
||||||
if negativeRegexp.MatchString(id) {
|
if negativeRegexp.MatchString(id) {
|
||||||
weight -= 25
|
weight -= 25
|
||||||
}
|
} else if positiveRegexp.MatchString(id) {
|
||||||
|
|
||||||
if positiveRegexp.MatchString(id) {
|
|
||||||
weight += 25
|
weight += 25
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -314,11 +313,6 @@ func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func removeNodes(s *goquery.Selection) {
|
func containsSentence(content string) bool {
|
||||||
s.Each(func(i int, s *goquery.Selection) {
|
return strings.HasSuffix(content, ".") || strings.Contains(content, ". ")
|
||||||
parent := s.Parent()
|
|
||||||
if parent.Length() > 0 {
|
|
||||||
parent.Get(0).RemoveChild(s.Get(0))
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -100,3 +100,64 @@ func TestWithoutBaseURL(t *testing.T) {
|
||||||
t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
|
t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRemoveStyleScript(t *testing.T) {
|
||||||
|
html := `
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Test</title>
|
||||||
|
<script src="tololo.js"></script>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<script src="tololo.js"></script>
|
||||||
|
<style>
|
||||||
|
h1 {color:red;}
|
||||||
|
p {color:blue;}
|
||||||
|
</style>
|
||||||
|
<article>Some content</article>
|
||||||
|
</body>
|
||||||
|
</html>`
|
||||||
|
want := `<div><div><article>Somecontent</article></div></div>`
|
||||||
|
|
||||||
|
_, content, err := ExtractContent(strings.NewReader(html))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
content = strings.ReplaceAll(content, "\n", "")
|
||||||
|
content = strings.ReplaceAll(content, " ", "")
|
||||||
|
content = strings.ReplaceAll(content, "\t", "")
|
||||||
|
|
||||||
|
if content != want {
|
||||||
|
t.Errorf(`Invalid content, got %s instead of %s`, content, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRemoveBlacklist(t *testing.T) {
|
||||||
|
html := `
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Test</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<article class="super-ad">Some content</article>
|
||||||
|
<article class="g-plus-crap">Some other thing</article>
|
||||||
|
<article class="stuff popupbody">And more</article>
|
||||||
|
<article class="legit">Valid!</article>
|
||||||
|
</body>
|
||||||
|
</html>`
|
||||||
|
want := `<div><div><articleclass="legit">Valid!</article></div></div>`
|
||||||
|
|
||||||
|
_, content, err := ExtractContent(strings.NewReader(html))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
content = strings.ReplaceAll(content, "\n", "")
|
||||||
|
content = strings.ReplaceAll(content, " ", "")
|
||||||
|
content = strings.ReplaceAll(content, "\t", "")
|
||||||
|
|
||||||
|
if content != want {
|
||||||
|
t.Errorf(`Invalid content, got %s instead of %s`, content, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue