1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00

refactor(readability): various improvements and optimizations

- Replace a completely overkill regex
- Use `.Remove()` instead of a hand-rolled loop
- Use a strings.Builder instead of a bytes.NewBufferString
- Replace a call to Fprintf with string concatenation, as the latter is much
  faster
- Remove a superfluous cast
- Delay some computations
- Add some tests
This commit is contained in:
Julien Voisin 2024-12-13 04:41:56 +00:00 committed by GitHub
parent 113abeea59
commit 6ad5ad0bb2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 84 additions and 29 deletions

View file

@ -4,7 +4,6 @@
package readability // import "miniflux.app/v2/internal/reader/readability" package readability // import "miniflux.app/v2/internal/reader/readability"
import ( import (
"bytes"
"fmt" "fmt"
"io" "io"
"log/slog" "log/slog"
@ -23,7 +22,6 @@ const (
var ( var (
divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`) divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
sentenceRegexp = regexp.MustCompile(`\.( |$)`)
blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`) blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`)
okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`) okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
@ -84,7 +82,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
} }
document.Find("script,style").Each(func(i int, s *goquery.Selection) { document.Find("script,style").Each(func(i int, s *goquery.Selection) {
removeNodes(s) s.Remove()
}) })
transformMisusedDivsIntoParagraphs(document) transformMisusedDivsIntoParagraphs(document)
@ -106,7 +104,8 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
// Now that we have the top candidate, look through its siblings for content that might also be related. // Now that we have the top candidate, look through its siblings for content that might also be related.
// Things like preambles, content split by ads that we removed, etc. // Things like preambles, content split by ads that we removed, etc.
func getArticle(topCandidate *candidate, candidates candidateList) string { func getArticle(topCandidate *candidate, candidates candidateList) string {
output := bytes.NewBufferString("<div>") var output strings.Builder
output.WriteString("<div>")
siblingScoreThreshold := max(10, topCandidate.score*.2) siblingScoreThreshold := max(10, topCandidate.score*.2)
topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) { topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) {
@ -124,10 +123,14 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
content := s.Text() content := s.Text()
contentLength := len(content) contentLength := len(content)
if contentLength >= 80 && linkDensity < .25 { if contentLength >= 80 {
append = true if linkDensity < .25 {
} else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) { append = true
append = true }
} else {
if linkDensity == 0 && containsSentence(content) {
append = true
}
} }
} }
@ -138,7 +141,7 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
} }
html, _ := s.Html() html, _ := s.Html()
fmt.Fprintf(output, "<%s>%s</%s>", tag, html, tag) output.WriteString("<" + tag + ">" + html + "</" + tag + ">")
} }
}) })
@ -156,9 +159,9 @@ func removeUnlikelyCandidates(document *goquery.Document) {
str := strings.ToLower(class + id) str := strings.ToLower(class + id)
if blacklistCandidatesRegexp.MatchString(str) { if blacklistCandidatesRegexp.MatchString(str) {
removeNodes(s) s.Remove()
} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) { } else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
removeNodes(s) s.Remove()
} }
}) })
} }
@ -222,7 +225,7 @@ func getCandidates(document *goquery.Document) candidateList {
contentScore += float32(strings.Count(text, ",") + 1) contentScore += float32(strings.Count(text, ",") + 1)
// For every 100 characters in this paragraph, add another point. Up to 3 points. // For every 100 characters in this paragraph, add another point. Up to 3 points.
contentScore += float32(min(int(len(text)/100.0), 3)) contentScore += float32(min(len(text)/100.0, 3))
candidates[parentNode].score += contentScore candidates[parentNode].score += contentScore
if grandParentNode != nil { if grandParentNode != nil {
@ -261,13 +264,14 @@ func scoreNode(s *goquery.Selection) *candidate {
// Get the density of links as a percentage of the content // Get the density of links as a percentage of the content
// This is the amount of text that is inside a link divided by the total text in the node. // This is the amount of text that is inside a link divided by the total text in the node.
func getLinkDensity(s *goquery.Selection) float32 { func getLinkDensity(s *goquery.Selection) float32 {
linkLength := len(s.Find("a").Text())
textLength := len(s.Text()) textLength := len(s.Text())
if textLength == 0 { if textLength == 0 {
return 0 return 0
} }
linkLength := len(s.Find("a").Text())
return float32(linkLength) / float32(textLength) return float32(linkLength) / float32(textLength)
} }
@ -278,25 +282,20 @@ func getClassWeight(s *goquery.Selection) float32 {
class, _ := s.Attr("class") class, _ := s.Attr("class")
id, _ := s.Attr("id") id, _ := s.Attr("id")
class = strings.ToLower(class)
id = strings.ToLower(id)
if class != "" { if class != "" {
class = strings.ToLower(class)
if negativeRegexp.MatchString(class) { if negativeRegexp.MatchString(class) {
weight -= 25 weight -= 25
} } else if positiveRegexp.MatchString(class) {
if positiveRegexp.MatchString(class) {
weight += 25 weight += 25
} }
} }
if id != "" { if id != "" {
id = strings.ToLower(id)
if negativeRegexp.MatchString(id) { if negativeRegexp.MatchString(id) {
weight -= 25 weight -= 25
} } else if positiveRegexp.MatchString(id) {
if positiveRegexp.MatchString(id) {
weight += 25 weight += 25
} }
} }
@ -314,11 +313,6 @@ func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
}) })
} }
func removeNodes(s *goquery.Selection) { func containsSentence(content string) bool {
s.Each(func(i int, s *goquery.Selection) { return strings.HasSuffix(content, ".") || strings.Contains(content, ". ")
parent := s.Parent()
if parent.Length() > 0 {
parent.Get(0).RemoveChild(s.Get(0))
}
})
} }

View file

@ -100,3 +100,64 @@ func TestWithoutBaseURL(t *testing.T) {
t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL) t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
} }
} }
// TestRemoveStyleScript feeds ExtractContent a page containing <script> and
// <style> elements (in both head and body) and verifies that only the
// <article> content is present in the extracted output.
func TestRemoveStyleScript(t *testing.T) {
	const page = `
<html>
<head>
<title>Test</title>
<script src="tololo.js"></script>
</head>
<body>
<script src="tololo.js"></script>
<style>
h1 {color:red;}
p {color:blue;}
</style>
<article>Some content</article>
</body>
</html>`
	const expected = `<div><div><article>Somecontent</article></div></div>`

	_, got, err := ExtractContent(strings.NewReader(page))
	if err != nil {
		t.Fatal(err)
	}

	// Drop newlines, spaces and tabs so the comparison ignores formatting;
	// this is why the expected value reads "Somecontent".
	got = strings.NewReplacer("\n", "", " ", "", "\t", "").Replace(got)
	if got != expected {
		t.Errorf(`Invalid content, got %s instead of %s`, got, expected)
	}
}
// TestRemoveBlacklist feeds ExtractContent a page with four <article>
// elements, three of which carry class names matching the blacklist patterns
// ("-ad", "g-plus", "popupbody"), and verifies that only the remaining
// "legit" article is present in the extracted output.
func TestRemoveBlacklist(t *testing.T) {
	const page = `
<html>
<head>
<title>Test</title>
</head>
<body>
<article class="super-ad">Some content</article>
<article class="g-plus-crap">Some other thing</article>
<article class="stuff popupbody">And more</article>
<article class="legit">Valid!</article>
</body>
</html>`
	const expected = `<div><div><articleclass="legit">Valid!</article></div></div>`

	_, got, err := ExtractContent(strings.NewReader(page))
	if err != nil {
		t.Fatal(err)
	}

	// Drop newlines, spaces and tabs so the comparison ignores formatting;
	// this is why the expected value reads `<articleclass="legit">`.
	got = strings.NewReplacer("\n", "", " ", "", "\t", "").Replace(got)
	if got != expected {
		t.Errorf(`Invalid content, got %s instead of %s`, got, expected)
	}
}