From 6ad5ad0bb23ab1faa60ff5c30e2f8ef03a6c22e3 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Fri, 13 Dec 2024 04:41:56 +0000 Subject: [PATCH] refactor(readability): various improvements and optimizations - Replace a completely overkill regex - Use `.Remove()` instead of a hand-rolled loop - Use a strings.Builder instead of a bytes.NewBufferString - Replace a call to Fprintf with string concatenation, as the latter are much faster - Remove a superfluous cast - Delay some computations - Add some tests --- internal/reader/readability/readability.go | 52 +++++++--------- .../reader/readability/readability_test.go | 61 +++++++++++++++++++ 2 files changed, 84 insertions(+), 29 deletions(-) diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go index 18f30ded..8f10b145 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -4,7 +4,6 @@ package readability // import "miniflux.app/v2/internal/reader/readability" import ( - "bytes" "fmt" "io" "log/slog" @@ -23,7 +22,6 @@ const ( var ( divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`) - sentenceRegexp = regexp.MustCompile(`\.( |$)`) blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`) okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`) @@ -84,7 +82,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er } document.Find("script,style").Each(func(i int, s *goquery.Selection) { - removeNodes(s) + s.Remove() }) transformMisusedDivsIntoParagraphs(document) @@ -106,7 +104,8 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er // Now that we have the top candidate, look through its siblings for content that might also be related. // Things like preambles, content split by ads that we removed, etc. func getArticle(topCandidate *candidate, candidates candidateList) string { - output := bytes.NewBufferString("
") + var output strings.Builder + output.WriteString("
") siblingScoreThreshold := max(10, topCandidate.score*.2) topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) { @@ -124,10 +123,14 @@ func getArticle(topCandidate *candidate, candidates candidateList) string { content := s.Text() contentLength := len(content) - if contentLength >= 80 && linkDensity < .25 { - append = true - } else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) { - append = true + if contentLength >= 80 { + if linkDensity < .25 { + append = true + } + } else { + if linkDensity == 0 && containsSentence(content) { + append = true + } } } @@ -138,7 +141,7 @@ func getArticle(topCandidate *candidate, candidates candidateList) string { } html, _ := s.Html() - fmt.Fprintf(output, "<%s>%s", tag, html, tag) + output.WriteString("<" + tag + ">" + html + "") } }) @@ -156,9 +159,9 @@ func removeUnlikelyCandidates(document *goquery.Document) { str := strings.ToLower(class + id) if blacklistCandidatesRegexp.MatchString(str) { - removeNodes(s) + s.Remove() } else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) { - removeNodes(s) + s.Remove() } }) } @@ -222,7 +225,7 @@ func getCandidates(document *goquery.Document) candidateList { contentScore += float32(strings.Count(text, ",") + 1) // For every 100 characters in this paragraph, add another point. Up to 3 points. - contentScore += float32(min(int(len(text)/100.0), 3)) + contentScore += float32(min(len(text)/100.0, 3)) candidates[parentNode].score += contentScore if grandParentNode != nil { @@ -261,13 +264,14 @@ func scoreNode(s *goquery.Selection) *candidate { // Get the density of links as a percentage of the content // This is the amount of text that is inside a link divided by the total text in the node. func getLinkDensity(s *goquery.Selection) float32 { - linkLength := len(s.Find("a").Text()) textLength := len(s.Text()) if textLength == 0 { return 0 } + linkLength := len(s.Find("a").Text()) + return float32(linkLength) / float32(textLength) } @@ -278,25 +282,20 @@ func getClassWeight(s *goquery.Selection) float32 { class, _ := s.Attr("class") id, _ := s.Attr("id") - class = strings.ToLower(class) - id = strings.ToLower(id) - if class != "" { + class = strings.ToLower(class) if negativeRegexp.MatchString(class) { weight -= 25 - } - - if positiveRegexp.MatchString(class) { + } else if positiveRegexp.MatchString(class) { weight += 25 } } if id != "" { + id = strings.ToLower(id) if negativeRegexp.MatchString(id) { weight -= 25 - } - - if positiveRegexp.MatchString(id) { + } else if positiveRegexp.MatchString(id) { weight += 25 } } @@ -314,11 +313,6 @@ func transformMisusedDivsIntoParagraphs(document *goquery.Document) { }) } -func removeNodes(s *goquery.Selection) { - s.Each(func(i int, s *goquery.Selection) { - parent := s.Parent() - if parent.Length() > 0 { - parent.Get(0).RemoveChild(s.Get(0)) - } - }) +func containsSentence(content string) bool { + return strings.HasSuffix(content, ".") || strings.Contains(content, ". ") } diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go index bd47d859..8baee1a0 100644 --- a/internal/reader/readability/readability_test.go +++ b/internal/reader/readability/readability_test.go @@ -100,3 +100,64 @@ func TestWithoutBaseURL(t *testing.T) { t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL) } } + +func TestRemoveStyleScript(t *testing.T) { + html := ` + + + Test + + + + + +
Some content
+ + ` + want := `
Somecontent
` + + _, content, err := ExtractContent(strings.NewReader(html)) + if err != nil { + t.Fatal(err) + } + + content = strings.ReplaceAll(content, "\n", "") + content = strings.ReplaceAll(content, " ", "") + content = strings.ReplaceAll(content, "\t", "") + + if content != want { + t.Errorf(`Invalid content, got %s instead of %s`, content, want) + } +} + +func TestRemoveBlacklist(t *testing.T) { + html := ` + + + Test + + +
Some content
+
Some other thing
+
And more
+
Valid!
+ + ` + want := `
Valid!
` + + _, content, err := ExtractContent(strings.NewReader(html)) + if err != nil { + t.Fatal(err) + } + + content = strings.ReplaceAll(content, "\n", "") + content = strings.ReplaceAll(content, " ", "") + content = strings.ReplaceAll(content, "\t", "") + + if content != want { + t.Errorf(`Invalid content, got %s instead of %s`, content, want) + } +}