refactor(readability): add a getSelectionLength function

When we're only interested in the length of the contained text, there is no need to fully materialize it and then call len() on the result: we can simply iterate over the text nodes and sum their lengths instead.
2025-09-30 19:22:11 +00:00 · 2025-07-01 18:23:42 +02:00 · 2025-07-01 18:23:42 +02:00 · 8a98926674
commit 8a98926674
parent 435a950d64
1 changed files with 29 additions and 5 deletions
--- a/internal/reader/readability/readability.go
+++ b/internal/reader/readability/readability.go
@ -107,6 +107,28 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
 	return baseURL, extractedContent, nil
 }

+func getSelectionLength(s *goquery.Selection) int {
+	var getLengthOfTextContent func(*html.Node) int
+	getLengthOfTextContent = func(n *html.Node) int {
+		total := 0
+		if n.Type == html.TextNode {
+			total += len(n.Data)
+		}
+		if n.FirstChild != nil {
+			for c := n.FirstChild; c != nil; c = c.NextSibling {
+				total += getLengthOfTextContent(c)
+			}
+		}
+		return total
+	}
+
+	sum := 0
+	for _, n := range s.Nodes {
+		sum += getLengthOfTextContent(n)
+	}
+	return sum
+}
+
 // Now that we have the top candidate, look through its siblings for content that might also be related.
 // Things like preambles, content split by ads that we removed, etc.
 func getArticle(topCandidate *candidate, candidates candidateList) string {
@ -127,8 +149,7 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
 		} else if s.Is("p") {
 			tag = node.Data
 			linkDensity := getLinkDensity(s)
-			content := s.Text()
-			contentLength := len(content)
+			contentLength := getSelectionLength(s)

 			if contentLength >= 80 {
 				if linkDensity < .25 {
@ -136,6 +157,8 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
 				}
 			} else {
 				if linkDensity == 0 {
+					// It's a small selection, so .Text doesn't impact performance too much.
+					content := s.Text()
 					if containsSentence(content) {
 						append = true
 					}
@ -223,10 +246,10 @@ func getCandidates(document *goquery.Document) candidateList {
 	candidates := make(candidateList)

 	document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) {
-		text := s.Text()
+		textLen := getSelectionLength(s)

 		// If this paragraph is less than 25 characters, don't even count it.
-		if len(text) < 25 {
+		if textLen < 25 {
 			return
 		}

@ -253,10 +276,11 @@ func getCandidates(document *goquery.Document) candidateList {
 		contentScore := float32(1.0)

 		// Add points for any commas within this paragraph.
+		text := s.Text()
 		contentScore += float32(strings.Count(text, ",") + 1)

 		// For every 100 characters in this paragraph, add another point. Up to 3 points.
-		contentScore += float32(min(len(text)/100.0, 3))
+		contentScore += float32(min(textLen/100.0, 3))

 		candidates[parentNode].score += contentScore
 		if grandParentNode != nil {