1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-06 17:41:00 +00:00

refactor(readability): add a getSelectionLength function

When we're only interested in the length of the contained text, there is no need to
materialize it fully and then call len() on the result: we can simply iterate
over the text nodes and sum their lengths instead.
This commit is contained in:
jvoisin 2025-07-01 18:23:42 +02:00 committed by Frédéric Guillot
parent 435a950d64
commit 8a98926674

View file

@ -107,6 +107,28 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
return baseURL, extractedContent, nil return baseURL, extractedContent, nil
} }
// getSelectionLength returns the combined length, in bytes, of the Data of
// every text node reachable from the nodes of the selection. The text is
// never concatenated; only the lengths are added up.
func getSelectionLength(s *goquery.Selection) int {
	length := 0

	// walk visits node and all of its descendants, adding the size of each
	// text node it encounters to length.
	var walk func(node *html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.TextNode {
			length += len(node.Data)
		}
		// A node without children has a nil FirstChild, so the loop body is
		// simply skipped in that case.
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	for _, root := range s.Nodes {
		walk(root)
	}
	return length
}
// Now that we have the top candidate, look through its siblings for content that might also be related. // Now that we have the top candidate, look through its siblings for content that might also be related.
// Things like preambles, content split by ads that we removed, etc. // Things like preambles, content split by ads that we removed, etc.
func getArticle(topCandidate *candidate, candidates candidateList) string { func getArticle(topCandidate *candidate, candidates candidateList) string {
@ -127,8 +149,7 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
} else if s.Is("p") { } else if s.Is("p") {
tag = node.Data tag = node.Data
linkDensity := getLinkDensity(s) linkDensity := getLinkDensity(s)
content := s.Text() contentLength := getSelectionLength(s)
contentLength := len(content)
if contentLength >= 80 { if contentLength >= 80 {
if linkDensity < .25 { if linkDensity < .25 {
@ -136,6 +157,8 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
} }
} else { } else {
if linkDensity == 0 { if linkDensity == 0 {
// It's a small selection, so calling .Text() doesn't impact performance too much.
content := s.Text()
if containsSentence(content) { if containsSentence(content) {
append = true append = true
} }
@ -223,10 +246,10 @@ func getCandidates(document *goquery.Document) candidateList {
candidates := make(candidateList) candidates := make(candidateList)
document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) { document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) {
text := s.Text() textLen := getSelectionLength(s)
// If this paragraph is less than 25 characters, don't even count it. // If this paragraph is less than 25 characters, don't even count it.
if len(text) < 25 { if textLen < 25 {
return return
} }
@ -253,10 +276,11 @@ func getCandidates(document *goquery.Document) candidateList {
contentScore := float32(1.0) contentScore := float32(1.0)
// Add points for any commas within this paragraph. // Add points for any commas within this paragraph.
text := s.Text()
contentScore += float32(strings.Count(text, ",") + 1) contentScore += float32(strings.Count(text, ",") + 1)
// For every 100 characters in this paragraph, add another point. Up to 3 points. // For every 100 characters in this paragraph, add another point. Up to 3 points.
contentScore += float32(min(len(text)/100.0, 3)) contentScore += float32(min(textLen/100.0, 3))
candidates[parentNode].score += contentScore candidates[parentNode].score += contentScore
if grandParentNode != nil { if grandParentNode != nil {