1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-06 17:41:00 +00:00

refactor(readability): add a getSelectionLength function

When we're only interested in the length of the contained text, there is no need to
materialize it fully and then call len() on the result: we can simply iterate
over the text elements and sum their lengths instead.
This commit is contained in:
jvoisin 2025-07-01 18:23:42 +02:00 committed by Frédéric Guillot
parent 435a950d64
commit 8a98926674

View file

@ -107,6 +107,28 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
return baseURL, extractedContent, nil
}
// getSelectionLength returns the combined byte length of every text node found
// in the selection's nodes and their descendants, without building the
// concatenated string first.
func getSelectionLength(s *goquery.Selection) int {
	// textLen walks the subtree rooted at node and adds up len(Data) of each
	// text node it encounters.
	var textLen func(node *html.Node) int
	textLen = func(node *html.Node) int {
		size := 0
		if node.Type == html.TextNode {
			size = len(node.Data)
		}
		// The loop condition already covers the "no children" case.
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			size += textLen(child)
		}
		return size
	}

	length := 0
	for i := range s.Nodes {
		length += textLen(s.Nodes[i])
	}
	return length
}
// Now that we have the top candidate, look through its siblings for content that might also be related.
// Things like preambles, content split by ads that we removed, etc.
func getArticle(topCandidate *candidate, candidates candidateList) string {
@ -127,8 +149,7 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
} else if s.Is("p") {
tag = node.Data
linkDensity := getLinkDensity(s)
content := s.Text()
contentLength := len(content)
contentLength := getSelectionLength(s)
if contentLength >= 80 {
if linkDensity < .25 {
@ -136,6 +157,8 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
}
} else {
if linkDensity == 0 {
// It's a small selection, so .Text() doesn't impact performance too much.
content := s.Text()
if containsSentence(content) {
append = true
}
@ -223,10 +246,10 @@ func getCandidates(document *goquery.Document) candidateList {
candidates := make(candidateList)
document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) {
text := s.Text()
textLen := getSelectionLength(s)
// If this paragraph is less than 25 characters, don't even count it.
if len(text) < 25 {
if textLen < 25 {
return
}
@ -253,10 +276,11 @@ func getCandidates(document *goquery.Document) candidateList {
contentScore := float32(1.0)
// Add points for any commas within this paragraph.
text := s.Text()
contentScore += float32(strings.Count(text, ",") + 1)
// For every 100 characters in this paragraph, add another point. Up to 3 points.
contentScore += float32(min(len(text)/100.0, 3))
contentScore += float32(min(textLen/100.0, 3))
candidates[parentNode].score += contentScore
if grandParentNode != nil {