diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go index 7c5384ee..d3d4fca9 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -107,6 +107,28 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er return baseURL, extractedContent, nil } +func getSelectionLength(s *goquery.Selection) int { + var getLengthOfTextContent func(*html.Node) int + getLengthOfTextContent = func(n *html.Node) int { + total := 0 + if n.Type == html.TextNode { + total += len(n.Data) + } + if n.FirstChild != nil { + for c := n.FirstChild; c != nil; c = c.NextSibling { + total += getLengthOfTextContent(c) + } + } + return total + } + + sum := 0 + for _, n := range s.Nodes { + sum += getLengthOfTextContent(n) + } + return sum +} + // Now that we have the top candidate, look through its siblings for content that might also be related. // Things like preambles, content split by ads that we removed, etc. func getArticle(topCandidate *candidate, candidates candidateList) string { @@ -127,8 +149,7 @@ func getArticle(topCandidate *candidate, candidates candidateList) string { } else if s.Is("p") { tag = node.Data linkDensity := getLinkDensity(s) - content := s.Text() - contentLength := len(content) + contentLength := getSelectionLength(s) if contentLength >= 80 { if linkDensity < .25 { @@ -136,6 +157,8 @@ func getArticle(topCandidate *candidate, candidates candidateList) string { } } else { if linkDensity == 0 { + // It's a small selection, so .Text doesn't impact performance too much. + content := s.Text() if containsSentence(content) { append = true } @@ -223,10 +246,10 @@ func getCandidates(document *goquery.Document) candidateList { candidates := make(candidateList) document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) { - text := s.Text() + textLen := getSelectionLength(s) // If this paragraph is less than 25 characters, don't even count it. 
- if len(text) < 25 { + if textLen < 25 { return } @@ -253,10 +276,11 @@ func getCandidates(document *goquery.Document) candidateList { contentScore := float32(1.0) // Add points for any commas within this paragraph. + text := s.Text() contentScore += float32(strings.Count(text, ",") + 1) // For every 100 characters in this paragraph, add another point. Up to 3 points. - contentScore += float32(min(len(text)/100.0, 3)) + contentScore += float32(min(textLen/100.0, 3)) candidates[parentNode].score += contentScore if grandParentNode != nil {