mirror of
https://github.com/miniflux/v2.git
synced 2025-08-06 17:41:00 +00:00
refactor(readability): add a getSelectionLength function
When we're only interested in the length of the contained text, there is no need to fully materialize it and then call len() on the result: we can simply iterate over the text nodes and sum their lengths instead.
This commit is contained in:
parent
435a950d64
commit
8a98926674
1 changed files with 29 additions and 5 deletions
|
@ -107,6 +107,28 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
|
|||
return baseURL, extractedContent, nil
|
||||
}
|
||||
|
||||
func getSelectionLength(s *goquery.Selection) int {
|
||||
var getLengthOfTextContent func(*html.Node) int
|
||||
getLengthOfTextContent = func(n *html.Node) int {
|
||||
total := 0
|
||||
if n.Type == html.TextNode {
|
||||
total += len(n.Data)
|
||||
}
|
||||
if n.FirstChild != nil {
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
total += getLengthOfTextContent(c)
|
||||
}
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
sum := 0
|
||||
for _, n := range s.Nodes {
|
||||
sum += getLengthOfTextContent(n)
|
||||
}
|
||||
return sum
|
||||
}
|
||||
|
||||
// Now that we have the top candidate, look through its siblings for content that might also be related.
|
||||
// Things like preambles, content split by ads that we removed, etc.
|
||||
func getArticle(topCandidate *candidate, candidates candidateList) string {
|
||||
|
@ -127,8 +149,7 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
|
|||
} else if s.Is("p") {
|
||||
tag = node.Data
|
||||
linkDensity := getLinkDensity(s)
|
||||
content := s.Text()
|
||||
contentLength := len(content)
|
||||
contentLength := getSelectionLength(s)
|
||||
|
||||
if contentLength >= 80 {
|
||||
if linkDensity < .25 {
|
||||
|
@ -136,6 +157,8 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
|
|||
}
|
||||
} else {
|
||||
if linkDensity == 0 {
|
||||
// It's a small selection, so .Text doesn't impact performances too much.
|
||||
content := s.Text()
|
||||
if containsSentence(content) {
|
||||
append = true
|
||||
}
|
||||
|
@ -223,10 +246,10 @@ func getCandidates(document *goquery.Document) candidateList {
|
|||
candidates := make(candidateList)
|
||||
|
||||
document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) {
|
||||
text := s.Text()
|
||||
textLen := getSelectionLength(s)
|
||||
|
||||
// If this paragraph is less than 25 characters, don't even count it.
|
||||
if len(text) < 25 {
|
||||
if textLen < 25 {
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -253,10 +276,11 @@ func getCandidates(document *goquery.Document) candidateList {
|
|||
contentScore := float32(1.0)
|
||||
|
||||
// Add points for any commas within this paragraph.
|
||||
text := s.Text()
|
||||
contentScore += float32(strings.Count(text, ",") + 1)
|
||||
|
||||
// For every 100 characters in this paragraph, add another point. Up to 3 points.
|
||||
contentScore += float32(min(len(text)/100.0, 3))
|
||||
contentScore += float32(min(textLen/100.0, 3))
|
||||
|
||||
candidates[parentNode].score += contentScore
|
||||
if grandParentNode != nil {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue