mirror of
https://github.com/miniflux/v2.git
synced 2025-08-06 17:41:00 +00:00
refactor(readability): add a getSelectionLength function
When we're only interested in the length of contained Text, there is no need to materialize it fully to then call len() on the result: we can simply iterate over the text element and sum their length instead.
This commit is contained in:
parent
435a950d64
commit
8a98926674
1 changed files with 29 additions and 5 deletions
|
@ -107,6 +107,28 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
|
||||||
return baseURL, extractedContent, nil
|
return baseURL, extractedContent, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getSelectionLength(s *goquery.Selection) int {
|
||||||
|
var getLengthOfTextContent func(*html.Node) int
|
||||||
|
getLengthOfTextContent = func(n *html.Node) int {
|
||||||
|
total := 0
|
||||||
|
if n.Type == html.TextNode {
|
||||||
|
total += len(n.Data)
|
||||||
|
}
|
||||||
|
if n.FirstChild != nil {
|
||||||
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||||
|
total += getLengthOfTextContent(c)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
|
sum := 0
|
||||||
|
for _, n := range s.Nodes {
|
||||||
|
sum += getLengthOfTextContent(n)
|
||||||
|
}
|
||||||
|
return sum
|
||||||
|
}
|
||||||
|
|
||||||
// Now that we have the top candidate, look through its siblings for content that might also be related.
|
// Now that we have the top candidate, look through its siblings for content that might also be related.
|
||||||
// Things like preambles, content split by ads that we removed, etc.
|
// Things like preambles, content split by ads that we removed, etc.
|
||||||
func getArticle(topCandidate *candidate, candidates candidateList) string {
|
func getArticle(topCandidate *candidate, candidates candidateList) string {
|
||||||
|
@ -127,8 +149,7 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
|
||||||
} else if s.Is("p") {
|
} else if s.Is("p") {
|
||||||
tag = node.Data
|
tag = node.Data
|
||||||
linkDensity := getLinkDensity(s)
|
linkDensity := getLinkDensity(s)
|
||||||
content := s.Text()
|
contentLength := getSelectionLength(s)
|
||||||
contentLength := len(content)
|
|
||||||
|
|
||||||
if contentLength >= 80 {
|
if contentLength >= 80 {
|
||||||
if linkDensity < .25 {
|
if linkDensity < .25 {
|
||||||
|
@ -136,6 +157,8 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if linkDensity == 0 {
|
if linkDensity == 0 {
|
||||||
|
// It's a small selection, so .Text doesn't impact performances too much.
|
||||||
|
content := s.Text()
|
||||||
if containsSentence(content) {
|
if containsSentence(content) {
|
||||||
append = true
|
append = true
|
||||||
}
|
}
|
||||||
|
@ -223,10 +246,10 @@ func getCandidates(document *goquery.Document) candidateList {
|
||||||
candidates := make(candidateList)
|
candidates := make(candidateList)
|
||||||
|
|
||||||
document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) {
|
document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) {
|
||||||
text := s.Text()
|
textLen := getSelectionLength(s)
|
||||||
|
|
||||||
// If this paragraph is less than 25 characters, don't even count it.
|
// If this paragraph is less than 25 characters, don't even count it.
|
||||||
if len(text) < 25 {
|
if textLen < 25 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -253,10 +276,11 @@ func getCandidates(document *goquery.Document) candidateList {
|
||||||
contentScore := float32(1.0)
|
contentScore := float32(1.0)
|
||||||
|
|
||||||
// Add points for any commas within this paragraph.
|
// Add points for any commas within this paragraph.
|
||||||
|
text := s.Text()
|
||||||
contentScore += float32(strings.Count(text, ",") + 1)
|
contentScore += float32(strings.Count(text, ",") + 1)
|
||||||
|
|
||||||
// For every 100 characters in this paragraph, add another point. Up to 3 points.
|
// For every 100 characters in this paragraph, add another point. Up to 3 points.
|
||||||
contentScore += float32(min(len(text)/100.0, 3))
|
contentScore += float32(min(textLen/100.0, 3))
|
||||||
|
|
||||||
candidates[parentNode].score += contentScore
|
candidates[parentNode].score += contentScore
|
||||||
if grandParentNode != nil {
|
if grandParentNode != nil {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue