mirror of
https://github.com/miniflux/v2.git
synced 2025-08-06 17:41:00 +00:00
perf(readability): improve getLinkDensity
- There is no need to materialize the whole text content of a given Node when we can simply compute its length directly. This saves a lot of memory: on the order of several megabytes on my instance, with peaks of a couple of dozen megabytes.
- One might object to the use of a recursive construct, but this is a direct port of goquery's Text method, so this change doesn't make anything worse.
- linkLength could be computed in a similar way, but that can go in another commit, as it's a bit trickier: we need the length of every Node that has an `a` Node as an ancestor, without walking the whole parent chain every time.
This commit is contained in:
parent
6eeccae7cd
commit
2f7b2e7375
2 changed files with 22 additions and 4 deletions
|
@ -300,15 +300,33 @@ func scoreNode(s *goquery.Selection) *candidate {
|
|||
// Get the density of links as a percentage of the content
|
||||
// This is the amount of text that is inside a link divided by the total text in the node.
|
||||
func getLinkDensity(s *goquery.Selection) float32 {
|
||||
textLength := len(s.Text())
|
||||
var getLengthOfTextContent func(*html.Node) int
|
||||
getLengthOfTextContent = func(n *html.Node) int {
|
||||
total := 0
|
||||
if n.Type == html.TextNode {
|
||||
total += len(n.Data)
|
||||
}
|
||||
if n.FirstChild != nil {
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
total += getLengthOfTextContent(c)
|
||||
}
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
if textLength == 0 {
|
||||
sum := 0
|
||||
for _, n := range s.Nodes {
|
||||
sum += getLengthOfTextContent(n)
|
||||
}
|
||||
|
||||
if sum == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
// TODO: use something better than materializing the HTML.
|
||||
linkLength := len(s.Find("a").Text())
|
||||
|
||||
return float32(linkLength) / float32(textLength)
|
||||
return float32(linkLength) / float32(sum)
|
||||
}
|
||||
|
||||
// Get an elements class/id weight. Uses regular expressions to tell if this
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue