1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-06 17:41:00 +00:00

perf(readability): improve getLinkDensity

- There is no need to materialize all the text content of a given Node when we
  can simply compute its length directly. This saves a lot of memory: on the
  order of several megabytes on my instance, with peaks at a couple dozen
  megabytes.
- One might object to the usage of a recursive construct, but this is a direct
  port of goquery's Text method, so this change doesn't make anything worse.
- linkLength can be computed in a similar way, but this can go in another
  commit, as it's a bit trickier: we need the length of every text Node that
  has an `a` Node among its ancestors, without walking the whole parent chain
  every time.
This commit is contained in:
jvoisin 2025-06-29 19:21:36 +02:00 committed by Frédéric Guillot
parent 6eeccae7cd
commit 2f7b2e7375
2 changed files with 22 additions and 4 deletions

View file

@ -300,15 +300,33 @@ func scoreNode(s *goquery.Selection) *candidate {
// Get the density of links as a percentage of the content // Get the density of links as a percentage of the content
// This is the amount of text that is inside a link divided by the total text in the node. // This is the amount of text that is inside a link divided by the total text in the node.
func getLinkDensity(s *goquery.Selection) float32 { func getLinkDensity(s *goquery.Selection) float32 {
textLength := len(s.Text()) var getLengthOfTextContent func(*html.Node) int
getLengthOfTextContent = func(n *html.Node) int {
total := 0
if n.Type == html.TextNode {
total += len(n.Data)
}
if n.FirstChild != nil {
for c := n.FirstChild; c != nil; c = c.NextSibling {
total += getLengthOfTextContent(c)
}
}
return total
}
if textLength == 0 { sum := 0
for _, n := range s.Nodes {
sum += getLengthOfTextContent(n)
}
if sum == 0 {
return 0 return 0
} }
// TODO: use something better than materializing the HTML.
linkLength := len(s.Find("a").Text()) linkLength := len(s.Find("a").Text())
return float32(linkLength) / float32(textLength) return float32(linkLength) / float32(sum)
} }
// Get an elements class/id weight. Uses regular expressions to tell if this // Get an elements class/id weight. Uses regular expressions to tell if this

View file

@ -1274,7 +1274,7 @@ func TestGetLinkDensity(t *testing.T) {
// Use a small epsilon for float comparison // Use a small epsilon for float comparison
epsilon := float32(0.001) epsilon := float32(0.001)
if result < tc.expected-epsilon || result > tc.expected+epsilon { if result < tc.expected-epsilon || result > tc.expected+epsilon {
t.Errorf("Expected link density %f, got %f", tc.expected, result) t.Errorf("Expected link density %f, got %f for %s", tc.expected, result, tc.name)
} }
}) })
} }