From 2f7b2e737533ed31777d1db3275fe9c4057e5bb9 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 29 Jun 2025 19:21:36 +0200 Subject: [PATCH] perf(readability): improve getLinkDensity - There is no need to materialize the whole text content of a given Node when we can simply compute its length directly. This saves a lot of memory: on the order of several megabytes on my instance, with peaks at a couple dozen megabytes. - One might object to the use of recursion, but this is a direct port of goquery's Text method, so this change doesn't make anything worse. - linkLength can be computed in a similar way, but this can go in another commit, as it's a bit trickier: we need the length of every text Node that has an `a` Node as an ancestor, without walking the whole parent chain every time. --- internal/reader/readability/readability.go | 24 ++++++++++++++++--- .../reader/readability/readability_test.go | 2 +- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go index c423f1a7..a3270c86 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -300,15 +300,33 @@ func scoreNode(s *goquery.Selection) *candidate { // Get the density of links as a percentage of the content // This is the amount of text that is inside a link divided by the total text in the node. 
func getLinkDensity(s *goquery.Selection) float32 { - textLength := len(s.Text()) + var getLengthOfTextContent func(*html.Node) int + getLengthOfTextContent = func(n *html.Node) int { + total := 0 + if n.Type == html.TextNode { + total += len(n.Data) + } + if n.FirstChild != nil { + for c := n.FirstChild; c != nil; c = c.NextSibling { + total += getLengthOfTextContent(c) + } + } + return total + } - if textLength == 0 { + sum := 0 + for _, n := range s.Nodes { + sum += getLengthOfTextContent(n) + } + + if sum == 0 { return 0 } + // TODO: use something better than materializing the HTML. linkLength := len(s.Find("a").Text()) - return float32(linkLength) / float32(textLength) + return float32(linkLength) / float32(sum) } // Get an elements class/id weight. Uses regular expressions to tell if this diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go index 96813d42..de4a434c 100644 --- a/internal/reader/readability/readability_test.go +++ b/internal/reader/readability/readability_test.go @@ -1274,7 +1274,7 @@ func TestGetLinkDensity(t *testing.T) { // Use a small epsilon for float comparison epsilon := float32(0.001) if result < tc.expected-epsilon || result > tc.expected+epsilon { - t.Errorf("Expected link density %f, got %f", tc.expected, result) + t.Errorf("Expected link density %f, got %f for %s", tc.expected, result, tc.name) } }) }