From 1de9cf4241695ac3ff1d25fe8ac3de8204383b22 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Thu, 10 Jul 2025 17:21:16 +0200 Subject: [PATCH] perf(readability): simplify removeUnlikelyCandidates - Use an iterator instead of generating a whole slice when iterating over the selection. - Using an iterator allows the use of a for-loop construct instead of a lambda, which is a bit clearer. - Do the filtering in Find()'s selector instead of in the loop. This doesn't matter much now that we're using an iterator, but it makes the code a bit more obvious/simpler, and likely slightly reduces the number of iterations. --- internal/reader/readability/readability.go | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go index 0510d1cb..6d1d4289 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -208,14 +208,18 @@ func shouldRemoveCandidate(str string) bool { } func removeUnlikelyCandidates(document *goquery.Document) { - document.Find("*").Each(func(i int, s *goquery.Selection) { - if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" { - return + // Only select tags with either a class or an id attribute, + // and never the html nor body tags, as we don't want to ever remove them. 
+ selector := "[class]:not(body,html)" + "," + "[id]:not(body,html)" + + for _, s := range document.Find(selector).EachIter() { + if s.Length() == 0 { + continue } // Don't remove elements within code blocks (pre or code tags) - if s.Closest("pre, code").Length() > 0 { - return + if s.Closest("pre,code").Length() > 0 { + continue } if class, ok := s.Attr("class"); ok && shouldRemoveCandidate(class) { @@ -223,7 +227,7 @@ func removeUnlikelyCandidates(document *goquery.Document) { } else if id, ok := s.Attr("id"); ok && shouldRemoveCandidate(id) { s.Remove() } - }) + } } func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate {