diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go index 0510d1cb..6d1d4289 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -208,14 +208,18 @@ func shouldRemoveCandidate(str string) bool { } func removeUnlikelyCandidates(document *goquery.Document) { - document.Find("*").Each(func(i int, s *goquery.Selection) { - if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" { - return + // Only select tags with either a class or an id attribute, + // and never the html nor body tags, as we don't want to ever remove them. + selector := "[class]:not(body,html)" + "," + "[id]:not(body,html)" + + for _, s := range document.Find(selector).EachIter() { + if s.Length() == 0 { + continue } // Don't remove elements within code blocks (pre or code tags) - if s.Closest("pre, code").Length() > 0 { - return + if s.Closest("pre,code").Length() > 0 { + continue } if class, ok := s.Attr("class"); ok && shouldRemoveCandidate(class) { @@ -223,7 +227,7 @@ func removeUnlikelyCandidates(document *goquery.Document) { } else if id, ok := s.Attr("id"); ok && shouldRemoveCandidate(id) { s.Remove() } - }) + } } func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate {