From 89c32d518d140aa49857c880d8ae632599ec3ddb Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Tue, 1 Jul 2025 15:58:11 +0200
Subject: [PATCH] perf(readability): significantly improve transformMisusedDivsIntoParagraphs

---
 internal/reader/readability/readability.go | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go
index a3270c86..7c5384ee 100644
--- a/internal/reader/readability/readability.go
+++ b/internal/reader/readability/readability.go
@@ -361,10 +361,22 @@ func getWeight(s string) int {
 
+// transformMisusedDivsIntoParagraphs renames to <p> every <div> whose direct
+// children include none of the elements listed below. A <div> with no element
+// children at all is renamed as well. Only direct children are inspected;
+// deeper descendants do not prevent the rename.
 func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
 	document.Find("div").Each(func(i int, s *goquery.Selection) {
-		html, _ := s.Html()
-		if !divToPElementsRegexp.MatchString(html) {
-			node := s.Get(0)
-			node.Data = "p"
-		}
+		for _, child := range s.Children().Nodes {
+			switch child.Data {
+			case "a", "blockquote", "div", "dl",
+				"img", "ol", "p", "pre",
+				"table", "ul":
+				// A single such child is enough to keep the <div> as is.
+				return
+			}
+		}
+
+		// Rename only after every child has been checked, so that a later
+		// sibling from the list above still prevents the rename.
+		s.Get(0).Data = "p"
 	})
 }