Use graby ContentExtractor to clean HTML

It might be better to reuse some of graby's existing functionality to clean HTML instead of building a new system.
2025-09-05 18:41:02 +00:00 · 2017-05-12 07:53:21 +02:00 · 2017-05-12 07:53:21 +02:00 · 74a75f7d43
commit 74a75f7d43
parent fb436e8ca0
4 changed files with 66 additions and 2 deletions
--- a/src/Wallabag/ApiBundle/Controller/EntryRestController.php
+++ b/src/Wallabag/ApiBundle/Controller/EntryRestController.php
@@ -336,7 +336,6 @@ class EntryRestController extends WallabagRestController
            $entry->setUrl($url);
        }

-
        if (!empty($tags)) {
            $this->get('wallabag_core.tags_assigner')->assignTagsToEntry($entry, $tags);
        }
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -47,6 +47,16 @@ class ContentProxy
    {
        // ensure content is a bit cleaned up
        if (!empty($content['html'])) {
+            $extractor = $this->graby->getExtractor();
+            $contentExtracted = $extractor->process($content['html'], $url);
+
+            if ($contentExtracted) {
+                $contentBlock = $extractor->getContent();
+                $contentBlock->normalize();
+
+                $content['html'] = trim($contentBlock->innerHTML);
+            }
+
            $content['html'] = htmLawed($content['html'], [
                'safe' => 1,
                // which means: do not remove iframe elements