1
0
Fork 0
mirror of https://github.com/wallabag/wallabag.git synced 2025-07-17 17:08:37 +00:00

Use graby ContentExtractor to clean html

It is probably better to reuse graby's existing functionality to clean the HTML than to build a new system.
This commit is contained in:
Jeremy Benoist 2017-05-12 07:53:21 +02:00
parent fb436e8ca0
commit 74a75f7d43
No known key found for this signature in database
GPG key ID: BCA73962457ACC3C
4 changed files with 66 additions and 2 deletions

View file

@@ -336,7 +336,6 @@ class EntryRestController extends WallabagRestController
$entry->setUrl($url);
}
if (!empty($tags)) {
$this->get('wallabag_core.tags_assigner')->assignTagsToEntry($entry, $tags);
}

View file

@@ -47,6 +47,16 @@ class ContentProxy
{
// ensure content is a bit cleaned up
if (!empty($content['html'])) {
$extractor = $this->graby->getExtractor();
$contentExtracted = $extractor->process($content['html'], $url);
if ($contentExtracted) {
$contentBlock = $extractor->getContent();
$contentBlock->normalize();
$content['html'] = trim($contentBlock->innerHTML);
}
$content['html'] = htmLawed($content['html'], [
'safe' => 1,
// which means: do not remove iframe elements