1
0
Fork 0
mirror of https://github.com/wallabag/wallabag.git synced 2025-07-27 17:28:39 +00:00

More robust srcset image attribute handling

Linked to HTMLawed PR https://github.com/kesar/HTMLawed/pull/17
This commit is contained in:
Simounet 2018-07-11 19:57:34 +02:00
parent 0f36a88e16
commit e6f12c0734
2 changed files with 33 additions and 3 deletions

View file

@@ -185,7 +185,7 @@ class DownloadImages
*
* @return array An array of urls
*/
protected function getSrcsetUrls(Crawler $imagesCrawler)
private function getSrcsetUrls(Crawler $imagesCrawler)
{
$urls = [];
$iterator = $imagesCrawler
@@ -193,9 +193,14 @@ class DownloadImages
while ($iterator->valid()) {
$srcsetAttribute = $iterator->current()->getAttribute('srcset');
if ('' !== $srcsetAttribute) {
$srcset = array_map('trim', explode(',', $srcsetAttribute));
// Couldn't start with " OR ' OR a white space
// Could be one or more white space
// Must be one or more digits followed by w OR x
$pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/";
preg_match_all($pattern, $srcsetAttribute, $matches);
$srcset = call_user_func_array('array_merge', $matches);
$srcsetUrls = array_map(function ($src) {
return explode(' ', $src)[0];
return trim(explode(' ', $src, 2)[0]);
}, $srcset);
$urls = array_merge($srcsetUrls, $urls);
}