Fix srcset attribute on images downloaded

This commit is contained in:
Simounet 2018-05-31 23:42:06 +02:00
parent 9707ac4661
commit c15bb5ad72
2 changed files with 54 additions and 3 deletions

View file

@ -42,14 +42,17 @@ class DownloadImages
public function processHtml($entryId, $html, $url)
{
$crawler = new Crawler($html);
$result = $crawler
->filterXpath('//img')
$imagesCrawler = $crawler
->filterXpath('//img');
$imagesUrls = $imagesCrawler
->extract(['src']);
$imagesSrcsetUrls = $this->getSrcsetUrls($imagesCrawler);
$imagesUrls = array_unique(array_merge($imagesUrls, $imagesSrcsetUrls));
$relativePath = $this->getRelativePath($entryId);
// download and save the image to the folder
foreach ($result as $image) {
foreach ($imagesUrls as $image) {
$imagePath = $this->processSingleImage($entryId, $image, $url, $relativePath);
if (false === $imagePath) {
@ -171,6 +174,33 @@ class DownloadImages
@rmdir($folderPath);
}
/**
* Get images urls from the srcset image attribute.
*
* @param Crawler $imagesCrawler
*
* @return array An array of urls
*/
protected function getSrcsetUrls(Crawler $imagesCrawler)
{
$urls = [];
$iterator = $imagesCrawler
->getIterator();
while ($iterator->valid()) {
$srcsetAttribute = $iterator->current()->getAttribute('srcset');
if ('' !== $srcsetAttribute) {
$srcset = array_map('trim', explode(',', $srcsetAttribute));
$srcsetUrls = array_map(function ($src) {
return explode(' ', $src)[0];
}, $srcset);
$urls = array_merge($srcsetUrls, $urls);
}
$iterator->next();
}
return $urls;
}
/**
* Setup base folder where all images are going to be saved.
*/

View file

@ -183,4 +183,25 @@ class DownloadImagesTest extends TestCase
$this->assertContains('http://wallabag.io/assets/images/9/b/9b0ead26/', $res, 'Content-Type was empty but data is ok for an image');
$this->assertContains('DownloadImages: Checking extension (alternative)', $logHandler->getRecords()[3]['message']);
}
public function testProcessImageWithSrcset()
{
$client = new Client();
$mock = new Mock([
new Response(200, ['content-type' => 'image/jpeg'], Stream::factory(file_get_contents(__DIR__ . '/../fixtures/image-no-content-type.jpg'))),
new Response(200, ['content-type' => 'image/jpeg'], Stream::factory(file_get_contents(__DIR__ . '/../fixtures/image-no-content-type.jpg'))),
new Response(200, ['content-type' => 'image/jpeg'], Stream::factory(file_get_contents(__DIR__ . '/../fixtures/image-no-content-type.jpg'))),
]);
$client->getEmitter()->attach($mock);
$logHandler = new TestHandler();
$logger = new Logger('test', [$logHandler]);
$download = new DownloadImages($client, sys_get_temp_dir() . '/wallabag_test', 'http://wallabag.io/', $logger);
$res = $download->processHtml(123, '<p><img class="alignnone wp-image-1153" src="http://piketty.blog.lemonde.fr/files/2017/10/F1FR-530x375.jpg" alt="" width="628" height="444" srcset="http://piketty.blog.lemonde.fr/files/2017/10/F1FR-530x375.jpg 530w, http://piketty.blog.lemonde.fr/files/2017/10/F1FR-768x543.jpg 768w, http://piketty.blog.lemonde.fr/files/2017/10/F1FR-900x636.jpg 900w" sizes="(max-width: 628px) 100vw, 628px" /></p>', 'http://piketty.blog.lemonde.fr/2017/10/12/budget-2018-la-jeunesse-sacrifiee/');
$this->assertNotContains('http://piketty.blog.lemonde.fr/', $res, 'Image srcset attribute were not replaced');
}
}