Use an alternative way to detect image

When parsing content to retrieve images to save locally, we only check for the content-type of the image response.
In some case, that value is empty.
Now we’re also checking for the first few bytes of the content as an alternative to detect if it’s an image wallabag can handle.
We might get higher image supports using that alternative method.
This commit is contained in:
Jeremy Benoist 2017-06-05 22:54:02 +02:00
parent a687c8d915
commit 577c0b6dd8
No known key found for this signature in database
GPG key ID: BCA73962457ACC3C
3 changed files with 70 additions and 5 deletions

View file

@ -5,6 +5,7 @@ namespace Wallabag\CoreBundle\Helper;
use Psr\Log\LoggerInterface;
use Symfony\Component\DomCrawler\Crawler;
use GuzzleHttp\Client;
use GuzzleHttp\Message\Response;
use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser;
use Symfony\Component\Finder\Finder;
@ -116,13 +117,11 @@ class DownloadImages
return false;
}
$ext = $this->mimeGuesser->guess($res->getHeader('content-type'));
$this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]);
if (!in_array($ext, ['jpeg', 'jpg', 'gif', 'png'], true)) {
$this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping: '.$imagePath);
$ext = $this->getExtensionFromResponse($res, $imagePath);
if (false === $res) {
return false;
}
$hashImage = hash('crc32', $absolutePath);
$localPath = $folderPath.'/'.$hashImage.'.'.$ext;
@ -237,4 +236,45 @@ class DownloadImages
return false;
}
/**
* Retrieve and validate the extension from the response of the url of the image.
*
* @param Response $res Guzzle Response
* @param string $imagePath Path from the src image from the content (used for log only)
*
* @return string|false Extension name or false if validation failed
*/
private function getExtensionFromResponse(Response $res, $imagePath)
{
$ext = $this->mimeGuesser->guess($res->getHeader('content-type'));
$this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]);
// ok header doesn't have the extension, try a different way
if (empty($ext)) {
$types = [
'jpeg' => "\xFF\xD8\xFF",
'gif' => 'GIF',
'png' => "\x89\x50\x4e\x47\x0d\x0a",
];
$bytes = substr((string) $res->getBody(), 0, 8);
foreach ($types as $type => $header) {
if (0 === strpos($bytes, $header)) {
$ext = $type;
break;
}
}
$this->logger->debug('DownloadImages: Checking extension (alternative)', ['ext' => $ext]);
}
if (!in_array($ext, ['jpeg', 'jpg', 'gif', 'png'], true)) {
$this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping: '.$imagePath);
return false;
}
return $ext;
}
}

View file

@ -157,4 +157,29 @@ class DownloadImagesTest extends \PHPUnit_Framework_TestCase
$this->assertFalse($res, 'Absolute image can not be determined, so it will not be replaced');
}
public function testProcessRealImage()
{
$client = new Client();
$mock = new Mock([
new Response(200, ['content-type' => null], Stream::factory(file_get_contents(__DIR__.'/../fixtures/image-no-content-type.jpg'))),
]);
$client->getEmitter()->attach($mock);
$logHandler = new TestHandler();
$logger = new Logger('test', array($logHandler));
$download = new DownloadImages($client, sys_get_temp_dir().'/wallabag_test', 'http://wallabag.io/', $logger);
$res = $download->processSingleImage(
123,
'https://cdn.theconversation.com/files/157200/article/width926/gsj2rjp2-1487348607.jpg',
'https://theconversation.com/conversation-avec-gerald-bronner-ce-nest-pas-la-post-verite-qui-nous-menace-mais-lextension-de-notre-credulite-73089'
);
$this->assertContains('http://wallabag.io/assets/images/9/b/9b0ead26/', $res, 'Content-Type was empty but data is ok for an image');
$this->assertContains('DownloadImages: Checking extension (alternative)', $logHandler->getRecords()[3]['message']);
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 346 KiB