wallabag/src/Helper/DownloadImages.php

395 lines
13 KiB
PHP
Raw Normal View History

<?php
2024-02-19 00:30:12 +00:00
namespace Wallabag\Helper;
2022-09-27 13:39:39 +00:00
use enshrined\svgSanitize\Sanitizer;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\UriResolver;
2017-07-01 07:52:38 +00:00
use Psr\Log\LoggerInterface;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\Finder\Finder;
2024-02-08 20:13:52 +00:00
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\Mime\MimeTypes;
2024-02-08 20:13:52 +00:00
use Symfony\Contracts\HttpClient\HttpClientInterface;
use Symfony\Contracts\HttpClient\ResponseInterface;
2016-10-22 07:22:30 +00:00
class DownloadImages
{
2022-12-13 09:26:51 +00:00
/**
 * Value passed as the quality argument when JPEG and WebP images are re-encoded.
 * For PNG it is divided by 100 and scaled to 9 to obtain the compression level.
 */
public const REGENERATE_PICTURES_QUALITY = 80;
2016-10-30 09:48:29 +00:00
/** @var HttpClientInterface HTTP client used to download the images */
private $client;
/** @var string Folder under which the downloaded images are written */
private $baseFolder;
/** @var LoggerInterface */
private $logger;
/** @var MimeTypes Used to map a response content-type to a file extension */
private $mimeTypes;
/** @var string Base URL used to build the image URLs (stored without trailing slash) */
private $wallabagUrl;
2024-02-08 20:13:52 +00:00
/**
 * @param HttpClientInterface $downloadImagesClient HTTP client used to fetch the images
 * @param string              $baseFolder           Folder under which the images are stored
 * @param string              $wallabagUrl          Base URL used to build image URLs (a trailing slash is stripped)
 * @param LoggerInterface     $logger
 */
public function __construct(HttpClientInterface $downloadImagesClient, $baseFolder, $wallabagUrl, LoggerInterface $logger)
{
    $this->logger = $logger;
    $this->mimeTypes = new MimeTypes();
    $this->client = $downloadImagesClient;
    $this->wallabagUrl = rtrim($wallabagUrl, '/');
    $this->baseFolder = $baseFolder;

    // make sure the storage root exists before anything is written into it
    $this->setFolder();
}
/**
 * Return the base folder that was given to the constructor.
 *
 * @return string
 */
public function getBaseFolder()
{
    return $this->baseFolder;
}
2016-10-30 09:48:29 +00:00
/**
 * Collect the URLs of every <img> found in the html, taken from both
 * the `src` and the `srcset` attributes, without duplicates.
 *
 * @param string $html
 *
 * @return string[]
 */
public static function extractImagesUrlsFromHtml($html)
{
    $images = (new Crawler($html))->filterXpath('//img');

    $srcUrls = $images->extract(['src']);
    $srcsetUrls = self::getSrcsetUrls($images);

    return array_unique(array_merge($srcUrls, $srcsetUrls));
}
/**
 * Download the images referenced by the html, save them locally and
 * return the html rewritten to point at the local copies.
 *
 * Images that can't be processed are left untouched in the html.
 *
 * @param int    $entryId ID of the entry
 * @param string $html
 * @param string $url     Used as a base path for relative image and folder
 *
 * @return string
 */
public function processHtml($entryId, $html, $url)
{
    $imagesUrls = self::extractImagesUrlsFromHtml($html);

    // reverse order, so that overlapping urls don't clobber each other when replaced
    arsort($imagesUrls);

    $relativePath = $this->getRelativePath($entryId);

    foreach ($imagesUrls as $image) {
        // download and save the image to the folder
        $newImage = $this->processSingleImage($entryId, $image, $url, $relativePath);

        if (false !== $newImage) {
            $html = str_replace($image, $newImage, $html);

            // an url with "&" may appear in the html with the ampersand encoded,
            // either as &amp; or as its numeric entity: replace these forms too
            if (false === stripos($image, '&') || false !== stripos($html, $image)) {
                continue;
            }

            foreach (['&amp;', '&#038;'] as $encodedAmpersand) {
                $encodedImage = str_replace('&', $encodedAmpersand, $image);
                $html = str_replace($encodedImage, $newImage, $html);
            }
        }
    }

    return $html;
}
2016-10-30 09:48:29 +00:00
/**
 * Process a single image:
 * - retrieve it
 * - re-save it (for security reason)
 * - return the url of the local copy.
 *
 * SVG files are sanitized instead of being re-encoded (GD doesn't support SVG).
 *
 * @param int         $entryId      ID of the entry
 * @param string|null $imagePath    Path to the image to retrieve
 * @param string      $url          Url from where the image was found
 * @param string|null $relativePath Relative local path where the image is saved (computed from the entry ID when null)
 *
 * @return string|false Url to access the image from the web, false if the image could not be processed
 */
public function processSingleImage($entryId, $imagePath, $url, $relativePath = null)
{
    if (null === $imagePath) {
        return false;
    }

    if (null === $relativePath) {
        $relativePath = $this->getRelativePath($entryId);
    }

    $this->logger->debug('DownloadImages: working on image: ' . $imagePath);

    $folderPath = $this->baseFolder . '/' . $relativePath;

    // build image path
    $absolutePath = $this->getAbsoluteLink($url, $imagePath);
    if (false === $absolutePath) {
        $this->logger->error('DownloadImages: Can not determine the absolute path for that image, skipping.');

        return false;
    }

    try {
        $res = $this->client->request(Request::METHOD_GET, $absolutePath);
        // The extension check is the first code that reads the response (status, headers, body).
        // A client that defers transfer errors to first access raises them here, so keep it in the try.
        $ext = $this->getExtensionFromResponse($res, $imagePath);
    } catch (\Exception $e) {
        $this->logger->error('DownloadImages: Can not retrieve image, skipping.', ['exception' => $e]);

        return false;
    }

    // the extension validation (not the response object) tells whether the image can be handled
    if (false === $ext) {
        return false;
    }

    $hashImage = hash('crc32', $absolutePath);
    $localPath = $folderPath . '/' . $hashImage . '.' . $ext;
    $urlPath = $this->wallabagUrl . '/assets/images/' . $relativePath . '/' . $hashImage . '.' . $ext;

    // custom case for SVG (because GD doesn't support SVG)
    if ('svg' === $ext) {
        try {
            $sanitizer = new Sanitizer();
            $sanitizer->minify(true);
            $sanitizer->removeRemoteReferences(true);
            $cleanSVG = $sanitizer->sanitize($res->getContent());

            // add an extra validation by checking about `<svg `
            if (false === $cleanSVG || !str_contains($cleanSVG, '<svg ')) {
                $this->logger->error('DownloadImages: Bad SVG given', ['path' => $imagePath]);

                return false;
            }

            file_put_contents($localPath, $cleanSVG);

            return $urlPath;
        } catch (\Exception $e) {
            $this->logger->error('DownloadImages: Error while sanitize SVG', ['path' => $imagePath, 'message' => $e->getMessage()]);

            return false;
        }
    }

    try {
        $im = imagecreatefromstring($res->getContent());
    } catch (\Exception $e) {
        $im = false;
    }

    if (false === $im) {
        $this->logger->error('DownloadImages: Error while regenerating image', ['path' => $localPath]);

        return false;
    }

    switch ($ext) {
        case 'gif':
            // use Imagick if available to keep GIF animation
            if (class_exists(\Imagick::class)) {
                try {
                    $imagick = new \Imagick();
                    $imagick->readImageBlob($res->getContent());
                    $imagick->setImageFormat('gif');
                    $imagick->writeImages($localPath, true);
                } catch (\Exception $e) {
                    // if Imagick fail, fallback to the default solution
                    imagegif($im, $localPath);
                }
            } else {
                imagegif($im, $localPath);
            }

            $this->logger->debug('DownloadImages: Re-creating gif');
            break;
        case 'jpeg':
        case 'jpg':
            imagejpeg($im, $localPath, self::REGENERATE_PICTURES_QUALITY);
            $this->logger->debug('DownloadImages: Re-creating jpg');
            break;
        case 'png':
            imagealphablending($im, false);
            imagesavealpha($im, true);
            // imagepng() expects an integer compression level, ceil() returns a float
            imagepng($im, $localPath, (int) ceil(self::REGENERATE_PICTURES_QUALITY / 100 * 9));
            $this->logger->debug('DownloadImages: Re-creating png');
            break;
        case 'webp':
            imagewebp($im, $localPath, self::REGENERATE_PICTURES_QUALITY);
            $this->logger->debug('DownloadImages: Re-creating webp');
            break;
    }

    imagedestroy($im);

    return $urlPath;
}
/**
 * Delete every (non dot) file stored for the given entry id, then try to
 * remove the entry folder itself. Deletion failures are silently ignored.
 *
 * @param int $entryId ID of the entry
 */
public function removeImages($entryId)
{
    $folderPath = $this->baseFolder . '/' . $this->getRelativePath($entryId);

    $files = (new Finder())
        ->files()
        ->ignoreDotFiles(true)
        ->in($folderPath);

    foreach ($files as $file) {
        @unlink($file->getRealPath());
    }

    @rmdir($folderPath);
}
/**
 * Build the folder (relative to the base folder) where the images of an
 * entry are saved. The path is derived from the crc32 hash of the entry id:
 * first char / second char / full hash.
 *
 * @param int  $entryId      ID of the entry
 * @param bool $createFolder Should we create the folder for the given id?
 *
 * @return string
 */
public function getRelativePath($entryId, $createFolder = true)
{
    $hashId = hash('crc32', $entryId);
    $relativePath = implode('/', [$hashId[0], $hashId[1], $hashId]);
    $folderPath = $this->baseFolder . '/' . $relativePath;

    if ($createFolder && !file_exists($folderPath)) {
        mkdir($folderPath, 0777, true);
    }

    $this->logger->debug('DownloadImages: Folder used for that Entry id', ['folder' => $folderPath, 'entryId' => $entryId]);

    return $relativePath;
}
/**
 * Get images urls from the srcset attribute of the given image nodes.
 *
 * @return array An array of urls
 */
private static function getSrcsetUrls(Crawler $imagesCrawler)
{
    // A candidate is:
    // - a run of chars that are neither quotes nor white space
    // - optional white space
    // - one or more digits followed by w OR x
    $pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/";
    $urls = [];

    foreach ($imagesCrawler->getIterator() as $node) {
        \assert($node instanceof \DOMElement);

        $srcsetAttribute = $node->getAttribute('srcset');
        if ('' === $srcsetAttribute) {
            continue;
        }

        preg_match_all($pattern, $srcsetAttribute, $matches);

        // keep only what precedes the first space of each candidate
        $srcsetUrls = array_map(
            static fn ($src) => trim(explode(' ', $src, 2)[0]),
            array_merge(...$matches)
        );

        // urls of the current image are placed in front of those already collected
        $urls = array_merge($srcsetUrls, $urls);
    }

    return $urls;
}
2017-07-01 07:52:38 +00:00
/**
 * Make sure the base folder, where all images are going to be saved, exists.
 */
private function setFolder()
{
    if (file_exists($this->baseFolder)) {
        return;
    }

    // the folder doesn't exist yet: attempt to create it (with its parents)
    mkdir($this->baseFolder, 0755, true);
}
2016-10-30 09:48:29 +00:00
/**
 * Make an $url absolute based on the $base.
 *
 * Urls already starting with http:// or https:// are returned untouched.
 *
 * @see Graby->makeAbsoluteStr
 *
 * @param string $base Base url
 * @param string $url  Url to make it absolute
 *
 * @return false|string The absolute url, false when the base has no scheme or host
 *                      or when one of the urls can't be parsed
 */
private function getAbsoluteLink($base, $url)
{
    if (preg_match('!^https?://!i', $url)) {
        // already absolute
        return $url;
    }

    try {
        $baseUri = new Uri($base);

        // in case the url has no scheme & host
        if ('' === $baseUri->getAuthority() || '' === $baseUri->getScheme()) {
            $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url]);

            return false;
        }

        return (string) UriResolver::resolve($baseUri, new Uri($url));
    } catch (\InvalidArgumentException $e) {
        // Uri refuses strings it can't parse: a single malformed url must only
        // skip that image, not abort the processing of the whole content
        $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url, 'exception' => $e]);

        return false;
    }
}
/**
 * Retrieve and validate the extension from the response of the url of the image.
 *
 * The extension is first guessed from the content-type header (ignoring any
 * parameter such as "; charset=utf-8"), then from the first bytes of the body
 * when the header gives nothing.
 *
 * @param ResponseInterface $res       Http Response
 * @param string            $imagePath Path from the src image from the content (used for log only)
 *
 * @return string|false Extension name or false if validation failed
 */
private function getExtensionFromResponse(ResponseInterface $res, $imagePath)
{
    if (200 !== $res->getStatusCode()) {
        return false;
    }

    $contentTypeHeader = $res->getHeaders()['content-type'] ?? [];
    $contentType = current($contentTypeHeader);

    // keep only the media type: "image/svg+xml; charset=utf-8" must be looked up as "image/svg+xml"
    $mimeType = \is_string($contentType) ? strtolower(trim(explode(';', $contentType, 2)[0])) : '';

    $ext = current($this->mimeTypes->getExtensions($mimeType));

    $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $contentTypeHeader]);

    // ok header doesn't have the extension, try a different way
    if (empty($ext)) {
        // leading bytes identifying each supported format
        $types = [
            'jpeg' => "\xFF\xD8\xFF",
            'gif' => 'GIF',
            'png' => "\x89\x50\x4e\x47\x0d\x0a",
            'webp' => "\x52\x49\x46\x46",
        ];

        $bytes = substr($res->getContent(), 0, 8);

        foreach ($types as $type => $header) {
            if (str_starts_with($bytes, $header)) {
                $ext = $type;
                break;
            }
        }

        $this->logger->debug('DownloadImages: Checking extension (alternative)', ['ext' => $ext]);
    }

    if (!\in_array($ext, ['jpeg', 'jpg', 'gif', 'png', 'webp', 'svg'], true)) {
        $this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping: ' . $imagePath);

        return false;
    }

    return $ext;
}
}