From a9394f6d4fb8bd598bfb86c65c82a8ab00ae5548 Mon Sep 17 00:00:00 2001 From: Yassine Guedidi Date: Tue, 19 Nov 2024 23:30:28 +0100 Subject: [PATCH 1/4] Use DomCrawler in HtmlImport --- src/Import/HtmlImport.php | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/src/Import/HtmlImport.php b/src/Import/HtmlImport.php index e2f0600e5..909ff9bc8 100644 --- a/src/Import/HtmlImport.php +++ b/src/Import/HtmlImport.php @@ -2,6 +2,7 @@ namespace Wallabag\Import; +use Symfony\Component\DomCrawler\Crawler; use Wallabag\Entity\Entry; use Wallabag\Event\EntrySavedEvent; @@ -29,27 +30,23 @@ abstract class HtmlImport extends AbstractImport return false; } - $html = new \DOMDocument(); + $crawler = new Crawler(file_get_contents($this->filepath)); - libxml_use_internal_errors(true); - $html->loadHTMLFile($this->filepath); - $hrefs = $html->getElementsByTagName('a'); - libxml_use_internal_errors(false); + $hrefs = $crawler->filterXPath('//a'); - if (0 === $hrefs->length) { + if (0 === $hrefs->count()) { $this->logger->error('Wallabag HTML: no entries in imported file'); return false; } - $entries = []; - foreach ($hrefs as $href) { - $entry = []; - $entry['url'] = $href->getAttribute('href'); - $entry['tags'] = $href->getAttribute('tags'); - $entry['created_at'] = $href->getAttribute('add_date'); - $entries[] = $entry; - } + $entries = $hrefs->each(function (Crawler $node) { + return [ + 'url' => $node->attr('href'), + 'tags' => $node->attr('tags'), + 'created_at' => $node->attr('add_date'), + ]; + }); if ($this->producer) { $this->parseEntriesForProducer($entries); From c0d02153ab2b44ceb103ae35c0cc40799b7c31a4 Mon Sep 17 00:00:00 2001 From: Yassine Guedidi Date: Tue, 19 Nov 2024 23:30:37 +0100 Subject: [PATCH 2/4] Use DomCrawler in PocketHtmlImport --- src/Import/PocketHtmlImport.php | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/Import/PocketHtmlImport.php b/src/Import/PocketHtmlImport.php index d9ed7be21..b1c7e3edb 100644 --- a/src/Import/PocketHtmlImport.php +++ b/src/Import/PocketHtmlImport.php @@ -2,6 +2,8 @@ namespace Wallabag\Import; +use Symfony\Component\DomCrawler\Crawler; + class PocketHtmlImport extends HtmlImport { protected $filepath; @@ -44,27 +46,23 @@ class PocketHtmlImport extends HtmlImport return false; } - $html = new \DOMDocument(); + $crawler = new Crawler(file_get_contents($this->filepath)); - libxml_use_internal_errors(true); - $html->loadHTMLFile($this->filepath); - $hrefs = $html->getElementsByTagName('a'); - libxml_use_internal_errors(false); + $hrefs = $crawler->filterXPath('//a'); - if (0 === $hrefs->length) { + if (0 === $hrefs->count()) { $this->logger->error('Pocket HTML: no entries in imported file'); return false; } - $entries = []; - foreach ($hrefs as $href) { - $entry = []; - $entry['url'] = $href->getAttribute('href'); - $entry['tags'] = $href->getAttribute('tags'); - $entry['created_at'] = $href->getAttribute('time_added'); - $entries[] = $entry; - } + $entries = $hrefs->each(function (Crawler $node) { + return [ + 'url' => $node->attr('href'), + 'tags' => $node->attr('tags'), + 'created_at' => $node->attr('time_added'), + ]; + }); if ($this->producer) { $this->parseEntriesForProducer($entries); From 4e7b5c66ad7b627614975f9e61740f92ceb60ae4 Mon Sep 17 00:00:00 2001 From: Yassine Guedidi Date: Tue, 19 Nov 2024 23:30:49 +0100 Subject: [PATCH 3/4] Use DomCrawler in LoginFormAuthenticator --- .../Authenticator/LoginFormAuthenticator.php | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/SiteConfig/Authenticator/LoginFormAuthenticator.php b/src/SiteConfig/Authenticator/LoginFormAuthenticator.php index 68dded2d3..3292e79c6 100644 --- a/src/SiteConfig/Authenticator/LoginFormAuthenticator.php +++ b/src/SiteConfig/Authenticator/LoginFormAuthenticator.php @@ -4,6 +4,7 @@ namespace Wallabag\SiteConfig\Authenticator; use GuzzleHttp\ClientInterface; use GuzzleHttp\Cookie\CookieJar; +use Symfony\Component\DomCrawler\Crawler; use Symfony\Component\ExpressionLanguage\ExpressionLanguage; use Wallabag\ExpressionLanguage\AuthenticatorProvider; use Wallabag\SiteConfig\SiteConfig; @@ -54,22 +55,16 @@ class LoginFormAuthenticator implements Authenticator public function isLoginRequired($html) { - $useInternalErrors = libxml_use_internal_errors(true); - // need to check for the login dom element ($options['not_logged_in_xpath']) in the HTML - $doc = new \DOMDocument(); - $doc->loadHTML($html); + try { + $crawler = new Crawler((string) $html); - $xpath = new \DOMXPath($doc); - $loggedIn = $xpath->evaluate((string) $this->siteConfig->getNotLoggedInXpath()); - - if (false === $loggedIn) { + $loggedIn = $crawler->evaluate((string) $this->siteConfig->getNotLoggedInXpath()); + } catch (\Throwable $e) { return false; } - libxml_use_internal_errors($useInternalErrors); - - return $loggedIn->length > 0; + return \count($loggedIn) > 0; } /** From 9f2fb0c6fab71f545cfc9fd9829310d8ed3c30f6 Mon Sep 17 00:00:00 2001 From: Yassine Guedidi Date: Tue, 19 Nov 2024 23:30:18 +0100 Subject: [PATCH 4/4] Use DomCrawler in AuthenticatorProvider --- .../AuthenticatorProvider.php | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/src/ExpressionLanguage/AuthenticatorProvider.php b/src/ExpressionLanguage/AuthenticatorProvider.php index dadf6c6bd..f51bc8b6e 100644 --- a/src/ExpressionLanguage/AuthenticatorProvider.php +++ b/src/ExpressionLanguage/AuthenticatorProvider.php @@ -3,6 +3,7 @@ namespace Wallabag\ExpressionLanguage; use GuzzleHttp\ClientInterface; +use Symfony\Component\DomCrawler\Crawler; use Symfony\Component\ExpressionLanguage\ExpressionFunction; use Symfony\Component\ExpressionLanguage\ExpressionFunctionProviderInterface; @@ -69,27 +70,19 @@ class AuthenticatorProvider implements ExpressionFunctionProviderInterface throw new \Exception('Not supported'); }, function (array $arguments, $xpathQuery, $html) { - $useInternalErrors = libxml_use_internal_errors(true); + try { + $crawler = new Crawler((string) $html); - $doc = new \DOMDocument(); - $doc->loadHTML((string) $html, \LIBXML_NOCDATA | \LIBXML_NOWARNING | \LIBXML_NOERROR); - - $xpath = new \DOMXPath($doc); - $domNodeList = $xpath->query($xpathQuery); - - if (0 === $domNodeList->length) { + $crawler = $crawler->filterXPath($xpathQuery); + } catch (\Throwable $e) { return ''; } - $domNode = $domNodeList->item(0); - - libxml_use_internal_errors($useInternalErrors); - - if (null === $domNode || null === $domNode->attributes) { + if (0 === $crawler->count()) { return ''; } - return $domNode->attributes->getNamedItem('value')->nodeValue; + return (string) $crawler->first()->attr('value'); } ); }