From b49c87acf12f22e38db751fb35be5da2436abc45 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Mon, 22 Oct 2018 23:39:31 +0200 Subject: [PATCH] ignoreOriginUrl: add initial support of ignore lists Add the ability to specify lists of hosts and patterns to ignore the given entry url and replace it with the fetched content url without touching origin_url. This initial support should be reworked in the following months to move the hardcoded ignore lists into the database. Signed-off-by: Kevin Decherf --- .../CoreBundle/Helper/ContentProxy.php | 79 ++++++++++++++----- .../CoreBundle/Helper/ContentProxyTest.php | 34 +++++++- 2 files changed, 92 insertions(+), 21 deletions(-) diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 1a2a330ff..2dc436f8e 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -332,31 +332,70 @@ class ContentProxy $diff_keys = array_keys($diff); sort($diff_keys); - switch ($diff_keys) { - case ['path']: - if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry - || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId + if ($this->ignoreUrl($entry->getUrl())) { + $entry->setUrl($url); + } else { + switch ($diff_keys) { + case ['path']: + if (($parsed_entry_url['path'] . 
'/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry + || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId + $entry->setUrl($url); + } + break; + case ['scheme']: $entry->setUrl($url); - } - break; - case ['scheme']: - $entry->setUrl($url); - break; - case ['fragment']: - case ['query']: - case ['fragment', 'query']: - // noop - break; - default: - if (empty($entry->getOriginUrl())) { - $entry->setOriginUrl($entry->getUrl()); - } - $entry->setUrl($url); - break; + break; + case ['fragment']: + case ['query']: + case ['fragment', 'query']: + // noop + break; + default: + if (empty($entry->getOriginUrl())) { + $entry->setOriginUrl($entry->getUrl()); + } + $entry->setUrl($url); + break; + } } } } + /** + * Check entry url against an ignore list to replace with content url. + * + * XXX: move the ignore list in the database to let users handle it + * + * @param string $url url to test + * + * @return bool true if url matches ignore list otherwise false + */ + private function ignoreUrl($url) + { + $ignored_hosts = ['feedproxy.google.com', 'feeds.reuters.com']; + $ignored_patterns = ['https?://www\.lemonde\.fr/tiny.*']; + + $parsed_url = parse_url($url); + + $filtered = array_filter($ignored_hosts, function ($var) use ($parsed_url) { + return $var === $parsed_url['host']; + }); + + if ([] !== $filtered) { + return true; + } + + $filtered = array_filter($ignored_patterns, function ($var) use ($url) { + return preg_match("`$var`i", $url); + }); + + if ([] !== $filtered) { + return true; + } + + return false; + } + /** * Validate that the given content has at least a title, an html and a url. 
* diff --git a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php index 3debc4577..a60aec5b4 100644 --- a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php +++ b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php @@ -808,7 +808,39 @@ class ContentProxyTest extends TestCase 'https://example.org/hello', null, 'example.org', - ] + ], + 'different path and query string in fetch content' => [ + 'https://example.org/hello', + null, + 'https://example.org/world?foo', + 'https://example.org/world?foo', + 'https://example.org/hello', + 'example.org', + ], + 'feedproxy ignore list test' => [ + 'http://feedproxy.google.com/~r/Wallabag/~3/helloworld', + null, + 'https://example.org/hello-wallabag', + 'https://example.org/hello-wallabag', + null, + 'example.org', + ], + 'feedproxy ignore list test with origin url already set' => [ + 'http://feedproxy.google.com/~r/Wallabag/~3/helloworld', + 'https://example.org/this-is-source', + 'https://example.org/hello-wallabag', + 'https://example.org/hello-wallabag', + 'https://example.org/this-is-source', + 'example.org', + ], + 'lemonde ignore pattern test' => [ + 'http://www.lemonde.fr/tiny/url', + null, + 'http://example.com/hello-world', + 'http://example.com/hello-world', + null, + 'example.com', + ], ]; }