ignoreOriginUrl: add initial support of ignore lists

Add the ability to specify lists of hosts and patterns for which the given
entry url is ignored and replaced with the fetched content url, without
touching origin_url.

This initial support should be reworked in the following months to move
the hardcoded ignore lists into the database.

Signed-off-by: Kevin Decherf <kevin@kdecherf.com>
This commit is contained in:
Kevin Decherf 2018-10-22 23:39:31 +02:00
parent fc040c749d
commit b49c87acf1
2 changed files with 92 additions and 21 deletions

View file

@ -332,31 +332,70 @@ class ContentProxy
$diff_keys = array_keys($diff);
sort($diff_keys);
switch ($diff_keys) {
case ['path']:
if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry
|| ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId
if ($this->ignoreUrl($entry->getUrl())) {
$entry->setUrl($url);
} else {
switch ($diff_keys) {
case ['path']:
if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry
|| ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId
$entry->setUrl($url);
}
break;
case ['scheme']:
$entry->setUrl($url);
}
break;
case ['scheme']:
$entry->setUrl($url);
break;
case ['fragment']:
case ['query']:
case ['fragment', 'query']:
// noop
break;
default:
if (empty($entry->getOriginUrl())) {
$entry->setOriginUrl($entry->getUrl());
}
$entry->setUrl($url);
break;
break;
case ['fragment']:
case ['query']:
case ['fragment', 'query']:
// noop
break;
default:
if (empty($entry->getOriginUrl())) {
$entry->setOriginUrl($entry->getUrl());
}
$entry->setUrl($url);
break;
}
}
}
}
/**
 * Check entry url against an ignore list to replace with content url.
 *
 * A url is considered ignored when either:
 *  - its host is exactly one of the hardcoded ignored hosts (compared case-insensitively), or
 *  - the url matches one of the hardcoded ignored patterns (case-insensitive regular expressions).
 *
 * Urls without a parseable host (relative or malformed urls) can only match through the patterns.
 *
 * XXX: move the ignore list into the database to let users handle it
 *
 * @param string $url url to test
 *
 * @return bool true if url matches ignore list otherwise false
 */
private function ignoreUrl($url)
{
    $ignored_hosts = ['feedproxy.google.com', 'feeds.reuters.com'];
    $ignored_patterns = ['https?://www\.lemonde\.fr/tiny.*'];

    // parse_url() yields null when the url has no host and false when the url is
    // seriously malformed; only a string host can be compared against the list.
    $host = parse_url($url, PHP_URL_HOST);

    // Hostnames are case-insensitive, so normalise before the strict lookup.
    if (\is_string($host) && \in_array(strtolower($host), $ignored_hosts, true)) {
        return true;
    }

    foreach ($ignored_patterns as $pattern) {
        // Backticks are used as regex delimiters so that the slashes of the url
        // patterns do not need escaping; preg_match() returns 1 only on a match.
        if (1 === preg_match("`$pattern`i", $url)) {
            return true;
        }
    }

    return false;
}
/**
* Validate that the given content has at least a title, an html and a url.
*

View file

@ -808,7 +808,39 @@ class ContentProxyTest extends TestCase
'https://example.org/hello',
null,
'example.org',
]
],
'different path and query string in fetch content' => [
'https://example.org/hello',
null,
'https://example.org/world?foo',
'https://example.org/world?foo',
'https://example.org/hello',
'example.org',
],
'feedproxy ignore list test' => [
'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
null,
'https://example.org/hello-wallabag',
'https://example.org/hello-wallabag',
null,
'example.org',
],
'feedproxy ignore list test with origin url already set' => [
'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
'https://example.org/this-is-source',
'https://example.org/hello-wallabag',
'https://example.org/hello-wallabag',
'https://example.org/this-is-source',
'example.org',
],
'lemonde ignore pattern test' => [
'http://www.lemonde.fr/tiny/url',
null,
'http://example.com/hello-world',
'http://example.com/hello-world',
null,
'example.com',
],
];
}