mirror of
https://github.com/wallabag/wallabag.git
synced 2024-12-22 15:46:30 +00:00
ignoreOriginUrl: add initial support of ignore lists
Add the ability to specify lists of hosts and patterns for which the given entry url is ignored and replaced with the fetched content url, without touching origin_url. This initial support should be reworked in the following months to move the hardcoded ignore lists into the database. Signed-off-by: Kevin Decherf <kevin@kdecherf.com>
This commit is contained in:
parent
fc040c749d
commit
b49c87acf1
2 changed files with 92 additions and 21 deletions
|
@ -332,31 +332,70 @@ class ContentProxy
|
|||
$diff_keys = array_keys($diff);
|
||||
sort($diff_keys);
|
||||
|
||||
switch ($diff_keys) {
|
||||
case ['path']:
|
||||
if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry
|
||||
|| ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId
|
||||
if ($this->ignoreUrl($entry->getUrl())) {
|
||||
$entry->setUrl($url);
|
||||
} else {
|
||||
switch ($diff_keys) {
|
||||
case ['path']:
|
||||
if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry
|
||||
|| ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId
|
||||
$entry->setUrl($url);
|
||||
}
|
||||
break;
|
||||
case ['scheme']:
|
||||
$entry->setUrl($url);
|
||||
}
|
||||
break;
|
||||
case ['scheme']:
|
||||
$entry->setUrl($url);
|
||||
break;
|
||||
case ['fragment']:
|
||||
case ['query']:
|
||||
case ['fragment', 'query']:
|
||||
// noop
|
||||
break;
|
||||
default:
|
||||
if (empty($entry->getOriginUrl())) {
|
||||
$entry->setOriginUrl($entry->getUrl());
|
||||
}
|
||||
$entry->setUrl($url);
|
||||
break;
|
||||
break;
|
||||
case ['fragment']:
|
||||
case ['query']:
|
||||
case ['fragment', 'query']:
|
||||
// noop
|
||||
break;
|
||||
default:
|
||||
if (empty($entry->getOriginUrl())) {
|
||||
$entry->setOriginUrl($entry->getUrl());
|
||||
}
|
||||
$entry->setUrl($url);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Check entry url against an ignore list to replace with content url.
 *
 * A url is ignored when its host is one of the hardcoded ignored hosts
 * (compared case-insensitively, as host names are not case-sensitive) or
 * when the whole url matches one of the hardcoded ignored patterns.
 *
 * XXX: move the ignore list into the database to let users handle it
 *
 * @param string $url url to test
 *
 * @return bool true if url matches ignore list otherwise false
 */
private function ignoreUrl($url)
{
    $ignored_hosts = ['feedproxy.google.com', 'feeds.reuters.com'];
    $ignored_patterns = ['https?://www\.lemonde\.fr/tiny.*'];

    // parse_url() yields false for seriously malformed urls and null when
    // the url has no host component (e.g. a relative url); only a real
    // string host can be looked up in the ignored hosts list.
    $host = parse_url($url, PHP_URL_HOST);

    if (is_string($host) && in_array(strtolower($host), $ignored_hosts, true)) {
        return true;
    }

    // Patterns are matched case-insensitively against the full url; stop at
    // the first hit instead of evaluating every pattern.
    foreach ($ignored_patterns as $pattern) {
        if (1 === preg_match("`$pattern`i", $url)) {
            return true;
        }
    }

    return false;
}
|
||||
|
||||
/**
|
||||
* Validate that the given content has at least a title, an html and a url.
|
||||
*
|
||||
|
|
|
@ -808,7 +808,39 @@ class ContentProxyTest extends TestCase
|
|||
'https://example.org/hello',
|
||||
null,
|
||||
'example.org',
|
||||
]
|
||||
],
|
||||
'different path and query string in fetch content' => [
|
||||
'https://example.org/hello',
|
||||
null,
|
||||
'https://example.org/world?foo',
|
||||
'https://example.org/world?foo',
|
||||
'https://example.org/hello',
|
||||
'example.org',
|
||||
],
|
||||
'feedproxy ignore list test' => [
|
||||
'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
|
||||
null,
|
||||
'https://example.org/hello-wallabag',
|
||||
'https://example.org/hello-wallabag',
|
||||
null,
|
||||
'example.org',
|
||||
],
|
||||
'feedproxy ignore list test with origin url already set' => [
|
||||
'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
|
||||
'https://example.org/this-is-source',
|
||||
'https://example.org/hello-wallabag',
|
||||
'https://example.org/hello-wallabag',
|
||||
'https://example.org/this-is-source',
|
||||
'example.org',
|
||||
],
|
||||
'lemonde ignore pattern test' => [
|
||||
'http://www.lemonde.fr/tiny/url',
|
||||
null,
|
||||
'http://example.com/hello-world',
|
||||
'http://example.com/hello-world',
|
||||
null,
|
||||
'example.com',
|
||||
],
|
||||
];
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue