mirror of
https://github.com/wallabag/wallabag.git
synced 2024-12-17 21:26:27 +00:00
Merge pull request #3553 from wallabag/url-3529
Swap entry url with origin url if graby provides an updated one
This commit is contained in:
commit
a6e4e83809
2 changed files with 244 additions and 1 deletions
|
@ -66,6 +66,13 @@ class ContentProxy
|
||||||
// so we'll be able to refetch it in the future
|
// so we'll be able to refetch it in the future
|
||||||
$content['url'] = !empty($content['url']) ? $content['url'] : $url;
|
$content['url'] = !empty($content['url']) ? $content['url'] : $url;
|
||||||
|
|
||||||
|
// In one case (at least in tests), url is empty here
|
||||||
|
// so we set it using $url provided in the updateEntry call.
|
||||||
|
// Not sure what the other possible cases are where this property is empty
|
||||||
|
if (empty($entry->getUrl()) && !empty($url)) {
|
||||||
|
$entry->setUrl($url);
|
||||||
|
}
|
||||||
|
|
||||||
$this->stockEntry($entry, $content);
|
$this->stockEntry($entry, $content);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -239,7 +246,7 @@ class ContentProxy
|
||||||
*/
|
*/
|
||||||
private function stockEntry(Entry $entry, array $content)
|
private function stockEntry(Entry $entry, array $content)
|
||||||
{
|
{
|
||||||
$entry->setUrl($content['url']);
|
$this->updateOriginUrl($entry, $content['url']);
|
||||||
|
|
||||||
$this->setEntryDomainName($entry);
|
$this->setEntryDomainName($entry);
|
||||||
|
|
||||||
|
@ -305,6 +312,115 @@ class ContentProxy
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Update the url (and, when relevant, the origin_url) of an entry when the
 * url reported by the fetched content differs from the entry url, which
 * typically happens after a redirection.
 *
 * Rules applied, in order:
 *  - empty or identical new url: nothing is changed;
 *  - entry url matches the ignore list (see ignoreUrl()): the entry url is
 *    replaced and origin_url is left untouched;
 *  - only the path differs: the entry url is replaced only for a trailing
 *    slash difference or when the new url is the url-decoded entry url;
 *  - only the scheme differs: the entry url is replaced;
 *  - only the fragment differs: nothing is changed;
 *  - anything else: the previous entry url is stored in origin_url (if that
 *    field is still empty) and the entry url is replaced.
 *
 * @param Entry  $entry
 * @param string $url   url reported by the fetched content
 *
 * @return false|null false on the two early exits (nothing to compare, or
 *                    entry url on the ignore list), no value otherwise
 */
private function updateOriginUrl(Entry $entry, $url)
{
    $entry_url = $entry->getUrl();

    if (empty($url) || $entry_url === $url) {
        return false;
    }

    // Checked before any url parsing: ignored urls are always replaced,
    // whatever the differing parts are.
    if ($this->ignoreUrl($entry_url)) {
        $entry->setUrl($url);

        return false;
    }

    // parse_url() returns false on seriously malformed urls; fall back to
    // an empty component list so the array functions below stay valid.
    $parsed_entry_url = parse_url((string) $entry_url) ?: [];
    $parsed_content_url = parse_url((string) $url) ?: [];

    /**
     * The following part computes the list of parts that changed between
     * the two parse_url arrays.
     *
     * As array_diff_assoc only computes changes to go from the left array
     * to the right one, we build two different arrays to cover both
     * directions. We merge these two arrays and sort the keys before
     * passing the result to the switch.
     *
     * The resulting array gives us all changing parts between the two
     * urls: scheme, host, path, query and/or fragment.
     */
    $diff_keys = array_keys(array_merge(
        array_diff_assoc($parsed_entry_url, $parsed_content_url),
        array_diff_assoc($parsed_content_url, $parsed_entry_url)
    ));
    sort($diff_keys);

    /**
     * This switch case lets us apply different behaviors according to
     * changing parts of urls.
     *
     * As $diff_keys is an array, we provide arrays as cases. ['path'] means
     * 'only the path is different between the two urls' whereas
     * ['fragment', 'query'] means 'only fragment and query string parts are
     * different between the two urls'.
     *
     * Note that values in $diff_keys are sorted.
     */
    switch ($diff_keys) {
        case ['path']:
            // Either url may have no path component at all
            // (e.g. "http://example.com" vs "http://example.com/").
            $entry_path = isset($parsed_entry_url['path']) ? $parsed_entry_url['path'] : '';
            $content_path = isset($parsed_content_url['path']) ? $parsed_content_url['path'] : null;

            if (($entry_path . '/' === $content_path) // diff is trailing slash, we only replace the url of the entry
                || ($url === urldecode((string) $entry_url))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId
                $entry->setUrl($url);
            }
            break;
        case ['scheme']:
            $entry->setUrl($url);
            break;
        case ['fragment']:
            // noop
            break;
        default:
            // Keep track of where the entry came from, but never overwrite
            // an origin url that is already set.
            if (empty($entry->getOriginUrl())) {
                $entry->setOriginUrl($entry_url);
            }
            $entry->setUrl($url);
            break;
    }
}
|
||||||
|
|
||||||
|
/**
 * Check entry url against an ignore list to replace with content url.
 *
 * A url is ignored when its host is one of the ignored hosts (compared
 * case-insensitively, as host names are not case sensitive) or when it
 * matches one of the ignored patterns. Patterns are anchored at the start
 * of the url so that an ignored url merely embedded in another url (for
 * instance in a query string) does not match.
 *
 * Urls without a host, or urls that parse_url() cannot parse, can only
 * match through the patterns.
 *
 * XXX: move the ignore list in the database to let users handle it
 *
 * @param string $url url to test
 *
 * @return bool true if url matches ignore list otherwise false
 */
private function ignoreUrl($url)
{
    $ignored_hosts = ['feedproxy.google.com', 'feeds.reuters.com'];
    $ignored_patterns = ['^https?://www\.lemonde\.fr/tiny'];

    $url = (string) $url;

    // null when the url has no host, false when the url is malformed
    $host = parse_url($url, PHP_URL_HOST);

    if (is_string($host) && in_array(strtolower($host), $ignored_hosts, true)) {
        return true;
    }

    foreach ($ignored_patterns as $pattern) {
        if (1 === preg_match("`$pattern`i", $url)) {
            return true;
        }
    }

    return false;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Validate that the given content has at least a title, an html and a url.
|
* Validate that the given content has at least a title, an html and a url.
|
||||||
*
|
*
|
||||||
|
|
|
@ -739,6 +739,133 @@ class ContentProxyTest extends TestCase
|
||||||
$this->assertSame($expectedTitle, $this->strToHex($entry->getTitle()));
|
$this->assertSame($expectedTitle, $this->strToHex($entry->getTitle()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Data provider for testWithChangedUrl.
 *
 * Each named dataset lists, in this order:
 *  - $entry_url
 *  - $origin_url
 *  - $content_url
 *  - $expected_entry_url
 *  - $expected_origin_url
 *  - $expected_domain
 */
public function dataForChangedUrl()
{
    $datasets = [];

    $datasets['normal'] = [
        'http://0.0.0.0',
        null,
        'http://1.1.1.1',
        'http://1.1.1.1',
        'http://0.0.0.0',
        '1.1.1.1',
    ];

    $datasets['origin already set'] = [
        'http://0.0.0.0',
        'http://hello',
        'http://1.1.1.1',
        'http://1.1.1.1',
        'http://hello',
        '1.1.1.1',
    ];

    $datasets['trailing slash'] = [
        'https://example.com/hello-world',
        null,
        'https://example.com/hello-world/',
        'https://example.com/hello-world/',
        null,
        'example.com',
    ];

    $datasets['query string in fetched content'] = [
        'https://example.org/hello',
        null,
        'https://example.org/hello?world=1',
        'https://example.org/hello?world=1',
        'https://example.org/hello',
        'example.org',
    ];

    $datasets['fragment in fetched content'] = [
        'https://example.org/hello',
        null,
        'https://example.org/hello#world',
        'https://example.org/hello',
        null,
        'example.org',
    ];

    $datasets['fragment and query string in fetched content'] = [
        'https://example.org/hello',
        null,
        'https://example.org/hello?foo#world',
        'https://example.org/hello?foo#world',
        'https://example.org/hello',
        'example.org',
    ];

    $datasets['different path and query string in fetch content'] = [
        'https://example.org/hello',
        null,
        'https://example.org/world?foo',
        'https://example.org/world?foo',
        'https://example.org/hello',
        'example.org',
    ];

    $datasets['feedproxy ignore list test'] = [
        'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
        null,
        'https://example.org/hello-wallabag',
        'https://example.org/hello-wallabag',
        null,
        'example.org',
    ];

    $datasets['feedproxy ignore list test with origin url already set'] = [
        'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
        'https://example.org/this-is-source',
        'https://example.org/hello-wallabag',
        'https://example.org/hello-wallabag',
        'https://example.org/this-is-source',
        'example.org',
    ];

    $datasets['lemonde ignore pattern test'] = [
        'http://www.lemonde.fr/tiny/url',
        null,
        'http://example.com/hello-world',
        'http://example.com/hello-world',
        null,
        'example.com',
    ];

    return $datasets;
}
|
||||||
|
|
||||||
|
/**
 * Feeds updateEntry() with already-fetched content whose url may differ
 * from the entry url, then checks the resulting entry url, domain name
 * and origin url.
 *
 * @dataProvider dataForChangedUrl
 */
public function testWithChangedUrl($entry_url, $origin_url, $content_url, $expected_entry_url, $expected_origin_url, $expected_domain)
{
    // The tagger must be invoked exactly once per updated entry.
    $tagger = $this->getTaggerMock();
    $tagger->expects($this->once())->method('tag');

    $fetched_content = [
        'html' => false,
        'title' => '',
        'url' => $content_url,
        'content_type' => '',
        'language' => '',
    ];

    $proxy = new ContentProxy(
        new Graby(),
        $tagger,
        $this->getValidator(),
        $this->getLogger(),
        $this->fetchingErrorMessage,
        true
    );

    $entry = new Entry(new User());
    $entry->setOriginUrl($origin_url);

    $proxy->updateEntry($entry, $entry_url, $fetched_content, true);

    $this->assertSame($expected_entry_url, $entry->getUrl());
    $this->assertSame($expected_domain, $entry->getDomainName());
    $this->assertSame($expected_origin_url, $entry->getOriginUrl());
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* https://stackoverflow.com/a/18506801.
|
* https://stackoverflow.com/a/18506801.
|
||||||
*
|
*
|
||||||
|
|
Loading…
Reference in a new issue