<?php

namespace Tests\Wallabag\Helper;

use Graby\Graby;
use Monolog\Handler\TestHandler;
use Monolog\Logger;
use PHPUnit\Framework\TestCase;
use Psr\Log\NullLogger;
use Symfony\Component\Validator\ConstraintViolation;
use Symfony\Component\Validator\ConstraintViolationList;
use Symfony\Component\Validator\Validator\RecursiveValidator;
use Wallabag\Entity\Entry;
use Wallabag\Entity\User;
use Wallabag\Helper\ContentProxy;
use Wallabag\Helper\RuleBasedIgnoreOriginProcessor;
use Wallabag\Helper\RuleBasedTagger;

/**
 * Tests for ContentProxy::updateEntry().
 *
 * Each test builds a ContentProxy from mocked collaborators (tagger, ignore-origin
 * processor, validator and, when content is "fetched", Graby), runs updateEntry()
 * on a fresh Entry and asserts on the resulting Entry fields.
 */
class ContentProxyTest extends TestCase
{
    /** Error message handed to ContentProxy; expected as entry content when nothing could be fetched. */
    private $fetchingErrorMessage = 'wallabag can\'t retrieve contents for this article. Please <a href="http://doc.wallabag.org/en/user/errors_during_fetching.html#how-can-i-help-to-fix-that">troubleshoot this issue</a>.';

    public function testWithBadUrl(): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();

        $graby = $this->getGrabyMock([
            'html' => false,
            'title' => '',
            'url' => '',
            'headers' => [
                'content-type' => '',
            ],
            'language' => '',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://user@:80');

        $this->assertSame('http://user@:80', $entry->getUrl());
        $this->assertEmpty($entry->getTitle());
        $this->assertSame($this->fetchingErrorMessage, $entry->getContent());
        $this->assertEmpty($entry->getPreviewPicture());
        $this->assertEmpty($entry->getMimetype());
        $this->assertEmpty($entry->getLanguage());
        $this->assertSame(0.0, $entry->getReadingTime());
        $this->assertNull($entry->getDomainName());
        $this->assertTrue($entry->isNotParsed());
    }

    public function testWithEmptyContent(): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();

        $graby = $this->getGrabyMock([
            'html' => false,
            'title' => '',
            'url' => '',
            'headers' => [
                'content-type' => '',
            ],
            'language' => '',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://0.0.0.0', $entry->getUrl());
        $this->assertEmpty($entry->getTitle());
        $this->assertSame($this->fetchingErrorMessage, $entry->getContent());
        $this->assertEmpty($entry->getPreviewPicture());
        $this->assertEmpty($entry->getMimetype());
        $this->assertEmpty($entry->getLanguage());
        $this->assertSame(0.0, $entry->getReadingTime());
        $this->assertSame('0.0.0.0', $entry->getDomainName());
        $this->assertTrue($entry->isNotParsed());
    }

    public function testWithEmptyContentButOG(): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();

        $graby = $this->getGrabyMock([
            'html' => false,
            'title' => 'my title',
            'url' => '',
            'headers' => [
                'content-type' => '',
            ],
            'language' => '',
            'status' => '',
            'description' => 'desc',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://domain.io');

        $this->assertSame('http://domain.io', $entry->getUrl());
        $this->assertSame('my title', $entry->getTitle());
        $this->assertSame($this->fetchingErrorMessage . '<p><i>But we found a short description: </i></p>desc', $entry->getContent());
        $this->assertEmpty($entry->getPreviewPicture());
        $this->assertEmpty($entry->getLanguage());
        $this->assertEmpty($entry->getHttpStatus());
        $this->assertEmpty($entry->getMimetype());
        $this->assertSame(0.0, $entry->getReadingTime());
        $this->assertSame('domain.io', $entry->getDomainName());
        $this->assertTrue($entry->isNotParsed());
    }

    public function testWithContent(): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();
        $ruleBasedIgnoreOriginProcessor->expects($this->once())
            ->method('process');

        $graby = $this->getGrabyMock([
            'html' => str_repeat('this is my content', 325),
            'title' => 'this is my title',
            'url' => 'http://1.1.1.1',
            'language' => 'fr',
            'status' => '200',
            'description' => 'OG desc',
            'image' => 'http://3.3.3.3/cover.jpg',
            'headers' => [
                'content-type' => 'text/html',
            ],
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertStringContainsString('content', $entry->getContent());
        $this->assertSame('http://3.3.3.3/cover.jpg', $entry->getPreviewPicture());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame(4.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
        $this->assertFalse($entry->isNotParsed());
    }

    public function testWithContentAndNoOgImage(): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();
        $ruleBasedIgnoreOriginProcessor->expects($this->once())
            ->method('process');

        $graby = $this->getGrabyMock([
            'html' => str_repeat('this is my content', 325),
            'title' => 'this is my title',
            'url' => 'http://1.1.1.1',
            'language' => 'fr',
            'status' => '200',
            'description' => 'OG desc',
            'image' => null,
            'headers' => [
                'content-type' => 'text/html',
            ],
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertStringContainsString('content', $entry->getContent());
        $this->assertNull($entry->getPreviewPicture());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame(4.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
        $this->assertFalse($entry->isNotParsed());
    }

    public function testWithContentAndContentImage(): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();
        $ruleBasedIgnoreOriginProcessor->expects($this->once())
            ->method('process');

        $graby = $this->getGrabyMock([
            'html' => "<h1>Test</h1><p><img src='http://3.3.3.3/cover.jpg'/></p>",
            'title' => 'this is my title',
            'url' => 'http://1.1.1.1',
            'headers' => [
                'content-type' => 'text/html',
            ],
            'language' => 'fr',
            'status' => '200',
            'image' => null,
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertSame("<h1>Test</h1><p><img src='http://3.3.3.3/cover.jpg'/></p>", $entry->getContent());
        $this->assertSame('http://3.3.3.3/cover.jpg', $entry->getPreviewPicture());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame(0.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
        $this->assertFalse($entry->isNotParsed());
    }

    public function testWithContentImageAndOgImage(): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();
        $ruleBasedIgnoreOriginProcessor->expects($this->once())
            ->method('process');

        $graby = $this->getGrabyMock([
            'html' => "<h1>Test</h1><p><img src='http://3.3.3.3/nevermind.jpg'/></p>",
            'title' => 'this is my title',
            'url' => 'http://1.1.1.1',
            'headers' => [
                'content-type' => 'text/html',
            ],
            'language' => 'fr',
            'status' => '200',
            'image' => 'http://3.3.3.3/cover.jpg',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertSame("<h1>Test</h1><p><img src='http://3.3.3.3/nevermind.jpg'/></p>", $entry->getContent());
        $this->assertSame('http://3.3.3.3/cover.jpg', $entry->getPreviewPicture());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame(0.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
        $this->assertFalse($entry->isNotParsed());
    }

    public function testWithContentAndBadLanguage(): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();
        $ruleBasedIgnoreOriginProcessor->expects($this->once())
            ->method('process');

        // The single validate() call reports a violation on the language value.
        $validator = $this->getValidator(false);
        $validator->expects($this->once())
            ->method('validate')
            ->willReturn(new ConstraintViolationList([new ConstraintViolation('oops', 'oops', [], 'oops', 'language', 'dontexist')]));

        $graby = $this->getGrabyMock([
            'html' => str_repeat('this is my content', 325),
            'title' => 'this is my title',
            'url' => 'http://1.1.1.1',
            'language' => 'dontexist',
            'status' => '200',
            'headers' => [
                'content-type' => 'text/html',
            ],
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $validator, $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertStringContainsString('content', $entry->getContent());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertNull($entry->getLanguage());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame(4.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
        $this->assertFalse($entry->isNotParsed());
    }

    public function testWithContentAndBadOgImage(): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();
        $ruleBasedIgnoreOriginProcessor->expects($this->once())
            ->method('process');

        // First validate() call passes, second one reports a violation on the image URL.
        $validator = $this->getValidator(false);
        $validator->expects($this->exactly(2))
            ->method('validate')
            ->will($this->onConsecutiveCalls(
                new ConstraintViolationList(),
                new ConstraintViolationList([new ConstraintViolation('oops', 'oops', [], 'oops', 'url', 'https://')])
            ));

        $graby = $this->getGrabyMock([
            'html' => str_repeat('this is my content', 325),
            'title' => 'this is my title',
            'url' => 'http://1.1.1.1',
            'headers' => [
                'content-type' => 'text/html',
            ],
            'language' => 'fr',
            'status' => '200',
            'description' => 'OG desc',
            'image' => 'https://',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $validator, $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertStringContainsString('content', $entry->getContent());
        $this->assertNull($entry->getPreviewPicture());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame(4.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
        $this->assertFalse($entry->isNotParsed());
    }

    public function testWithForcedContent(): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();
        $ruleBasedIgnoreOriginProcessor->expects($this->once())
            ->method('process');

        $proxy = new ContentProxy(new Graby(), $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage, true);
        $entry = new Entry(new User());
        $proxy->updateEntry(
            $entry,
            'http://0.0.0.0',
            [
                'html' => str_repeat('this is my content', 325),
                'title' => 'this is my title',
                'url' => 'http://1.1.1.1',
                'language' => 'fr',
                'date' => '1395635872',
                'authors' => ['Jeremy', 'Nico', 'Thomas'],
                'headers' => [
                    'cache-control' => 'no-cache',
                    'content-type' => 'text/html',
                ],
            ]
        );

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertStringContainsString('content', $entry->getContent());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame(4.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
        $this->assertSame('24/03/2014', $entry->getPublishedAt()->format('d/m/Y'));
        $this->assertContains('Jeremy', $entry->getPublishedBy());
        $this->assertContains('Nico', $entry->getPublishedBy());
        $this->assertContains('Thomas', $entry->getPublishedBy());
        $this->assertNotNull($entry->getHeaders(), 'Headers are stored, so value is not null');
        $this->assertContains('no-cache', $entry->getHeaders());
        $this->assertFalse($entry->isNotParsed());
    }

    public function testWithForcedContentAndDateTime(): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();

        $logHandler = new TestHandler();
        $logger = new Logger('test', [$logHandler]);

        $proxy = new ContentProxy(new Graby(), $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $logger, $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry(
            $entry,
            'http://1.1.1.1',
            [
                'html' => str_repeat('this is my content', 325),
                'title' => 'this is my title',
                'url' => 'http://1.1.1.1',
                'language' => 'fr',
                'date' => '2016-09-08T11:55:58+0200',
                'headers' => [
                    'content-type' => 'text/html',
                ],
            ]
        );

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertStringContainsString('content', $entry->getContent());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame(4.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
        $this->assertSame('08/09/2016', $entry->getPublishedAt()->format('d/m/Y'));
        $this->assertFalse($entry->isNotParsed());
    }

    public function testWithForcedContentAndBadDate(): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();

        // A real logger with a TestHandler so the emitted records can be inspected below.
        $logger = new Logger('foo');
        $handler = new TestHandler();
        $logger->pushHandler($handler);

        $proxy = new ContentProxy(new Graby(), $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $logger, $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry(
            $entry,
            'http://1.1.1.1',
            [
                'html' => str_repeat('this is my content', 325),
                'title' => 'this is my title',
                'url' => 'http://1.1.1.1',
                'language' => 'fr',
                'date' => '01 02 2012',
                'headers' => [
                    'content-type' => 'text/html',
                ],
            ]
        );

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertStringContainsString('content', $entry->getContent());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame(4.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
        $this->assertNull($entry->getPublishedAt());
        $this->assertFalse($entry->isNotParsed());

        $records = $handler->getRecords();

        $this->assertCount(3, $records);
        $this->assertStringContainsString('Error while defining date', $records[0]['message']);
    }

    public function testTaggerThrowException(): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag')
            ->will($this->throwException(new \Exception()));

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();

        $proxy = new ContentProxy(new Graby(), $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry(
            $entry,
            'http://1.1.1.1',
            [
                'html' => str_repeat('this is my content', 325),
                'title' => 'this is my title',
                'url' => 'http://1.1.1.1',
                'language' => 'fr',
                'headers' => [
                    'content-type' => 'text/html',
                ],
            ]
        );

        $this->assertCount(0, $entry->getTags());
    }

    /**
     * Data provider for testWithCrazyHtmlContent.
     *
     * Each dataset holds the HTML handed to updateEntry() and a string that
     * must no longer appear in the stored content.
     */
    public static function dataForCrazyHtml(): array
    {
        return [
            'script and comment' => [
                '<strong>Script inside:</strong> <!--[if gte IE 4]><script>alert(\'lol\');</script><![endif]--><br />',
                'lol',
            ],
            'script' => [
                '<strong>Script inside:</strong><script>alert(\'lol\');</script>',
                'script',
            ],
        ];
    }

    /**
     * @dataProvider dataForCrazyHtml
     */
    public function testWithCrazyHtmlContent($html, $escapedString): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();

        $proxy = new ContentProxy(new Graby(), $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry(
            $entry,
            'http://1.1.1.1',
            [
                'html' => $html,
                'title' => 'this is my title',
                'url' => 'http://1.1.1.1',
                'language' => 'fr',
                'status' => '200',
                'description' => 'OG desc',
                'image' => 'http://3.3.3.3/cover.jpg',
                'headers' => [
                    'content-type' => 'text/html',
                ],
            ]
        );

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertStringNotContainsString($escapedString, $entry->getContent());
        $this->assertSame('http://3.3.3.3/cover.jpg', $entry->getPreviewPicture());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
        $this->assertFalse($entry->isNotParsed());
    }

    public function testWithImageAsContent(): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();

        $graby = $this->getGrabyMock([
            'html' => '<p><img src="http://1.1.1.1/image.jpg" /></p>',
            'title' => 'this is my title',
            'url' => 'http://1.1.1.1/image.jpg',
            'status' => '200',
            'headers' => [
                'content-type' => 'image/jpeg',
            ],
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://1.1.1.1/image.jpg', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertStringContainsString('http://1.1.1.1/image.jpg', $entry->getContent());
        $this->assertSame('http://1.1.1.1/image.jpg', $entry->getPreviewPicture());
        $this->assertSame('image/jpeg', $entry->getMimetype());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
        $this->assertFalse($entry->isNotParsed());
    }

    public function testWebsiteWithValidUTF8TitleDoNothing(): void
    {
        // You can use https://www.online-toolz.com/tools/text-hex-convertor.php to convert UTF-8 text <=> hex
        // See http://graphemica.com for more info about the characters
        // '😻ℤz' (U+1F63B or F09F98BB; U+2124 or E284A4; U+007A or 7A) in hexadecimal and UTF-8
        $actualTitle = $this->hexToStr('F09F98BB' . 'E284A4' . '7A');

        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();

        $graby = $this->getGrabyMock([
            'html' => false,
            'title' => $actualTitle,
            'url' => '',
            'headers' => [
                'content-type' => 'text/html',
            ],
            'language' => '',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        // '😻ℤz' (U+1F63B or F09F98BB; U+2124 or E284A4; U+007A or 7A) in hexadecimal and UTF-8
        $expectedTitle = 'F09F98BB' . 'E284A4' . '7A';
        $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle()));
    }

    public function testWebsiteWithInvalidUTF8TitleRemoveInvalidCharacter(): void
    {
        // See http://graphemica.com for more info about the characters
        // 'a€b' (61;80;62) in hexadecimal and WINDOWS-1252 - but 80 is an invalid UTF-8 character.
        // The correct UTF-8 € character (U+20AC) is E282AC
        $actualTitle = $this->hexToStr('61' . '80' . '62');

        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();

        $graby = $this->getGrabyMock([
            'html' => false,
            'title' => $actualTitle,
            'url' => '',
            'headers' => [
                'content-type' => 'text/html',
            ],
            'language' => '',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        // 'ab' (61;62) because all invalid UTF-8 characters (like 80) are removed
        $expectedTitle = '61' . '62';
        $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle()));
    }

    public function testPdfWithUTF16BETitleConvertToUTF8(): void
    {
        // See http://graphemica.com for more info about the characters
        // '😻' (U+1F63B;D83DDE3B) in hexadecimal and as UTF16BE
        $actualTitle = $this->hexToStr('D83DDE3B');

        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();

        $graby = $this->getGrabyMock([
            'html' => false,
            'title' => $actualTitle,
            'url' => '',
            'headers' => [
                'content-type' => 'application/pdf',
            ],
            'language' => '',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        // '😻' (U+1F63B or F09F98BB) in hexadecimal and UTF-8
        $expectedTitle = 'F09F98BB';
        $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle()));
    }

    public function testPdfWithUTF8TitleDoNothing(): void
    {
        // See http://graphemica.com for more info about the characters
        // '😻' (U+1F63B or F09F98BB) in hexadecimal and UTF-8
        $actualTitle = $this->hexToStr('F09F98BB');

        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();

        $graby = $this->getGrabyMock([
            'html' => false,
            'title' => $actualTitle,
            'url' => '',
            'headers' => [
                'content-type' => 'application/pdf',
            ],
            'language' => '',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        // '😻' (U+1F63B or F09F98BB) in hexadecimal and UTF-8
        $expectedTitle = 'F09F98BB';
        $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle()));
    }

    public function testPdfWithWINDOWS1252TitleConvertToUTF8(): void
    {
        // See http://graphemica.com for more info about the characters
        // '€' (80) in hexadecimal and WINDOWS-1252
        $actualTitle = $this->hexToStr('80');

        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();

        $graby = $this->getGrabyMock([
            'html' => false,
            'title' => $actualTitle,
            'url' => '',
            'headers' => [
                'content-type' => 'application/pdf',
            ],
            'language' => '',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        // '€' (U+20AC or E282AC) in hexadecimal and UTF-8
        $expectedTitle = 'E282AC';
        $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle()));
    }

    public function testPdfWithInvalidCharacterInTitleRemoveInvalidCharacter(): void
    {
        /*
         * I spent too much time trying to solve the problem of this test.
         * Starting with PHP 8.1 this test fails because the string with the invalid character is detected as WINDOWS-1252 and then converted.
         * In PHP < 8.1, the string encoding can't be detected and nothing is then converted.
         * So the removal of the invalid char happens in `sanitizeUTF8Text`
         *
         * So, I don't understand why the string with the invalid char is detected as WINDOWS-1252 in PHP 8.1 and not before.
         */
        $this->markTestSkipped('Encoding issue in PHP >= 8.1');

        // See http://graphemica.com for more info about the characters
        // '😻ℤ�z' (U+1F63B or F09F98BB; U+2124 or E284A4; invalid character 81; U+007A or 7A) in hexadecimal and UTF-8
        // 0x81 is not a valid character for UTF16, UTF8 and WINDOWS-1252
        $actualTitle = $this->hexToStr('F09F98BB' . 'E284A4' . '81' . '7A');

        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();

        $graby = $this->getGrabyMock([
            'html' => false,
            'title' => $actualTitle,
            'url' => '',
            'headers' => [
                'content-type' => 'application/pdf',
            ],
            'language' => '',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        // '😻ℤz' (U+1F63B or F09F98BB; U+2124 or E284A4; U+007A or 7A) in hexadecimal and UTF-8
        // the 0x81 (represented by �) is invalid for UTF16, UTF8 and WINDOWS-1252 and is removed
        $expectedTitle = 'F09F98BB' . 'E284A4' . '7A';
        $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle()));
    }

    /**
     * Data provider for testWithChangedUrl.
     *
     * Arrays contain the following values:
     *   $entry_url
     *   $origin_url
     *   $content_url
     *   $expected_entry_url
     *   $expected_origin_url
     *   $expected_domain
     *   $processor_result
     */
    public static function dataForChangedUrl(): array
    {
        return [
            'normal' => [
                'http://0.0.0.0',
                null,
                'http://1.1.1.1',
                'http://1.1.1.1',
                'http://0.0.0.0',
                '1.1.1.1',
                false,
            ],
            'origin already set' => [
                'http://0.0.0.0',
                'http://hello',
                'http://1.1.1.1',
                'http://1.1.1.1',
                'http://hello',
                '1.1.1.1',
                false,
            ],
            'trailing slash' => [
                'https://example.com/hello-world',
                null,
                'https://example.com/hello-world/',
                'https://example.com/hello-world/',
                null,
                'example.com',
                false,
            ],
            'query string in fetched content' => [
                'https://example.org/hello',
                null,
                'https://example.org/hello?world=1',
                'https://example.org/hello?world=1',
                'https://example.org/hello',
                'example.org',
                false,
            ],
            'fragment in fetched content' => [
                'https://example.org/hello',
                null,
                'https://example.org/hello#world',
                'https://example.org/hello',
                null,
                'example.org',
                false,
            ],
            'fragment and query string in fetched content' => [
                'https://example.org/hello',
                null,
                'https://example.org/hello?foo#world',
                'https://example.org/hello?foo#world',
                'https://example.org/hello',
                'example.org',
                false,
            ],
            'different path and query string in fetch content' => [
                'https://example.org/hello',
                null,
                'https://example.org/world?foo',
                'https://example.org/world?foo',
                'https://example.org/hello',
                'example.org',
                false,
            ],
            'feedproxy ignore list test' => [
                'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
                null,
                'https://example.org/hello-wallabag',
                'https://example.org/hello-wallabag',
                null,
                'example.org',
                true,
            ],
            'feedproxy ignore list test with origin url already set' => [
                'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
                'https://example.org/this-is-source',
                'https://example.org/hello-wallabag',
                'https://example.org/hello-wallabag',
                'https://example.org/this-is-source',
                'example.org',
                true,
            ],
            'lemonde ignore pattern test' => [
                'http://www.lemonde.fr/tiny/url',
                null,
                'http://example.com/hello-world',
                'http://example.com/hello-world',
                null,
                'example.com',
                true,
            ],
        ];
    }

    /**
     * @dataProvider dataForChangedUrl
     */
    public function testWithChangedUrl($entry_url, $origin_url, $content_url, $expected_entry_url, $expected_origin_url, $expected_domain, $processor_result): void
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $ruleBasedIgnoreOriginProcessor = $this->getRuleBasedIgnoreOriginProcessorMock();
        $ruleBasedIgnoreOriginProcessor->expects($this->once())
            ->method('process')
            ->willReturn($processor_result);

        $proxy = new ContentProxy(new Graby(), $tagger, $ruleBasedIgnoreOriginProcessor, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage, true);
        $entry = new Entry(new User());
        $entry->setOriginUrl($origin_url);
        $proxy->updateEntry(
            $entry,
            $entry_url,
            [
                'html' => false,
                'title' => '',
                'url' => $content_url,
                'headers' => [
                    'content-type' => '',
                ],
                'language' => '',
            ],
            true
        );

        $this->assertSame($expected_entry_url, $entry->getUrl());
        $this->assertSame($expected_domain, $entry->getDomainName());
        $this->assertSame($expected_origin_url, $entry->getOriginUrl());
    }

    /**
     * Convert a byte string to its uppercase hexadecimal representation
     * (two hex digits per byte).
     *
     * @param string $string Raw bytes to encode
     *
     * @return string Uppercase hex digits, e.g. "E282AC"
     */
    private function strToHex(string $string): string
    {
        return strtoupper(bin2hex($string));
    }

    /**
     * Convert a hexadecimal string (two digits per byte, any case) to the raw
     * byte string it encodes.
     *
     * @param string $hex Even-length string of hex digits
     *
     * @throws \InvalidArgumentException when $hex has an odd length or contains non-hex characters
     *
     * @return string Decoded bytes
     */
    private function hexToStr(string $hex): string
    {
        // Validate up front so hex2bin() can neither emit a warning nor return false.
        if (1 !== preg_match('/\A(?:[0-9A-Fa-f]{2})*\z/', $hex)) {
            throw new \InvalidArgumentException(sprintf('"%s" is not a valid even-length hexadecimal string.', $hex));
        }

        return hex2bin($hex);
    }

    /**
     * Build a Graby mock whose fetchContent() returns the given array for any
     * number of calls. The original constructor is not run.
     *
     * @param array $fetchedContent Value returned by the mocked fetchContent()
     *
     * @return Graby
     */
    private function getGrabyMock(array $fetchedContent)
    {
        $graby = $this->getMockBuilder(Graby::class)
            ->setMethods(['fetchContent'])
            ->disableOriginalConstructor()
            ->getMock();

        $graby->expects($this->any())
            ->method('fetchContent')
            ->willReturn($fetchedContent);

        return $graby;
    }

    /**
     * Build a RuleBasedTagger mock with only tag() mocked; expectations are
     * left to the calling test.
     *
     * @return RuleBasedTagger
     */
    private function getTaggerMock()
    {
        return $this->getMockBuilder(RuleBasedTagger::class)
            ->setMethods(['tag'])
            ->disableOriginalConstructor()
            ->getMock();
    }

    /**
     * Build a RuleBasedIgnoreOriginProcessor mock with only process() mocked;
     * expectations are left to the calling test.
     *
     * @return RuleBasedIgnoreOriginProcessor
     */
    private function getRuleBasedIgnoreOriginProcessorMock()
    {
        return $this->getMockBuilder(RuleBasedIgnoreOriginProcessor::class)
            ->setMethods(['process'])
            ->disableOriginalConstructor()
            ->getMock();
    }

    /**
     * Logger used by tests that do not inspect log output.
     */
    private function getLogger(): NullLogger
    {
        return new NullLogger();
    }

    /**
     * Build a RecursiveValidator mock with only validate() mocked.
     *
     * @param bool $withDefaultMock When true, validate() returns an empty violation list for
     *                              any number of calls; when false, the caller must configure
     *                              validate() itself
     *
     * @return RecursiveValidator
     */
    private function getValidator(bool $withDefaultMock = true)
    {
        $mock = $this->getMockBuilder(RecursiveValidator::class)
            ->setMethods(['validate'])
            ->disableOriginalConstructor()
            ->getMock();

        if ($withDefaultMock) {
            $mock->expects($this->any())
                ->method('validate')
                ->willReturn(new ConstraintViolationList());
        }

        return $mock;
    }
}