From 74a75f7d430eb7a69cd377194e52012db34d39b4 Mon Sep 17 00:00:00 2001 From: Jeremy Benoist Date: Fri, 12 May 2017 07:53:21 +0200 Subject: [PATCH] Use graby ContentExtractor to clean html It might be better to re-use some graby functionalities to clean html instead of building a new system. --- composer.json | 2 +- .../Controller/EntryRestController.php | 1 - .../CoreBundle/Helper/ContentProxy.php | 10 ++++ .../CoreBundle/Helper/ContentProxyTest.php | 55 +++++++++++++++++++ 4 files changed, 66 insertions(+), 2 deletions(-) diff --git a/composer.json b/composer.json index d8c58de28..31cfb6a14 100644 --- a/composer.json +++ b/composer.json @@ -64,7 +64,7 @@ "htmlawed/htmlawed": "~1.1.19", "liip/theme-bundle": "~1.1", "lexik/form-filter-bundle": "~5.0", - "j0k3r/graby": "~1.0", + "j0k3r/graby": "dev-extractor", "friendsofsymfony/user-bundle": "^2.0", "friendsofsymfony/oauth-server-bundle": "^1.5", "stof/doctrine-extensions-bundle": "^1.2", diff --git a/src/Wallabag/ApiBundle/Controller/EntryRestController.php b/src/Wallabag/ApiBundle/Controller/EntryRestController.php index e6bbe5528..0930c1097 100644 --- a/src/Wallabag/ApiBundle/Controller/EntryRestController.php +++ b/src/Wallabag/ApiBundle/Controller/EntryRestController.php @@ -336,7 +336,6 @@ class EntryRestController extends WallabagRestController $entry->setUrl($url); } - if (!empty($tags)) { $this->get('wallabag_core.tags_assigner')->assignTagsToEntry($entry, $tags); } diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index e06ad3d66..a1df16d8c 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -47,6 +47,16 @@ class ContentProxy { // ensure content is a bit cleaned up if (!empty($content['html'])) { + $extractor = $this->graby->getExtractor(); + $contentExtracted = $extractor->process($content['html'], $url); + + if ($contentExtracted) { + $contentBlock = $extractor->getContent(); + $contentBlock->normalize(); + + $content['html'] = trim($contentBlock->innerHTML); + } + $content['html'] = htmLawed($content['html'], [ 'safe' => 1, // which means: do not remove iframe elements diff --git a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php index 44fca0737..7a50b3737 100644 --- a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php +++ b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php @@ -8,6 +8,7 @@ use Wallabag\CoreBundle\Entity\Entry; use Wallabag\CoreBundle\Entity\Tag; use Wallabag\UserBundle\Entity\User; use Wallabag\CoreBundle\Helper\RuleBasedTagger; +use Graby\Graby; class ContentProxyTest extends \PHPUnit_Framework_TestCase { @@ -253,6 +254,60 @@ class ContentProxyTest extends \PHPUnit_Framework_TestCase $this->assertCount(0, $entry->getTags()); } + public function dataForCrazyHtml() + { + return [ + 'script and comment' => [ + 'Script inside:
', + 'lol' + ], + 'script' => [ + 'Script inside:', + 'script' + ], + ]; + } + + /** + * @dataProvider dataForCrazyHtml + */ + public function testWithCrazyHtmlContent($html, $escapedString) + { + $tagger = $this->getTaggerMock(); + $tagger->expects($this->once()) + ->method('tag'); + + $graby = new Graby(); + + $proxy = new ContentProxy($graby, $tagger, $this->getTagRepositoryMock(), $this->getLogger(), $this->fetchingErrorMessage); + $entry = $proxy->updateEntry( + new Entry(new User()), + 'http://1.1.1.1', + [ + 'html' => $html, + 'title' => 'this is my title', + 'url' => 'http://1.1.1.1', + 'content_type' => 'text/html', + 'language' => 'fr', + 'status' => '200', + 'open_graph' => [ + 'og_title' => 'my OG title', + 'og_description' => 'OG desc', + 'og_image' => 'http://3.3.3.3/cover.jpg', + ], + ] + ); + + $this->assertEquals('http://1.1.1.1', $entry->getUrl()); + $this->assertEquals('this is my title', $entry->getTitle()); + $this->assertNotContains($escapedString, $entry->getContent()); + $this->assertEquals('http://3.3.3.3/cover.jpg', $entry->getPreviewPicture()); + $this->assertEquals('text/html', $entry->getMimetype()); + $this->assertEquals('fr', $entry->getLanguage()); + $this->assertEquals('200', $entry->getHttpStatus()); + $this->assertEquals('1.1.1.1', $entry->getDomainName()); + } + private function getTaggerMock() { return $this->getMockBuilder(RuleBasedTagger::class)