diff --git a/composer.json b/composer.json
index d8c58de28..31cfb6a14 100644
--- a/composer.json
+++ b/composer.json
@@ -64,7 +64,7 @@
"htmlawed/htmlawed": "~1.1.19",
"liip/theme-bundle": "~1.1",
"lexik/form-filter-bundle": "~5.0",
- "j0k3r/graby": "~1.0",
+ "j0k3r/graby": "dev-extractor",
"friendsofsymfony/user-bundle": "^2.0",
"friendsofsymfony/oauth-server-bundle": "^1.5",
"stof/doctrine-extensions-bundle": "^1.2",
diff --git a/src/Wallabag/ApiBundle/Controller/EntryRestController.php b/src/Wallabag/ApiBundle/Controller/EntryRestController.php
index e6bbe5528..0930c1097 100644
--- a/src/Wallabag/ApiBundle/Controller/EntryRestController.php
+++ b/src/Wallabag/ApiBundle/Controller/EntryRestController.php
@@ -336,7 +336,6 @@ class EntryRestController extends WallabagRestController
$entry->setUrl($url);
}
-
if (!empty($tags)) {
$this->get('wallabag_core.tags_assigner')->assignTagsToEntry($entry, $tags);
}
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
index e06ad3d66..a1df16d8c 100644
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -47,6 +47,16 @@ class ContentProxy
{
// ensure content is a bit cleaned up
if (!empty($content['html'])) {
+ $extractor = $this->graby->getExtractor();
+ $contentExtracted = $extractor->process($content['html'], $url);
+
+ if ($contentExtracted) {
+ $contentBlock = $extractor->getContent();
+ $contentBlock->normalize();
+
+ $content['html'] = trim($contentBlock->innerHTML);
+ }
+
$content['html'] = htmLawed($content['html'], [
'safe' => 1,
// which means: do not remove iframe elements
diff --git a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php
index 44fca0737..7a50b3737 100644
--- a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php
+++ b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php
@@ -8,6 +8,7 @@ use Wallabag\CoreBundle\Entity\Entry;
use Wallabag\CoreBundle\Entity\Tag;
use Wallabag\UserBundle\Entity\User;
use Wallabag\CoreBundle\Helper\RuleBasedTagger;
+use Graby\Graby;
class ContentProxyTest extends \PHPUnit_Framework_TestCase
{
@@ -253,6 +254,60 @@ class ContentProxyTest extends \PHPUnit_Framework_TestCase
$this->assertCount(0, $entry->getTags());
}
+ public function dataForCrazyHtml()
+ {
+ return [
+ 'script and comment' => [
+ 'Script inside:
',
+ 'lol'
+ ],
+ 'script' => [
+ 'Script inside:',
+ 'script'
+ ],
+ ];
+ }
+
+ /**
+ * @dataProvider dataForCrazyHtml
+ */
+ public function testWithCrazyHtmlContent($html, $escapedString)
+ {
+ $tagger = $this->getTaggerMock();
+ $tagger->expects($this->once())
+ ->method('tag');
+
+ $graby = new Graby();
+
+ $proxy = new ContentProxy($graby, $tagger, $this->getTagRepositoryMock(), $this->getLogger(), $this->fetchingErrorMessage);
+ $entry = $proxy->updateEntry(
+ new Entry(new User()),
+ 'http://1.1.1.1',
+ [
+ 'html' => $html,
+ 'title' => 'this is my title',
+ 'url' => 'http://1.1.1.1',
+ 'content_type' => 'text/html',
+ 'language' => 'fr',
+ 'status' => '200',
+ 'open_graph' => [
+ 'og_title' => 'my OG title',
+ 'og_description' => 'OG desc',
+ 'og_image' => 'http://3.3.3.3/cover.jpg',
+ ],
+ ]
+ );
+
+ $this->assertEquals('http://1.1.1.1', $entry->getUrl());
+ $this->assertEquals('this is my title', $entry->getTitle());
+ $this->assertNotContains($escapedString, $entry->getContent());
+ $this->assertEquals('http://3.3.3.3/cover.jpg', $entry->getPreviewPicture());
+ $this->assertEquals('text/html', $entry->getMimetype());
+ $this->assertEquals('fr', $entry->getLanguage());
+ $this->assertEquals('200', $entry->getHttpStatus());
+ $this->assertEquals('1.1.1.1', $entry->getDomainName());
+ }
+
private function getTaggerMock()
{
return $this->getMockBuilder(RuleBasedTagger::class)