Use graby ContentExtractor to clean html

It might be better to reuse some of graby's existing functionality to clean HTML instead of building a new system.
2025-02-16 18:55:15 +00:00 · 2017-05-12 07:53:21 +02:00 · 2017-05-12 07:53:21 +02:00 · 74a75f7d43
commit 74a75f7d43
parent fb436e8ca0
4 changed files with 66 additions and 2 deletions
--- a/composer.json
+++ b/composer.json
@@ -64,7 +64,7 @@
        "htmlawed/htmlawed": "~1.1.19",
        "liip/theme-bundle": "~1.1",
        "lexik/form-filter-bundle": "~5.0",
-        "j0k3r/graby": "~1.0",
+        "j0k3r/graby": "dev-extractor",
        "friendsofsymfony/user-bundle": "^2.0",
        "friendsofsymfony/oauth-server-bundle": "^1.5",
        "stof/doctrine-extensions-bundle": "^1.2",
--- a/src/Wallabag/ApiBundle/Controller/EntryRestController.php
+++ b/src/Wallabag/ApiBundle/Controller/EntryRestController.php
@@ -336,7 +336,6 @@ class EntryRestController extends WallabagRestController
            $entry->setUrl($url);
        }

-
        if (!empty($tags)) {
            $this->get('wallabag_core.tags_assigner')->assignTagsToEntry($entry, $tags);
        }
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -47,6 +47,16 @@ class ContentProxy
    {
        // ensure content is a bit cleaned up
        if (!empty($content['html'])) {
+            $extractor = $this->graby->getExtractor();
+            $contentExtracted = $extractor->process($content['html'], $url);
+
+            if ($contentExtracted) {
+                $contentBlock = $extractor->getContent();
+                $contentBlock->normalize();
+
+                $content['html'] = trim($contentBlock->innerHTML);
+            }
+
            $content['html'] = htmLawed($content['html'], [
                'safe' => 1,
                // which means: do not remove iframe elements
--- a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php
+++ b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php
@@ -8,6 +8,7 @@ use Wallabag\CoreBundle\Entity\Entry;
 use Wallabag\CoreBundle\Entity\Tag;
 use Wallabag\UserBundle\Entity\User;
 use Wallabag\CoreBundle\Helper\RuleBasedTagger;
+use Graby\Graby;

 class ContentProxyTest extends \PHPUnit_Framework_TestCase
 {
@@ -253,6 +254,60 @@ class ContentProxyTest extends \PHPUnit_Framework_TestCase
        $this->assertCount(0, $entry->getTags());
    }

+    /**
+     * Supplies HTML fragments carrying a script, each paired with a string the cleaned content must not contain.
+     */
+    public function dataForCrazyHtml()
+    {
+        $scriptInConditionalComment = '<strong>Script inside:</strong> <!--[if gte IE 4]><script>alert(\'lol\');</script><![endif]--><br />';
+        $bareScript = '<strong>Script inside:</strong><script>alert(\'lol\');</script>';
+
+        return [
+            'script and comment' => [$scriptInConditionalComment, 'lol'],
+            'script' => [$bareScript, 'script'],
+        ];
+    }
+
+    /**
+     * Checks that the forbidden string is absent from the entry content when HTML is supplied up front.
+     *
+     * @dataProvider dataForCrazyHtml
+     */
+    public function testWithCrazyHtmlContent($html, $escapedString)
+    {
+        $url = 'http://1.1.1.1';
+
+        $ruleTagger = $this->getTaggerMock();
+        $ruleTagger->expects($this->once())->method('tag');
+
+        $prefetched = [
+            'html' => $html,
+            'title' => 'this is my title',
+            'url' => $url,
+            'content_type' => 'text/html',
+            'language' => 'fr',
+            'status' => '200',
+            'open_graph' => [
+                'og_title' => 'my OG title',
+                'og_description' => 'OG desc',
+                'og_image' => 'http://3.3.3.3/cover.jpg',
+            ],
+        ];
+
+        // Graby is a real instance here (not a mock), unlike the tagger
+        $contentProxy = new ContentProxy(new Graby(), $ruleTagger, $this->getTagRepositoryMock(), $this->getLogger(), $this->fetchingErrorMessage);
+        $savedEntry = $contentProxy->updateEntry(new Entry(new User()), $url, $prefetched);
+
+        $this->assertEquals('http://1.1.1.1', $savedEntry->getUrl());
+        $this->assertEquals('this is my title', $savedEntry->getTitle());
+        $this->assertNotContains($escapedString, $savedEntry->getContent());
+        $this->assertEquals('http://3.3.3.3/cover.jpg', $savedEntry->getPreviewPicture());
+        $this->assertEquals('text/html', $savedEntry->getMimetype());
+        $this->assertEquals('fr', $savedEntry->getLanguage());
+        $this->assertEquals('200', $savedEntry->getHttpStatus());
+        $this->assertEquals('1.1.1.1', $savedEntry->getDomainName());
+    }
+
    private function getTaggerMock()
    {
        return $this->getMockBuilder(RuleBasedTagger::class)