mirror of
https://github.com/wallabag/wallabag.git
synced 2025-01-24 15:48:08 +00:00
Use graby ContentExtractor to clean html
It might be better to reuse some of graby's existing functionality to clean HTML instead of building a new system.
This commit is contained in:
parent
fb436e8ca0
commit
74a75f7d43
4 changed files with 66 additions and 2 deletions
|
@ -64,7 +64,7 @@
|
||||||
"htmlawed/htmlawed": "~1.1.19",
|
"htmlawed/htmlawed": "~1.1.19",
|
||||||
"liip/theme-bundle": "~1.1",
|
"liip/theme-bundle": "~1.1",
|
||||||
"lexik/form-filter-bundle": "~5.0",
|
"lexik/form-filter-bundle": "~5.0",
|
||||||
"j0k3r/graby": "~1.0",
|
"j0k3r/graby": "dev-extractor",
|
||||||
"friendsofsymfony/user-bundle": "^2.0",
|
"friendsofsymfony/user-bundle": "^2.0",
|
||||||
"friendsofsymfony/oauth-server-bundle": "^1.5",
|
"friendsofsymfony/oauth-server-bundle": "^1.5",
|
||||||
"stof/doctrine-extensions-bundle": "^1.2",
|
"stof/doctrine-extensions-bundle": "^1.2",
|
||||||
|
|
|
@ -336,7 +336,6 @@ class EntryRestController extends WallabagRestController
|
||||||
$entry->setUrl($url);
|
$entry->setUrl($url);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (!empty($tags)) {
|
if (!empty($tags)) {
|
||||||
$this->get('wallabag_core.tags_assigner')->assignTagsToEntry($entry, $tags);
|
$this->get('wallabag_core.tags_assigner')->assignTagsToEntry($entry, $tags);
|
||||||
}
|
}
|
||||||
|
|
|
@ -47,6 +47,16 @@ class ContentProxy
|
||||||
{
|
{
|
||||||
// ensure content is a bit cleaned up
|
// ensure content is a bit cleaned up
|
||||||
if (!empty($content['html'])) {
|
if (!empty($content['html'])) {
|
||||||
|
$extractor = $this->graby->getExtractor();
|
||||||
|
$contentExtracted = $extractor->process($content['html'], $url);
|
||||||
|
|
||||||
|
if ($contentExtracted) {
|
||||||
|
$contentBlock = $extractor->getContent();
|
||||||
|
$contentBlock->normalize();
|
||||||
|
|
||||||
|
$content['html'] = trim($contentBlock->innerHTML);
|
||||||
|
}
|
||||||
|
|
||||||
$content['html'] = htmLawed($content['html'], [
|
$content['html'] = htmLawed($content['html'], [
|
||||||
'safe' => 1,
|
'safe' => 1,
|
||||||
// which means: do not remove iframe elements
|
// which means: do not remove iframe elements
|
||||||
|
|
|
@ -8,6 +8,7 @@ use Wallabag\CoreBundle\Entity\Entry;
|
||||||
use Wallabag\CoreBundle\Entity\Tag;
|
use Wallabag\CoreBundle\Entity\Tag;
|
||||||
use Wallabag\UserBundle\Entity\User;
|
use Wallabag\UserBundle\Entity\User;
|
||||||
use Wallabag\CoreBundle\Helper\RuleBasedTagger;
|
use Wallabag\CoreBundle\Helper\RuleBasedTagger;
|
||||||
|
use Graby\Graby;
|
||||||
|
|
||||||
class ContentProxyTest extends \PHPUnit_Framework_TestCase
|
class ContentProxyTest extends \PHPUnit_Framework_TestCase
|
||||||
{
|
{
|
||||||
|
@ -253,6 +254,60 @@ class ContentProxyTest extends \PHPUnit_Framework_TestCase
|
||||||
$this->assertCount(0, $entry->getTags());
|
$this->assertCount(0, $entry->getTags());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function dataForCrazyHtml()
|
||||||
|
{
|
||||||
|
return [
|
||||||
|
'script and comment' => [
|
||||||
|
'<strong>Script inside:</strong> <!--[if gte IE 4]><script>alert(\'lol\');</script><![endif]--><br />',
|
||||||
|
'lol'
|
||||||
|
],
|
||||||
|
'script' => [
|
||||||
|
'<strong>Script inside:</strong><script>alert(\'lol\');</script>',
|
||||||
|
'script'
|
||||||
|
],
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @dataProvider dataForCrazyHtml
|
||||||
|
*/
|
||||||
|
public function testWithCrazyHtmlContent($html, $escapedString)
|
||||||
|
{
|
||||||
|
$tagger = $this->getTaggerMock();
|
||||||
|
$tagger->expects($this->once())
|
||||||
|
->method('tag');
|
||||||
|
|
||||||
|
$graby = new Graby();
|
||||||
|
|
||||||
|
$proxy = new ContentProxy($graby, $tagger, $this->getTagRepositoryMock(), $this->getLogger(), $this->fetchingErrorMessage);
|
||||||
|
$entry = $proxy->updateEntry(
|
||||||
|
new Entry(new User()),
|
||||||
|
'http://1.1.1.1',
|
||||||
|
[
|
||||||
|
'html' => $html,
|
||||||
|
'title' => 'this is my title',
|
||||||
|
'url' => 'http://1.1.1.1',
|
||||||
|
'content_type' => 'text/html',
|
||||||
|
'language' => 'fr',
|
||||||
|
'status' => '200',
|
||||||
|
'open_graph' => [
|
||||||
|
'og_title' => 'my OG title',
|
||||||
|
'og_description' => 'OG desc',
|
||||||
|
'og_image' => 'http://3.3.3.3/cover.jpg',
|
||||||
|
],
|
||||||
|
]
|
||||||
|
);
|
||||||
|
|
||||||
|
$this->assertEquals('http://1.1.1.1', $entry->getUrl());
|
||||||
|
$this->assertEquals('this is my title', $entry->getTitle());
|
||||||
|
$this->assertNotContains($escapedString, $entry->getContent());
|
||||||
|
$this->assertEquals('http://3.3.3.3/cover.jpg', $entry->getPreviewPicture());
|
||||||
|
$this->assertEquals('text/html', $entry->getMimetype());
|
||||||
|
$this->assertEquals('fr', $entry->getLanguage());
|
||||||
|
$this->assertEquals('200', $entry->getHttpStatus());
|
||||||
|
$this->assertEquals('1.1.1.1', $entry->getDomainName());
|
||||||
|
}
|
||||||
|
|
||||||
private function getTaggerMock()
|
private function getTaggerMock()
|
||||||
{
|
{
|
||||||
return $this->getMockBuilder(RuleBasedTagger::class)
|
return $this->getMockBuilder(RuleBasedTagger::class)
|
||||||
|
|
Loading…
Reference in a new issue