fix bug #209: titles containing a colon were parsed incorrectly

This commit is contained in:
Nicolas Lœuillet 2013-09-12 19:28:59 +02:00
parent 084ec2a63d
commit b9523a0ba0
3 changed files with 49 additions and 9 deletions

View file

@ -0,0 +1,46 @@
<?php
class PocheReadability extends Readability
{
    /**
     * Build the article title as an <h1> element.
     *
     * The candidate title is read from the document's <title> element and
     * then refined:
     *  - if it contains a " | " or " - " separator, one side of the
     *    separator is kept;
     *  - otherwise, if it is longer than 150 or shorter than 15 bytes and
     *    the document has exactly one <h1>, that heading's text is used.
     * A candidate of four space-separated words or fewer is discarded in
     * favour of the full original <title> text.
     *
     * This method applies no colon-specific splitting, so a title such as
     * "Foo: bar baz" is not cut at the colon.
     *
     * @return DOMElement a newly created, unattached <h1> holding the title
     */
    protected function getArticleTitle()
    {
        $curTitle = '';
        $origTitle = '';

        // item(0) yields null when the document has no <title>; guard for it
        // explicitly instead of handing null to getInnerText().
        $titleNode = $this->dom->getElementsByTagName('title')->item(0);
        if ($titleNode !== null) {
            try {
                $curTitle = $origTitle = $this->getInnerText($titleNode);
            } catch (Exception $e) {
                // Deliberate best effort: if the text cannot be read, fall
                // through with an empty title rather than abort extraction.
            }
        }

        if (preg_match('/ [\|\-] /', $curTitle)) {
            // Keep everything before the last "| " / "- " separator.
            $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);

            // Too short: keep what follows the first separator instead.
            // Note: the count is taken before trimming, so a trailing space
            // left by the replacement above counts as an extra (empty) word.
            if (count(explode(' ', $curTitle)) < 3) {
                $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
            }
        } elseif (strlen($curTitle) > 150 || strlen($curTitle) < 15) {
            // Implausibly long or short <title>: prefer the page's only <h1>.
            $hOnes = $this->dom->getElementsByTagName('h1');
            if ($hOnes->length == 1) {
                $curTitle = $this->getInnerText($hOnes->item(0));
            }
        }

        $curTitle = trim($curTitle);

        // A very short result is more likely a site name or fragment than
        // the real headline, so revert to the untouched <title> text.
        if (count(explode(' ', $curTitle)) <= 4) {
            $curTitle = $origTitle;
        }

        // The title is plain text taken from a fetched (untrusted) page, so
        // attach it as a text node rather than assigning it to innerHTML:
        // characters such as "<" or "&" must not be interpreted as markup.
        // NOTE(review): assumes callers read the heading through the DOM
        // (textContent / child nodes) - confirm against the parent class.
        $articleTitle = $this->dom->createElement('h1');
        $articleTitle->appendChild($this->dom->createTextNode($curTitle));

        return $articleTitle;
    }
}

View file

@ -354,7 +354,7 @@ class Url
}
if (isset($splink)) {
// Build DOM tree from HTML
$readability = new Readability($html, $url);
$readability = new PocheReadability($html, $url);
$xpath = new DOMXPath($readability->dom);
// Loop through single_page_link xpath expressions
$single_page_url = null;

View file

@ -20,6 +20,7 @@ require_once __DIR__ . '/../../inc/poche/Url.class.php';
require_once __DIR__ . '/../../inc/3rdparty/class.messages.php';
require_once __DIR__ . '/../../inc/poche/Poche.class.php';
require_once __DIR__ . '/../../inc/3rdparty/Readability.php';
require_once __DIR__ . '/../../inc/poche/PocheReadability.php';
require_once __DIR__ . '/../../inc/3rdparty/Encoding.php';
require_once __DIR__ . '/../../inc/poche/Database.class.php';
require_once __DIR__ . '/../../vendor/autoload.php';
@ -48,10 +49,3 @@ if (!ini_get('date.timezone') || !@date_default_timezone_set(ini_get('date.timez
}
$poche = new Poche();
#XSRF protection with token
// if (!empty($_POST)) {
// if (!Session::isToken($_POST['token'])) {
// die(_('Wrong token'));
// }
// unset($_SESSION['tokens']);
// }