mirror of
https://github.com/wallabag/wallabag.git
synced 2025-01-10 17:05:26 +00:00
Merge branch 'memiks-gestion_erreur_readability' into dev
This commit is contained in:
commit
3e7188185d
2 changed files with 29 additions and 8 deletions
|
@ -80,7 +80,7 @@ class Readability
|
||||||
public $debug = false;
|
public $debug = false;
|
||||||
protected $body = null; //
|
protected $body = null; //
|
||||||
protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
|
protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
|
||||||
protected $flags = self::FLAG_CLEAN_CONDITIONALLY; // 1 | 2 | 4; // Start with all flags set.
|
protected $flags = self::FLAG_CLEAN_CONDITIONALLY; // 1 | 2 | 4; // Start with all flags set.
|
||||||
protected $success = false; // indicates whether we were able to extract or not
|
protected $success = false; // indicates whether we were able to extract or not
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -90,7 +90,7 @@ class Readability
|
||||||
public $regexps = array(
|
public $regexps = array(
|
||||||
'unlikelyCandidates' => '/combx|comment|comments|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i',
|
'unlikelyCandidates' => '/combx|comment|comments|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i',
|
||||||
'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
|
'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
|
||||||
'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i',
|
'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story|attachment/i',
|
||||||
'negative' => '/combx|comment|comments|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
|
'negative' => '/combx|comment|comments|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
|
||||||
'divToPElements' => '/<(a|blockquote|dl|div|ol|p|pre|table|ul)/i',
|
'divToPElements' => '/<(a|blockquote|dl|div|ol|p|pre|table|ul)/i',
|
||||||
'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
|
'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
|
||||||
|
@ -106,7 +106,7 @@ class Readability
|
||||||
* Create instance of Readability
|
* Create instance of Readability
|
||||||
* @param string UTF-8 encoded string
|
* @param string UTF-8 encoded string
|
||||||
* @param string (optional) URL associated with HTML (used for footnotes)
|
* @param string (optional) URL associated with HTML (used for footnotes)
|
||||||
*/
|
*/
|
||||||
function __construct($html, $url=null)
|
function __construct($html, $url=null)
|
||||||
{
|
{
|
||||||
/* Turn all double br's into p's */
|
/* Turn all double br's into p's */
|
||||||
|
@ -185,6 +185,7 @@ class Readability
|
||||||
$articleContent = $this->dom->createElement('div');
|
$articleContent = $this->dom->createElement('div');
|
||||||
$articleContent->setAttribute('id', 'readability-content');
|
$articleContent->setAttribute('id', 'readability-content');
|
||||||
$articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
|
$articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
|
||||||
|
return $this->success;
|
||||||
}
|
}
|
||||||
|
|
||||||
$overlay->setAttribute('id', 'readOverlay');
|
$overlay->setAttribute('id', 'readOverlay');
|
||||||
|
|
|
@ -39,6 +39,10 @@ function get_external_file($url)
|
||||||
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
|
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
|
||||||
curl_setopt($curl, CURLOPT_HEADER, false);
|
curl_setopt($curl, CURLOPT_HEADER, false);
|
||||||
|
|
||||||
|
// For SSL: do not verify the peer certificate
|
||||||
|
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
|
||||||
|
curl_setopt($curl, CURLOPT_AUTOREFERER, TRUE );
|
||||||
|
|
||||||
// FeedBurner requires a proper USER-AGENT...
|
// FeedBurner requires a proper USER-AGENT...
|
||||||
curl_setopt($curl, CURL_HTTP_VERSION_1_1, true);
|
curl_setopt($curl, CURL_HTTP_VERSION_1_1, true);
|
||||||
curl_setopt($curl, CURLOPT_ENCODING, "gzip, deflate");
|
curl_setopt($curl, CURLOPT_ENCODING, "gzip, deflate");
|
||||||
|
@ -54,7 +58,15 @@ function get_external_file($url)
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
// create http context and add timeout and user-agent
|
// create http context and add timeout and user-agent
|
||||||
$context = stream_context_create(array('http'=>array('timeout' => $timeout,'header'=> "User-Agent: ".$useragent,/*spoot Mozilla Firefox*/'follow_location' => true)));
|
$context = stream_context_create(array(
|
||||||
|
'http'=>array('timeout' => $timeout,
|
||||||
|
'header'=> "User-Agent: ".$useragent, /*spoot Mozilla Firefox*/
|
||||||
|
'follow_location' => true),
|
||||||
|
// For SSL: do not verify the peer certificate
|
||||||
|
'ssl' => array('verify_peer' => false,
|
||||||
|
'allow_self_signed' => true)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
// only download page lesser than 4MB
|
// only download page lesser than 4MB
|
||||||
$data = @file_get_contents($url, false, $context, -1, 4000000); // We download at most 4 MB from source.
|
$data = @file_get_contents($url, false, $context, -1, 4000000); // We download at most 4 MB from source.
|
||||||
|
@ -98,6 +110,8 @@ function get_external_file($url)
|
||||||
*/
|
*/
|
||||||
function prepare_url($url)
|
function prepare_url($url)
|
||||||
{
|
{
|
||||||
|
global $msg;
|
||||||
|
|
||||||
$parametres = array();
|
$parametres = array();
|
||||||
$url = html_entity_decode(trim($url));
|
$url = html_entity_decode(trim($url));
|
||||||
|
|
||||||
|
@ -108,14 +122,20 @@ function prepare_url($url)
|
||||||
$i=strpos($url,'#xtor=RSS-'); if ($i!==false) $url=substr($url,0,$i);
|
$i=strpos($url,'#xtor=RSS-'); if ($i!==false) $url=substr($url,0,$i);
|
||||||
|
|
||||||
$title = $url;
|
$title = $url;
|
||||||
if (!preg_match('!^https?://!i', $url))
|
$html = Encoding::toUTF8(get_external_file($url,15));
|
||||||
$url = 'http://' . $url;
|
// If get_external_file was not able to retrieve any content and the URL has no http(s):// scheme, retry the same URL with the http:// prefix
|
||||||
|
if (!preg_match('!^https?://!i', $url) && (!isset($html) || strlen($html) <= 0)) {
|
||||||
|
$url = 'http://' . $url;
|
||||||
|
$html = Encoding::toUTF8(get_external_file($url,15));
|
||||||
|
}
|
||||||
|
|
||||||
$html = Encoding::toUTF8(get_external_file($url,15));
|
|
||||||
if (isset($html) and strlen($html) > 0)
|
if (isset($html) and strlen($html) > 0)
|
||||||
{
|
{
|
||||||
$r = new Readability($html, $url);
|
$r = new Readability($html, $url);
|
||||||
|
|
||||||
$r->convertLinksToFootnotes = CONVERT_LINKS_FOOTNOTES;
|
$r->convertLinksToFootnotes = CONVERT_LINKS_FOOTNOTES;
|
||||||
|
$r->revertForcedParagraphElements = REVERT_FORCED_PARAGRAPH_ELEMENTS;
|
||||||
|
|
||||||
if($r->init())
|
if($r->init())
|
||||||
{
|
{
|
||||||
$content = $r->articleContent->innerHTML;
|
$content = $r->articleContent->innerHTML;
|
||||||
|
@ -362,4 +382,4 @@ function logm($message)
|
||||||
{
|
{
|
||||||
$t = strval(date('Y/m/d_H:i:s')).' - '.$_SERVER["REMOTE_ADDR"].' - '.strval($message)."\n";
|
$t = strval(date('Y/m/d_H:i:s')).' - '.$_SERVER["REMOTE_ADDR"].' - '.strval($message)."\n";
|
||||||
file_put_contents('./log.txt',$t,FILE_APPEND);
|
file_put_contents('./log.txt',$t,FILE_APPEND);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue