2013-08-04 18:58:31 +00:00
|
|
|
<?php
|
|
|
|
/**
|
|
|
|
* poche, a read it later open source system
|
|
|
|
*
|
|
|
|
* @category poche
|
|
|
|
* @author Nicolas Lœuillet <support@inthepoche.com>
|
|
|
|
* @copyright 2013
|
|
|
|
* @license http://www.wtfpl.net/ see COPYING file
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
 * Holds a single URL and knows how to validate it, strip tracking
 * suffixes from it and fetch a readable version of the page it points to.
 *
 * The constructor expects the URL in base64-encoded form; setUrl() stores
 * whatever string it is given without decoding it.
 */
class Url
{
    /** @var string|false The current URL (false if base64 decoding failed). */
    public $url;

    /**
     * @param string $url Base64-encoded URL.
     */
    public function __construct($url)
    {
        $this->url = base64_decode($url);
    }

    /**
     * @return string|false The URL currently held by this object.
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Replaces the stored URL. Unlike the constructor, no base64 decoding
     * is applied.
     *
     * @param string $url Plain (already decoded) URL.
     */
    public function setUrl($url)
    {
        $this->url = $url;
    }

    /**
     * Checks that the URL has the shape "<scheme>://<host>[:port][rest]".
     *
     * The host part must start with a letter, digit, dash or dot. Digits
     * are accepted so that IP addresses and hosts such as "9gag.com" are
     * not rejected.
     *
     * @return int|false 1 when the URL matches, 0 when it does not,
     *                   false if the regular expression engine fails.
     */
    public function isCorrect()
    {
        $pattern = '|^(.*:)//([a-z0-9\-.]+)(:[0-9]+)?(.*)$|i';

        return preg_match($pattern, $this->url);
    }

    /**
     * Normalises the stored URL in place: decodes HTML entities, trims
     * surrounding whitespace and cuts the URL at the first occurrence of
     * each known tracking marker.
     *
     * Everything from the marker to the end of the string is dropped, so
     * parameters that follow a marker are removed as well.
     */
    public function clean()
    {
        $url = html_entity_decode(trim($this->url));

        // Order matters: each marker is searched in the already shortened URL.
        $markers = array('&utm_source=', '?utm_source=', '#xtor=RSS-');
        foreach ($markers as $marker) {
            $position = strpos($url, $marker);
            if ($position !== false) {
                $url = substr($url, 0, $position);
            }
        }

        $this->url = $url;
    }

    /**
     * Downloads the page behind the URL and extracts its title and main
     * content with Readability.
     *
     * Side effects: the stored URL is cleaned (see clean()) and may get
     * "http://" prepended (see below); an invalid URL is logged through
     * Tools::logm().
     *
     * @return array|false array('title' => ..., 'content' => ...) on success,
     *                     false when the URL is invalid, nothing could be
     *                     downloaded or Readability could not process the page.
     */
    public function fetchContent()
    {
        if (!$this->isCorrect()) {
            #$msg->add('e', _('error during url preparation : the link is not valid'));
            Tools::logm($this->getUrl() . ' is not a valid url');

            return false;
        }

        $this->clean();
        $html = Encoding::toUTF8(Tools::getFile($this->getUrl()));

        // If nothing was retrieved and the URL does not start with http:// or
        // https://, retry once with "http://" prepended to the stored URL.
        if (!preg_match('!^https?://!i', $this->getUrl()) && !$this->hasContent($html)) {
            $this->setUrl('http://' . $this->getUrl());
            $html = Encoding::toUTF8(Tools::getFile($this->getUrl()));
        }

        if (!$this->hasContent($html)) {
            return false;
        }

        if (function_exists('tidy_parse_string')) {
            $tidy = tidy_parse_string($html, array(), 'UTF8');

            // tidy may fail to parse the document; keep the raw HTML then.
            if ($tidy !== false && $tidy !== null) {
                $tidy->cleanRepair();

                // Warning: tidy might fail so, ensure there is still a content.
                // hasChildren does not seem to work, so the serialized body is
                // compared (with white spaces removed) against an empty body.
                $body = $tidy->body();
                if ($body !== null && preg_replace('/\s+/', '', $body->value) !== "<body></body>") {
                    $html = $tidy->value;
                }
            }
        }

        // The tidy output replaces $html above, so check it again.
        if (!$this->hasContent($html)) {
            return false;
        }

        $readability = new Readability($html, $this->getUrl());
        $readability->convertLinksToFootnotes = CONVERT_LINKS_FOOTNOTES;
        $readability->revertForcedParagraphElements = REVERT_FORCED_PARAGRAPH_ELEMENTS;

        if (!$readability->init()) {
            return false;
        }

        $title = $readability->articleTitle->innerHTML;

        $parameters = array();
        $parameters['title'] = ($title != '' ? $title : _('Untitled'));
        $parameters['content'] = $readability->articleContent->innerHTML;

        return $parameters;
    }

    /**
     * Tells whether a download result is a non-empty string.
     *
     * @param mixed $html Value returned by the download/encoding step.
     *
     * @return bool
     */
    private function hasContent($html)
    {
        return is_string($html) && $html !== '';
    }
}
|