wallabag/inc/3rdparty/libraries/MOBIClass/OnlineArticle.php

117 lines
2.9 KiB
PHP
Raw Normal View History

2014-07-24 13:49:36 +00:00
<?php
/**
* Description of OnlineArticle
*
* @author Sander
*/
class OnlineArticle extends ContentProvider {
private $text;
private $images;
private $metadata = array();
private $imgCounter = 0;
public function __construct($url) {
if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;
$data = Http::Request($url);
//$enc = mb_detect_encoding($str, "UTF-8,ISO-8859-1,ASCII");
$html = mb_convert_encoding($data, "UTF-8", "UTF-8,ISO-8859-1,ASCII");
//$html = utf8_encode($html);
$r = new Readability($html, $url);
$r->init();
if(!isset($this->metadata["title"])){
$this->metadata["title"] = CharacterEntities::convert(strip_tags($r->getTitle()->innerHTML));
}
if(!isset($this->metadata["author"])){
$parts = parse_url($url);
$this->metadata["author"] = $parts["host"];
}
$article = $r->getContent()->innerHTML;
if(substr($article, 0, 5) == "<body"){
$article = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head>".$article."</html>";
}else{
$article = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head><body>".$article."</body></html>";
}
$doc = new DOMDocument();
@$doc->loadHTML($article) or die($article);
$doc->normalizeDocument();
$this->images = $this->handleImages($doc, $url);
$this->text = $doc->saveHTML();
}
/**
* Get the text data to be integrated in the MOBI file
* @return string
*/
public function getTextData(){
return $this->text;
}
/**
* Get the images (an array containing the jpeg data). Array entry 0 will
* correspond to image record 0.
* @return array
*/
public function getImages(){
return $this->images;
}
/**
* Get the metadata in the form of a hashtable (for example, title or author).
* @return array
*/
public function getMetaData(){
return $this->metadata;
}
/**
*
* @param DOMElement $dom
* @return array
*/
private function handleImages($dom, $url){
$images = array();
$parts = parse_url($url);
$savedImages = array();
$imgElements = $dom->getElementsByTagName('img');
foreach($imgElements as $img) {
$src = $img->getAttribute("src");
$is_root = false;
if(substr($src, 0, 1) == "/"){
$is_root = true;
}
$parsed = parse_url($src);
if(!isset($parsed["host"])){
if($is_root){
$src = http_build_url($url, $parsed, HTTP_URL_REPLACE);
}else{
$src = http_build_url($url, $parsed, HTTP_URL_JOIN_PATH);
}
}
$img->setAttribute("src", "");
if(isset($savedImages[$src])){
$img->setAttribute("recindex", $savedImages[$src]);
}else{
$image = ImageHandler::DownloadImage($src);
if($image !== false){
$images[$this->imgCounter] = new FileRecord(new Record($image));
$img->setAttribute("recindex", $this->imgCounter);
$savedImages[$src] = $this->imgCounter;
$this->imgCounter++;
}
}
}
return $images;
}
}
?>