mirror of
https://github.com/wallabag/wallabag.git
synced 2024-12-24 16:40:30 +00:00
202 lines
7.2 KiB
PHP
202 lines
7.2 KiB
PHP
|
<?php
|
||
|
/**
|
||
|
* Split an HTML file into smaller html files, retaining the formatting and structure for the individual parts.
|
||
|
* What this splitter does is using DOM to try and retain any formatting in the file, including rebuilding the DOM tree for subsequent parts.
|
||
|
* Split size is considered max target size. The actual size is the result of an even split across the resulting files.
|
||
|
*
|
||
|
* @author A. Grandt <php@grandt.com>
|
||
|
* @copyright 2009-2014 A. Grandt
|
||
|
* @license GNU LGPL 2.1
|
||
|
* @link http://www.phpclasses.org/package/6115
|
||
|
* @link https://github.com/Grandt/PHPePub
|
||
|
* @version 3.20
|
||
|
*/
|
||
|
class EPubChapterSplitter {
|
||
|
const VERSION = 3.20;
|
||
|
|
||
|
private $splitDefaultSize = 250000;
|
||
|
private $bookVersion = EPub::BOOK_VERSION_EPUB2;
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* Enter description here ...
|
||
|
*
|
||
|
* @param unknown_type $ident
|
||
|
*/
|
||
|
function setVersion($bookVersion) {
|
||
|
$this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Set default chapter target size.
|
||
|
* Default is 250000 bytes, and minimum is 10240 bytes.
|
||
|
*
|
||
|
* @param $size segment size in bytes
|
||
|
* @return void
|
||
|
*/
|
||
|
function setSplitSize($size) {
|
||
|
$this->splitDefaultSize = (int)$size;
|
||
|
if ($size < 10240) {
|
||
|
$this->splitDefaultSize = 10240; // Making the file smaller than 10k is not a good idea.
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Get the chapter target size.
|
||
|
*
|
||
|
* @return $size
|
||
|
*/
|
||
|
function getSplitSize() {
|
||
|
return $this->splitDefaultSize;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Split $chapter into multiple parts.
|
||
|
*
|
||
|
* The search string can either be a regular string or a PHP PECL Regular Expression pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php
|
||
|
* If the search string is a regular string, the matching will be for lines in the HTML starting with the string given
|
||
|
*
|
||
|
* @param String $chapter XHTML file
|
||
|
* @param Bool $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check.
|
||
|
* @param String $searchString Chapter string to search for can be fixed text, or a regular expression pattern.
|
||
|
*
|
||
|
* @return array with 1 or more parts
|
||
|
*/
|
||
|
function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {
|
||
|
$chapterData = array();
|
||
|
$isSearchRegexp = $splitOnSearchString && (preg_match('#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m', $searchString) == 1);
|
||
|
if ($splitOnSearchString && !$isSearchRegexp) {
|
||
|
$searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";
|
||
|
}
|
||
|
|
||
|
if (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize) {
|
||
|
return array($chapter);
|
||
|
}
|
||
|
|
||
|
$xmlDoc = new DOMDocument();
|
||
|
@$xmlDoc->loadHTML($chapter);
|
||
|
|
||
|
$head = $xmlDoc->getElementsByTagName("head");
|
||
|
$body = $xmlDoc->getElementsByTagName("body");
|
||
|
|
||
|
$htmlPos = stripos($chapter, "<html");
|
||
|
$htmlEndPos = stripos($chapter, ">", $htmlPos);
|
||
|
$newXML = substr($chapter, 0, $htmlEndPos+1) . "\n</html>";
|
||
|
if (strpos(trim($newXML), "<?xml ") === FALSE) {
|
||
|
$newXML = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $newXML;
|
||
|
}
|
||
|
$headerLength = strlen($newXML);
|
||
|
|
||
|
$files = array();
|
||
|
$chapterNames = array();
|
||
|
$domDepth = 0;
|
||
|
$domPath = array();
|
||
|
$domClonedPath = array();
|
||
|
|
||
|
$curFile = $xmlDoc->createDocumentFragment();
|
||
|
$files[] = $curFile;
|
||
|
$curParent = $curFile;
|
||
|
$curSize = 0;
|
||
|
|
||
|
$bodyLen = strlen($xmlDoc->saveXML($body->item(0)));
|
||
|
$headLen = strlen($xmlDoc->saveXML($head->item(0))) + $headerLength;
|
||
|
|
||
|
$partSize = $this->splitDefaultSize - $headLen;
|
||
|
|
||
|
if ($bodyLen > $partSize) {
|
||
|
$parts = ceil($bodyLen / $partSize);
|
||
|
$partSize = ($bodyLen / $parts) - $headLen;
|
||
|
}
|
||
|
|
||
|
$node = $body->item(0)->firstChild;
|
||
|
|
||
|
do {
|
||
|
$nodeData = $xmlDoc->saveXML($node);
|
||
|
$nodeLen = strlen($nodeData);
|
||
|
|
||
|
if ($nodeLen > $partSize && $node->hasChildNodes()) {
|
||
|
$domPath[] = $node;
|
||
|
$domClonedPath[] = $node->cloneNode(false);
|
||
|
$domDepth++;
|
||
|
|
||
|
$node = $node->firstChild;
|
||
|
}
|
||
|
|
||
|
$node2 = $node->nextSibling;
|
||
|
|
||
|
if ($node != null && $node->nodeName != "#text") {
|
||
|
$doSplit = false;
|
||
|
if ($splitOnSearchString) {
|
||
|
$doSplit = preg_match($searchString, $nodeData) == 1;
|
||
|
if ($doSplit) {
|
||
|
$chapterNames[] = trim($nodeData);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if ($curSize > 0 && ($doSplit || (!$splitOnSearchString && $curSize + $nodeLen > $partSize))) {
|
||
|
$curFile = $xmlDoc->createDocumentFragment();
|
||
|
$files[] = $curFile;
|
||
|
$curParent = $curFile;
|
||
|
if ($domDepth > 0) {
|
||
|
reset($domPath);
|
||
|
reset($domClonedPath);
|
||
|
$oneDomClonedPath = each($domClonedPath);
|
||
|
while ($oneDomClonedPath) {
|
||
|
list($k, $v) = $oneDomClonedPath;
|
||
|
$newParent = $v->cloneNode(false);
|
||
|
$curParent->appendChild($newParent);
|
||
|
$curParent = $newParent;
|
||
|
$oneDomClonedPath = each($domClonedPath);
|
||
|
}
|
||
|
}
|
||
|
$curSize = strlen($xmlDoc->saveXML($curFile));
|
||
|
}
|
||
|
$curParent->appendChild($node->cloneNode(true));
|
||
|
$curSize += $nodeLen;
|
||
|
}
|
||
|
|
||
|
$node = $node2;
|
||
|
while ($node == null && $domDepth > 0) {
|
||
|
$domDepth--;
|
||
|
$node = end($domPath)->nextSibling;
|
||
|
array_pop($domPath);
|
||
|
array_pop($domClonedPath);
|
||
|
$curParent = $curParent->parentNode;
|
||
|
}
|
||
|
} while ($node != null);
|
||
|
|
||
|
$curFile = null;
|
||
|
$curSize = 0;
|
||
|
|
||
|
$xml = new DOMDocument('1.0', $xmlDoc->xmlEncoding);
|
||
|
$xml->lookupPrefix("http://www.w3.org/1999/xhtml");
|
||
|
$xml->preserveWhiteSpace = false;
|
||
|
$xml->formatOutput = true;
|
||
|
|
||
|
for ($idx = 0; $idx < count($files); $idx++) {
|
||
|
$xml2Doc = new DOMDocument('1.0', $xmlDoc->xmlEncoding);
|
||
|
$xml2Doc->lookupPrefix("http://www.w3.org/1999/xhtml");
|
||
|
$xml2Doc->loadXML($newXML);
|
||
|
$html = $xml2Doc->getElementsByTagName("html")->item(0);
|
||
|
$html->appendChild($xml2Doc->importNode($head->item(0), true));
|
||
|
$body = $xml2Doc->createElement("body");
|
||
|
$html->appendChild($body);
|
||
|
$body->appendChild($xml2Doc->importNode($files[$idx], true));
|
||
|
|
||
|
// force pretty printing and correct formatting, should not be needed, but it is.
|
||
|
$xml->loadXML($xml2Doc->saveXML());
|
||
|
|
||
|
$doc = $xml->saveXML();
|
||
|
|
||
|
if ($this->bookVersion === EPub::BOOK_VERSION_EPUB3) {
|
||
|
$doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);
|
||
|
}
|
||
|
|
||
|
$chapterData[$splitOnSearchString ? $chapterNames[$idx] : $idx] = $doc;
|
||
|
}
|
||
|
|
||
|
return $chapterData;
|
||
|
}
|
||
|
}
|
||
|
?>
|