wallabag/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php

<?php
/**
 * Split an HTML file into smaller html files, retaining the formatting and structure for the individual parts.
 * What this splitter does is using DOM to try and retain any formatting in the file, including rebuilding the DOM tree for subsequent parts.
 * Split size is considered max target size. The actual size is the result of an even split across the resulting files.
 *
 * @author A. Grandt <php@grandt.com>
 * @copyright 2009-2014 A. Grandt
 * @license GNU LGPL 2.1
 * @link http://www.phpclasses.org/package/6115
 * @link https://github.com/Grandt/PHPePub
 * @version 3.20
 */
class EPubChapterSplitter {
    const VERSION = 3.20;

    /** Lower bound, in bytes, enforced by setSplitSize(). */
    const MIN_SPLIT_SIZE = 10240;

    /** @var int Target size in bytes for one generated part. */
    private $splitDefaultSize = 250000;

    /** @var string EPub version the generated parts are meant for. */
    private $bookVersion = EPub::BOOK_VERSION_EPUB2;

    /**
     * Set the EPub version the generated parts are meant for.
     *
     * When the version equals EPub::BOOK_VERSION_EPUB3, splitChapter() strips
     * the DOCTYPE declaration from every part it returns.
     *
     * @param string $bookVersion Version identifier; surrounding whitespace is trimmed.
     *                            Any non-string value selects EPub::BOOK_VERSION_EPUB2.
     * @return void
     */
    public function setVersion($bookVersion) {
        $this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;
    }

    /**
     * Set default chapter target size.
     * Default is 250000 bytes, and minimum is 10240 bytes.
     *
     * @param int $size segment size in bytes
     * @return void
     */
    public function setSplitSize($size) {
        // Compare the integer value, not the raw argument: on PHP 8 a non-numeric
        // string compared with an int is compared as a string, which let values
        // such as 'abc' bypass the minimum and store a size of 0.
        // Making the file smaller than 10k is not a good idea.
        $this->splitDefaultSize = max(self::MIN_SPLIT_SIZE, (int)$size);
    }

    /**
     * Get the chapter target size.
     *
     * @return int size in bytes
     */
    public function getSplitSize() {
        return $this->splitDefaultSize;
    }

    /**
     * Split $chapter into multiple parts.
     *
     * The search string can either be a regular string or a PCRE pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php
     * If the search string is a regular string, it is matched against the serialized element: an opening tag immediately followed by the string.
     *
     * Without a search string, a chapter that already fits the split size is returned unchanged as a single part.
     * A chapter that is blank, or whose parsed body is missing or empty, is also returned unchanged as a single part.
     *
     * @param String $chapter XHTML file
     * @param Bool   $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check.
     * @param String $searchString Chapter string to search for can be fixed text, or a regular expression pattern.
     *
     * @return array with 1 or more parts. Keys are 0-based part numbers when splitting on size. When splitting on
     *               the search string, a part is keyed by the trimmed XML of the element that started it; a part
     *               without such an element (content before the first match), or whose name is already taken,
     *               is keyed by its part number instead.
     */
    public function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {
        $isSearchRegexp = $splitOnSearchString && (preg_match('#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m', $searchString) == 1);
        if ($splitOnSearchString && !$isSearchRegexp) {
            $searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";
        }

        if (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize) {
            return array($chapter);
        }
        if (trim($chapter) === '') {
            // DOMDocument::loadHTML() rejects empty input (ValueError on PHP 8).
            return array($chapter);
        }

        $xmlDoc = new DOMDocument();
        // Parser warnings about imperfect markup are expected here and deliberately muted.
        @$xmlDoc->loadHTML($chapter);

        $headNode = $xmlDoc->getElementsByTagName("head")->item(0);
        $bodyNode = $xmlDoc->getElementsByTagName("body")->item(0);
        if ($bodyNode === null || $bodyNode->firstChild === null) {
            // Nothing that could be distributed over several parts.
            return array($chapter);
        }

        $newXML = $this->buildDocumentSkeleton($chapter);
        $headerLength = strlen($newXML);

        $bodyLen = strlen($xmlDoc->saveXML($bodyNode));
        $headLen = ($headNode === null ? 0 : strlen($xmlDoc->saveXML($headNode))) + $headerLength;

        // Keep the divisor below strictly positive; a header as large as the
        // split size would otherwise cause a division by zero.
        $partSize = max(1, $this->splitDefaultSize - $headLen);

        if ($bodyLen > $partSize) {
            // Spread the body evenly over the number of parts needed.
            // NOTE(review): $headLen is subtracted a second time here, exactly as in
            // the original implementation; that makes parts smaller than strictly needed.
            $parts = ceil($bodyLen / $partSize);
            $partSize = ($bodyLen / $parts) - $headLen;
        }

        $files = array();         // one DOMDocumentFragment per generated part
        $fileNames = array();     // part index => chapter name (search string mode only)
        $domDepth = 0;            // number of source elements currently descended into
        $outDepth = 0;            // number of wrapper elements open in the current part
        $domPath = array();       // the source elements descended into
        $domClonedPath = array(); // shallow clones of those elements, used to rebuild wrappers

        $curFile = $xmlDoc->createDocumentFragment();
        $files[] = $curFile;
        $curParent = $curFile;
        $curSize = 0;

        $node = $bodyNode->firstChild;

        do {
            $nodeData = $xmlDoc->saveXML($node);
            $nodeLen = strlen($nodeData);

            if ($nodeLen > $partSize && $node->hasChildNodes()) {
                // Too large for a single part: continue with its children instead.
                // NOTE(review): $nodeData and $nodeLen keep describing the parent for
                // this iteration (original behaviour, kept as is).
                $domPath[] = $node;
                $domClonedPath[] = $node->cloneNode(false);
                $domDepth++;

                $node = $node->firstChild;
            }

            $node2 = $node->nextSibling;

            // NOTE(review): text nodes are skipped, so text that is a direct child of
            // the body or of an element that was descended into is not copied.
            if ($node->nodeName != "#text") {
                $doSplit = false;
                if ($splitOnSearchString) {
                    $doSplit = preg_match($searchString, $nodeData) == 1;
                }

                if ($curSize > 0 && ($doSplit || (!$splitOnSearchString && $curSize + $nodeLen > $partSize))) {
                    $curFile = $xmlDoc->createDocumentFragment();
                    $files[] = $curFile;
                    $curParent = $curFile;
                    // Re-open every element we are currently inside of in the new part.
                    foreach ($domClonedPath as $ancestor) {
                        $newParent = $ancestor->cloneNode(false);
                        $curParent->appendChild($newParent);
                        $curParent = $newParent;
                    }
                    $outDepth = $domDepth;
                    $curSize = strlen($xmlDoc->saveXML($curFile));
                }

                if ($doSplit) {
                    // Bind the name to the part that receives the matching node, so
                    // names and parts cannot drift apart when content precedes the
                    // first match.
                    $fileNames[count($files) - 1] = trim($nodeData);
                }

                $curParent->appendChild($node->cloneNode(true));
                $curSize += $nodeLen;
            }

            $node = $node2;
            while ($node === null && $domDepth > 0) {
                $domDepth--;
                $node = array_pop($domPath)->nextSibling;
                array_pop($domClonedPath);
                // Wrappers exist only for the levels that were open at the last split,
                // so leave a wrapper only if one was created for the level just left.
                if ($outDepth > $domDepth) {
                    $curParent = $curParent->parentNode;
                    $outDepth--;
                }
            }
        } while ($node !== null);

        $encoding = (string)$xmlDoc->xmlEncoding;

        $xml = new DOMDocument('1.0', $encoding);
        $xml->preserveWhiteSpace = false;
        $xml->formatOutput = true;

        $chapterData = array();
        foreach ($files as $idx => $fragment) {
            $xml2Doc = new DOMDocument('1.0', $encoding);
            $xml2Doc->loadXML($newXML);
            $html = $xml2Doc->getElementsByTagName("html")->item(0);
            if ($headNode !== null) {
                $html->appendChild($xml2Doc->importNode($headNode, true));
            }
            $partBody = $xml2Doc->createElement("body");
            $html->appendChild($partBody);
            if ($fragment->hasChildNodes()) {
                $partBody->appendChild($xml2Doc->importNode($fragment, true));
            }

            // force pretty printing and correct formatting, should not be needed, but it is.
            $xml->loadXML($xml2Doc->saveXML());

            $doc = $xml->saveXML();

            if ($this->bookVersion === EPub::BOOK_VERSION_EPUB3) {
                $doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);
            }

            $key = ($splitOnSearchString && isset($fileNames[$idx])) ? $fileNames[$idx] : $idx;
            if (isset($chapterData[$key])) {
                // Two parts with the same name: keep both instead of overwriting the first.
                $key = $idx;
            }
            $chapterData[$key] = $doc;
        }

        return $chapterData;
    }

    /**
     * Build the empty XML document every generated part starts from.
     *
     * The skeleton is everything in $chapter up to and including its opening html
     * tag, followed by a closing html tag, with an XML declaration prepended when
     * that text contains none. If $chapter has no html tag, or the text in front
     * of it is not well-formed XML, a minimal XHTML skeleton is used instead.
     *
     * @param String $chapter XHTML file
     *
     * @return String well-formed XML document containing an empty html element
     */
    private function buildDocumentSkeleton($chapter) {
        $xmlDeclaration = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n";
        $skeleton = null;

        $htmlPos = stripos($chapter, "<html");
        $htmlEndPos = ($htmlPos === false) ? false : strpos($chapter, ">", $htmlPos);
        if ($htmlEndPos !== false) {
            $skeleton = substr($chapter, 0, $htmlEndPos + 1) . "\n</html>";
            if (strpos(trim($skeleton), "<?xml ") === false) {
                $skeleton = $xmlDeclaration . $skeleton;
            }

            // Each part is created by loading this text as XML, so verify it once here.
            $probe = new DOMDocument();
            if (!@$probe->loadXML($skeleton) || $probe->getElementsByTagName("html")->length === 0) {
                $skeleton = null;
            }
        }

        if ($skeleton === null) {
            $skeleton = $xmlDeclaration . "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n</html>";
        }

        return $skeleton;
    }
}
?>
Added epub export function 2014-04-24 01:04:02 +00:00			`<?php`
			`/**`
			`* Split an HTML file into smaller html files, retaining the formatting and structure for the individual parts.`
			`* What this splitter does is using DOM to try and retain any formatting in the file, including rebuilding the DOM tree for subsequent parts.`
			`* Split size is considered max target size. The actual size is the result of an even split across the resulting files.`
			`*`
			`* @author A. Grandt <php@grandt.com>`
			`* @copyright 2009-2014 A. Grandt`
			`* @license GNU LGPL 2.1`
			`* @link http://www.phpclasses.org/package/6115`
			`* @link https://github.com/Grandt/PHPePub`
			`* @version 3.20`
			`*/`
			`class EPubChapterSplitter {`
			`const VERSION = 3.20;`

			`private $splitDefaultSize = 250000;`
			`private $bookVersion = EPub::BOOK_VERSION_EPUB2;`

			`/**`
			`*`
			`* Enter description here ...`
			`*`
			`* @param unknown_type $ident`
			`*/`
			`function setVersion($bookVersion) {`
			`$this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;`
			`}`

			`/**`
			`* Set default chapter target size.`
			`* Default is 250000 bytes, and minimum is 10240 bytes.`
			`*`
			`* @param $size segment size in bytes`
			`* @return void`
			`*/`
			`function setSplitSize($size) {`
			`$this->splitDefaultSize = (int)$size;`
			`if ($size < 10240) {`
			`$this->splitDefaultSize = 10240; // Making the file smaller than 10k is not a good idea.`
			`}`
			`}`

			`/**`
			`* Get the chapter target size.`
			`*`
			`* @return $size`
			`*/`
			`function getSplitSize() {`
			`return $this->splitDefaultSize;`
			`}`

			`/**`
			`* Split $chapter into multiple parts.`
			`*`
			`* The search string can either be a regular string or a PHP PECL Regular Expression pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php`
			`* If the search string is a regular string, the matching will be for lines in the HTML starting with the string given`
			`*`
			`* @param String $chapter XHTML file`
			`* @param Bool $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check.`
			`* @param String $searchString Chapter string to search for can be fixed text, or a regular expression pattern.`
			`*`
			`* @return array with 1 or more parts`
			`*/`
			`function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {`
			`$chapterData = array();`
			`$isSearchRegexp = $splitOnSearchString && (preg_match('#^(\D\|\S\|\W).+\1[imsxeADSUXJu]*$#m', $searchString) == 1);`
			`if ($splitOnSearchString && !$isSearchRegexp) {`
			`$searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";`
			`}`

			`if (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize) {`
			`return array($chapter);`
			`}`

			`$xmlDoc = new DOMDocument();`
			`@$xmlDoc->loadHTML($chapter);`

			`$head = $xmlDoc->getElementsByTagName("head");`
			`$body = $xmlDoc->getElementsByTagName("body");`

			`$htmlPos = stripos($chapter, "<html");`
			`$htmlEndPos = stripos($chapter, ">", $htmlPos);`
			`$newXML = substr($chapter, 0, $htmlEndPos+1) . "\n</html>";`
			`if (strpos(trim($newXML), "<?xml ") === FALSE) {`
			`$newXML = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $newXML;`
			`}`
			`$headerLength = strlen($newXML);`

			`$files = array();`
			`$chapterNames = array();`
			`$domDepth = 0;`
			`$domPath = array();`
			`$domClonedPath = array();`

			`$curFile = $xmlDoc->createDocumentFragment();`
			`$files[] = $curFile;`
			`$curParent = $curFile;`
			`$curSize = 0;`

			`$bodyLen = strlen($xmlDoc->saveXML($body->item(0)));`
			`$headLen = strlen($xmlDoc->saveXML($head->item(0))) + $headerLength;`

			`$partSize = $this->splitDefaultSize - $headLen;`

			`if ($bodyLen > $partSize) {`
			`$parts = ceil($bodyLen / $partSize);`
			`$partSize = ($bodyLen / $parts) - $headLen;`
			`}`

			`$node = $body->item(0)->firstChild;`

			`do {`
			`$nodeData = $xmlDoc->saveXML($node);`
			`$nodeLen = strlen($nodeData);`

			`if ($nodeLen > $partSize && $node->hasChildNodes()) {`
			`$domPath[] = $node;`
			`$domClonedPath[] = $node->cloneNode(false);`
			`$domDepth++;`

			`$node = $node->firstChild;`
			`}`

			`$node2 = $node->nextSibling;`

			`if ($node != null && $node->nodeName != "#text") {`
			`$doSplit = false;`
			`if ($splitOnSearchString) {`
			`$doSplit = preg_match($searchString, $nodeData) == 1;`
			`if ($doSplit) {`
			`$chapterNames[] = trim($nodeData);`
			`}`
			`}`

			`if ($curSize > 0 && ($doSplit \|\| (!$splitOnSearchString && $curSize + $nodeLen > $partSize))) {`
			`$curFile = $xmlDoc->createDocumentFragment();`
			`$files[] = $curFile;`
			`$curParent = $curFile;`
			`if ($domDepth > 0) {`
			`reset($domPath);`
			`reset($domClonedPath);`
			`$oneDomClonedPath = each($domClonedPath);`
			`while ($oneDomClonedPath) {`
			`list($k, $v) = $oneDomClonedPath;`
			`$newParent = $v->cloneNode(false);`
			`$curParent->appendChild($newParent);`
			`$curParent = $newParent;`
			`$oneDomClonedPath = each($domClonedPath);`
			`}`
			`}`
			`$curSize = strlen($xmlDoc->saveXML($curFile));`
			`}`
			`$curParent->appendChild($node->cloneNode(true));`
			`$curSize += $nodeLen;`
			`}`

			`$node = $node2;`
			`while ($node == null && $domDepth > 0) {`
			`$domDepth--;`
			`$node = end($domPath)->nextSibling;`
			`array_pop($domPath);`
			`array_pop($domClonedPath);`
			`$curParent = $curParent->parentNode;`
			`}`
			`} while ($node != null);`

			`$curFile = null;`
			`$curSize = 0;`

			`$xml = new DOMDocument('1.0', $xmlDoc->xmlEncoding);`
			`$xml->lookupPrefix("http://www.w3.org/1999/xhtml");`
			`$xml->preserveWhiteSpace = false;`
			`$xml->formatOutput = true;`

			`for ($idx = 0; $idx < count($files); $idx++) {`
			`$xml2Doc = new DOMDocument('1.0', $xmlDoc->xmlEncoding);`
			`$xml2Doc->lookupPrefix("http://www.w3.org/1999/xhtml");`
			`$xml2Doc->loadXML($newXML);`
			`$html = $xml2Doc->getElementsByTagName("html")->item(0);`
			`$html->appendChild($xml2Doc->importNode($head->item(0), true));`
			`$body = $xml2Doc->createElement("body");`
			`$html->appendChild($body);`
			`$body->appendChild($xml2Doc->importNode($files[$idx], true));`

			`// force pretty printing and correct formatting, should not be needed, but it is.`
			`$xml->loadXML($xml2Doc->saveXML());`

			`$doc = $xml->saveXML();`

			`if ($this->bookVersion === EPub::BOOK_VERSION_EPUB3) {`
			`$doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);`
			`}`

			`$chapterData[$splitOnSearchString ? $chapterNames[$idx] : $idx] = $doc;`
			`}`

			`return $chapterData;`
			`}`
			`}`
			`?>`