mirror of
				https://github.com/wallabag/wallabag.git
				synced 2025-10-20 19:52:09 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			201 lines
		
	
	
	
		
			7.2 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			201 lines
		
	
	
	
		
			7.2 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| <?php
 | |
| /**
 | |
|  * Split an HTML file into smaller html files, retaining the formatting and structure for the individual parts.
 | |
|  * The splitter uses DOM to try to retain any formatting in the file, including rebuilding the DOM tree for subsequent parts.
 | |
|  * Split size is considered max target size. The actual size is the result of an even split across the resulting files.
 | |
|  *
 | |
|  * @author A. Grandt <php@grandt.com>
 | |
|  * @copyright 2009-2014 A. Grandt
 | |
|  * @license GNU LGPL 2.1
 | |
|  * @link http://www.phpclasses.org/package/6115
 | |
|  * @link https://github.com/Grandt/PHPePub
 | |
|  * @version 3.20
 | |
|  */
 | |
class EPubChapterSplitter {
    const VERSION = 3.20;

    /** @var int Target maximum size, in bytes, of each generated part. */
    private $splitDefaultSize = 250000;

    /** @var string Book version; parts for EPub::BOOK_VERSION_EPUB3 have their DOCTYPE removed. */
    private $bookVersion = EPub::BOOK_VERSION_EPUB2;

    /**
     * Set the book version the generated parts are formatted for.
     *
     * String values are trimmed and stored as given; any non-string value
     * falls back to EPub::BOOK_VERSION_EPUB2.
     *
     * @param mixed $bookVersion book version identifier
     * @return void
     */
    public function setVersion($bookVersion) {
        $this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;
    }

    /**
     * Set default chapter target size.
     * Default is 250000 bytes, and minimum is 10240 bytes.
     *
     * @param mixed $size segment size in bytes; cast to int before use
     * @return void
     */
    public function setSplitSize($size) {
        // Clamp the *cast* value: comparing the raw input lets non-numeric strings
        // slip past the minimum on PHP 8 and leave the size at 0.
        // Making the file smaller than 10k is not a good idea.
        $this->splitDefaultSize = max(10240, (int) $size);
    }

    /**
     * Get the chapter target size.
     *
     * @return int target size in bytes
     */
    public function getSplitSize() {
        return $this->splitDefaultSize;
    }

    /**
     * Split $chapter into multiple parts.
     *
     * The search string can either be a regular string or a PCRE pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php
     * If the search string is a regular string, it is matched against serialized nodes of the form
     * "<tag ...>" immediately followed by the given string.
     *
     * The chapter is returned unsplit (as a single-element array) when it is empty, when size based
     * splitting is used and it already fits the target size, or when it has no <html tag or no
     * body content to rebuild the parts from.
     *
     * NOTE: text nodes encountered while walking the body (directly in the body, or directly inside
     * an element that is too large and therefore descended into) are not copied into any part.
     * Ancestor elements are only re-created when a new part is started; nodes added to the part in
     * which a descent happened are appended to the parent that was current at that time.
     *
     * @param String $chapter XHTML file
     * @param Bool   $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check.
     * @param String $searchString Chapter string to search for can be fixed text, or a regular expression pattern.
     *
     * @return array with 1 or more parts. Keys are the zero-based part index, or, when splitting on a
     *               search string, the trimmed serialized XML of the matching node that starts the part
     *               (parts that were not started by a match keep their numeric index).
     */
    public function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {
        $chapterLength = strlen($chapter);
        if ($chapterLength === 0 || (!$splitOnSearchString && $chapterLength <= $this->splitDefaultSize)) {
            return array($chapter);
        }

        // The opening <html ...> tag is reused as the shell of every part; without it nothing can be rebuilt.
        $htmlPos = stripos($chapter, "<html");
        $htmlEndPos = ($htmlPos === false) ? false : stripos($chapter, ">", $htmlPos);
        if ($htmlEndPos === false) {
            return array($chapter);
        }

        if ($splitOnSearchString) {
            $isSearchRegexp = preg_match('#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m', $searchString) == 1;
            if (!$isSearchRegexp) {
                // Plain text: match a serialized node whose opening tag is directly followed by the text.
                $searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";
            }
        }

        // Parse leniently; markup warnings are collected by libxml instead of being emitted.
        $xmlDoc = new DOMDocument();
        $previousLibxmlMode = libxml_use_internal_errors(true);
        $xmlDoc->loadHTML($chapter);
        libxml_clear_errors();
        libxml_use_internal_errors($previousLibxmlMode);

        $headNode = $xmlDoc->getElementsByTagName("head")->item(0);
        $bodyNode = $xmlDoc->getElementsByTagName("body")->item(0);
        if ($bodyNode === null || $bodyNode->firstChild === null) {
            return array($chapter);
        }

        $newXML = substr($chapter, 0, $htmlEndPos + 1) . "\n</html>";
        if (strpos(trim($newXML), "<?xml ") === false) {
            $newXML = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $newXML;
        }

        // Per-part overhead: the document shell plus the (optional) head element.
        $headLen = strlen($newXML);
        if ($headNode !== null) {
            $headLen += strlen($xmlDoc->saveXML($headNode));
        }
        $bodyLen = strlen($xmlDoc->saveXML($bodyNode));

        $partSize = $this->splitDefaultSize - $headLen;
        if ($partSize < 1) {
            // Overhead alone reaches the target size; keep the divisor below strictly positive.
            $partSize = 1;
        }
        if ($bodyLen > $partSize) {
            // Spread the body evenly over the number of parts needed.
            // NOTE(review): $headLen is subtracted a second time here, as in the original
            // implementation; this only makes the parts smaller than strictly necessary.
            $parts = ceil($bodyLen / $partSize);
            $partSize = ($bodyLen / $parts) - $headLen;
        }

        $curFile = $xmlDoc->createDocumentFragment();
        $files = array($curFile);
        $fileNames = array();      // part index => chapter name (search string mode only)
        $domPath = array();        // elements currently descended into, outermost first
        $domClonedPath = array();  // shallow clones of $domPath, templates for wrappers in new parts
        $wrappedDepth = 0;         // how many levels of $domPath have a wrapper in the current part
        $curParent = $curFile;
        $curSize = 0;

        $node = $bodyNode->firstChild;
        while ($node !== null) {
            $nodeData = $xmlDoc->saveXML($node);
            $nodeLen = strlen($nodeData);

            if ($nodeLen > $partSize && $node->hasChildNodes()) {
                // Too large to place as a whole: walk its children instead, and evaluate
                // each child with its own serialized data on the next iteration.
                $domPath[] = $node;
                $domClonedPath[] = $node->cloneNode(false);
                $node = $node->firstChild;
                continue;
            }

            if ($node->nodeName !== "#text") {
                $doSplit = $splitOnSearchString && preg_match($searchString, $nodeData) == 1;

                if ($curSize > 0 && ($doSplit || (!$splitOnSearchString && $curSize + $nodeLen > $partSize))) {
                    // Start a new part and re-create the chain of ancestors inside it.
                    $curFile = $xmlDoc->createDocumentFragment();
                    $files[] = $curFile;
                    $curParent = $curFile;
                    foreach ($domClonedPath as $ancestorTemplate) {
                        $wrapper = $ancestorTemplate->cloneNode(false);
                        $curParent->appendChild($wrapper);
                        $curParent = $wrapper;
                    }
                    $wrappedDepth = count($domClonedPath);
                    $curSize = strlen($xmlDoc->saveXML($curFile));
                }

                if ($doSplit) {
                    // The matching node always ends up in the part that is current at this point.
                    $fileNames[count($files) - 1] = trim($nodeData);
                }

                $curParent->appendChild($node->cloneNode(true));
                $curSize += $nodeLen;
            }

            $node = $node->nextSibling;
            while ($node === null && count($domPath) > 0) {
                // Finished the children of the innermost descended element: continue after it.
                $leavingDepth = count($domPath);
                $ancestor = array_pop($domPath);
                array_pop($domClonedPath);
                if ($leavingDepth <= $wrappedDepth) {
                    // Only climb when the current part actually holds a wrapper for this level.
                    $curParent = $curParent->parentNode;
                    $wrappedDepth--;
                }
                $node = $ancestor->nextSibling;
            }
        }

        // xmlEncoding may be null for HTML input; the constructor expects a string.
        $encoding = (string) $xmlDoc->xmlEncoding;

        $xml = new DOMDocument('1.0', $encoding);
        $xml->preserveWhiteSpace = false;
        $xml->formatOutput = true;

        $chapterData = array();
        foreach ($files as $idx => $fragment) {
            $xml2Doc = new DOMDocument('1.0', $encoding);
            $xml2Doc->loadXML($newXML);
            $html = $xml2Doc->getElementsByTagName("html")->item(0);
            if ($headNode !== null) {
                $html->appendChild($xml2Doc->importNode($headNode, true));
            }
            $body = $xml2Doc->createElement("body");
            $html->appendChild($body);
            if ($fragment->hasChildNodes()) {
                $body->appendChild($xml2Doc->importNode($fragment, true));
            }

            // force pretty printing and correct formatting, should not be needed, but it is.
            $xml->loadXML($xml2Doc->saveXML());
            $doc = $xml->saveXML();

            if ($this->bookVersion === EPub::BOOK_VERSION_EPUB3) {
                $doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);
            }

            $key = ($splitOnSearchString && isset($fileNames[$idx])) ? $fileNames[$idx] : $idx;
            $chapterData[$key] = $doc;
        }

        return $chapterData;
    }
}
 | |
| ?>
 |