StringParser.php 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Pdf
  17. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  18. * @license http://framework.zend.com/license/new-bsd New BSD License
  19. * @version $Id$
  20. */
  21. /** Zend_Pdf_Element */
  22. require_once 'Zend/Pdf/Element.php';
  23. /** Zend_Pdf_Element_Array */
  24. require_once 'Zend/Pdf/Element/Array.php';
  25. /** Zend_Pdf_Element_String_Binary */
  26. require_once 'Zend/Pdf/Element/String/Binary.php';
  27. /** Zend_Pdf_Element_Boolean */
  28. require_once 'Zend/Pdf/Element/Boolean.php';
  29. /** Zend_Pdf_Element_Dictionary */
  30. require_once 'Zend/Pdf/Element/Dictionary.php';
  31. /** Zend_Pdf_Element_Name */
  32. require_once 'Zend/Pdf/Element/Name.php';
  33. /** Zend_Pdf_Element_Numeric */
  34. require_once 'Zend/Pdf/Element/Numeric.php';
  35. /** Zend_Pdf_Element_Object */
  36. require_once 'Zend/Pdf/Element/Object.php';
  37. /** Zend_Pdf_Element_Reference */
  38. require_once 'Zend/Pdf/Element/Reference.php';
  39. /** Zend_Pdf_Element_Object_Stream */
  40. require_once 'Zend/Pdf/Element/Object/Stream.php';
  41. /** Zend_Pdf_Element_String */
  42. require_once 'Zend/Pdf/Element/String.php';
  43. /** Zend_Pdf_Element_Null */
  44. require_once 'Zend/Pdf/Element/Null.php';
  45. /** Zend_Pdf_Element_Reference_Context */
  46. require_once 'Zend/Pdf/Element/Reference/Context.php';
  47. /** Zend_Pdf_Element_Reference_Table */
  48. require_once 'Zend/Pdf/Element/Reference/Table.php';
  49. /** Zend_Pdf_ElementFactory_Interface */
  50. require_once 'Zend/Pdf/ElementFactory/Interface.php';
  51. /** Zend_Pdf_PhpArray */
  52. require_once 'Zend/Pdf/PhpArray.php';
  53. /**
  54. * PDF string parser
  55. *
  56. * @package Zend_Pdf
  57. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  58. * @license http://framework.zend.com/license/new-bsd New BSD License
  59. */
  60. class Zend_Pdf_StringParser
  61. {
  62. /**
  63. * Raw PDF source being parsed (assigned by the constructor)
  64. *
  65. * @var string
  66. */
  67. public $data = '';
  68. /**
  69. * Current read position in $data; advanced by the skip*() and read*() methods
  70. *
  71. * @var integer
  72. */
  73. public $offset = 0;
  74. /**
  75. * Reference context passed to every reference element the parser creates
  76. *
  77. * @var Zend_Pdf_Element_Reference_Context
  78. */
  79. private $_context = null;
  80. /**
  81. * Elements collected by readElement() for the currently parsed object/trailer
  82. *
  83. * @var array
  84. */
  85. private $_elements = array();
  86. /**
  87. * PDF objects factory; its resolve() result is handed to created references and objects
  88. *
  89. * @var Zend_Pdf_ElementFactory_Interface
  90. */
  91. private $_objFactory = null;
  /**
   * Release everything the parser holds on to.
   *
   * Drops the object factory, the list of collected elements and the
   * reference context so that cyclic object references through them
   * are removed. The source data and the current offset are left untouched.
   */
  public function cleanUp()
  {
      $this->_objFactory = null;
      $this->_elements   = array();
      $this->_context    = null;
  }
  /**
   * Tell whether a character code denotes a white-space character.
   *
   * Recognised codes: NUL (0x00), tab (0x09), line feed (0x0A),
   * form feed (0x0C), carriage return (0x0D) and space (0x20).
   *
   * @param integer $chCode  Character code, e.g. the result of ord()
   * @return boolean
   */
  public static function isWhiteSpace($chCode)
  {
      // switch compares loosely, exactly like a chain of "==" tests
      switch ($chCode) {
          case 0x00: // null character
          case 0x09: // Tab
          case 0x0A: // Line feed
          case 0x0C: // Form Feed
          case 0x0D: // Carriage return
          case 0x20: // Space
              return true;

          default:
              return false;
      }
  }
  /**
   * Tell whether a character code denotes a delimiter character.
   *
   * Delimiters are: ( ) < > [ ] { } / %
   *
   * @param integer $chCode  Character code, e.g. the result of ord()
   * @return boolean
   */
  public static function isDelimiter($chCode )
  {
      $delimiterCodes = array(
          0x28, // '('
          0x29, // ')'
          0x3C, // '<'
          0x3E, // '>'
          0x5B, // '['
          0x5D, // ']'
          0x7B, // '{'
          0x7D, // '}'
          0x2F, // '/'
          0x25, // '%'
      );

      // Non-strict lookup on purpose: it matches a chain of "==" comparisons
      return in_array($chCode, $delimiterCodes);
  }
  /**
   * Advance the offset past white space and, optionally, comments.
   *
   * Stops at the first character that is neither white space nor
   * (when $skipComment is true) the start of a comment, or at the end
   * of the data.
   *
   * @param boolean $skipComment  When true, a '%' hands over to skipComment()
   */
  public function skipWhiteSpace($skipComment = true)
  {
      $length = strlen($this->data);

      while ($this->offset < $length) {
          $chCode = ord($this->data[$this->offset]);

          if (self::isWhiteSpace($chCode)) {
              $this->offset++;
              continue;
          }

          if ($skipComment && $chCode == 0x25) { // '%'
              $this->skipComment();
              continue;
          }

          // First significant character reached
          return;
      }
  }
  /**
   * Skip the remainder of a comment.
   *
   * Advances the offset until it points at an end-of-line character
   * (line feed or carriage return) or reaches the end of the data.
   * The end-of-line character itself is not consumed.
   */
  public function skipComment()
  {
      $length = strlen($this->data);

      while ($this->offset < $length) {
          $chCode = ord($this->data[$this->offset]);

          // Stop at either end-of-line character. Testing "!= LF || != CR"
          // is always true and would swallow everything up to the end of data.
          if ($chCode == 0x0A || // Line feed
              $chCode == 0x0D    // Carriage return
          ) {
              return;
          }

          $this->offset++;
      }
  }
  /**
   * Read a comment line.
   *
   * Skips leading white space (comments are not skipped), then, if the
   * next character is '%', consumes everything up to - but not including -
   * the next line feed or carriage return (or the end of the data).
   *
   * @return string  The comment including its leading '%', or an empty
   *                 string when the next token is not a comment or the
   *                 end of the data has been reached
   */
  public function readComment()
  {
      $this->skipWhiteSpace(false);

      $length = strlen($this->data);

      // Guard against reading past the end of the data before peeking
      if ($this->offset >= $length || $this->data[$this->offset] != '%') {
          return '';
      }

      $start = $this->offset;
      while ($this->offset < $length) {
          $chCode = ord($this->data[$this->offset]);

          if ($chCode == 0x0A || // Line feed
              $chCode == 0x0D    // Carriage return
          ) {
              break;
          }

          $this->offset++;
      }

      return substr($this->data, $start, $this->offset - $start);
  }
  /**
   * Return the next lexeme from the PDF data.
   *
   * Leading white space and comments are skipped first. A lexeme is either
   *  - one of the two-character delimiters '<<' / '>>',
   *  - a single delimiter character, or
   *  - a run of characters up to the next delimiter or white space.
   *
   * @return string  The lexeme, or an empty string at the end of the data
   */
  public function readLexeme()
  {
      $this->skipWhiteSpace();

      $length = strlen($this->data);
      if ($this->offset >= $length) {
          return '';
      }

      $start     = $this->offset;
      $firstChar = $this->data[$start];

      if (!self::isDelimiter(ord($firstChar))) {
          // Regular token: consume up to the next delimiter or white space
          while ($this->offset < $length) {
              $chCode = ord($this->data[$this->offset]);
              if (self::isDelimiter($chCode) || self::isWhiteSpace($chCode)) {
                  break;
              }
              $this->offset++;
          }

          return substr($this->data, $start, $this->offset - $start);
      }

      // Delimiter: '<<' and '>>' are the only two-character ones
      $pair = substr($this->data, $start, 2);
      if ($pair === '<<' || $pair === '>>') {
          $this->offset += 2;
          return $pair;
      }

      $this->offset++;
      return $firstChar;
  }
  /**
   * Read one elemental object from the PDF data.
   *
   * The kind of element is chosen from the first lexeme: string, binary
   * string, name, array, dictionary, boolean, null, reference or numeric.
   * Every element that is read is also appended to the internal element list.
   *
   * Note: this is a public method and may be invoked from other classes.
   * When it is not driven by Zend_Pdf_StringParser::getObject(), the
   * internal element list is simply not consumed by anyone.
   *
   * @param string|null $nextLexeme  First lexeme of the element if it has
   *                                 already been read; null to read it here
   * @return Zend_Pdf_Element
   * @throws Zend_Pdf_Exception  When a closing delimiter or '{' / '}' is
   *                             found where an element is expected
   */
  public function readElement($nextLexeme = null)
  {
      if ($nextLexeme === null) {
          $nextLexeme = $this->readLexeme();
      }

      switch ($nextLexeme) {
          case '(':
              $element = $this->_readString();
              break;

          case '<':
              $element = $this->_readBinaryString();
              break;

          case '/':
              $element = new Zend_Pdf_Element_Name(
                  Zend_Pdf_Element_Name::unescape( $this->readLexeme() )
              );
              break;

          case '[':
              $element = $this->_readArray();
              break;

          case '<<':
              $element = $this->_readDictionary();
              break;

          // None of these may start an element
          case ')':
          case '>':
          case ']':
          case '>>':
          case '{':
          case '}':
              throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X.',
                                                   $this->offset));

          default:
              if (strcasecmp($nextLexeme, 'true') == 0) {
                  $element = new Zend_Pdf_Element_Boolean(true);
              } else if (strcasecmp($nextLexeme, 'false') == 0) {
                  $element = new Zend_Pdf_Element_Boolean(false);
              } else if (strcasecmp($nextLexeme, 'null') == 0) {
                  $element = new Zend_Pdf_Element_Null();
              } else {
                  // Try "<objNum> <genNum> R" first, fall back to a plain number
                  $element = $this->_readReference($nextLexeme);
                  if ($element === null) {
                      $element = $this->_readNumeric($nextLexeme);
                  }
              }
              break;
      }

      // Nested elements (array/dictionary members) were appended while the
      // container was being read; the container itself goes in after them.
      $this->_elements[] = $element;

      return $element;
  }
  /**
   * Read a literal string PDF object.
   *
   * Expects the offset to be just after the opening '('. Parentheses inside
   * the string must be balanced unless escaped with a backslash. The
   * closing ')' is consumed as well.
   *
   * @return Zend_Pdf_Element_String
   * @throws Zend_Pdf_Exception  When the data ends before the closing ')'
   */
  private function _readString()
  {
      $start  = $this->offset;
      $length = strlen($this->data);
      $depth  = 1; // the opening bracket has already been consumed

      while ($depth != 0 && $this->offset < $length) {
          $char = $this->data[$this->offset];

          if ($char === '(') {
              // opened bracket in the string, needs balanced pair
              $depth++;
          } else if ($char === ')') {
              // pair to the opened bracket
              $depth--;
          } else if ($char === '\\') {
              // escape sequence, skip next char from a check
              $this->offset++;
          }

          $this->offset++;
      }

      if ($depth != 0) {
          throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Unexpected end of file while string reading. Offset - 0x%X. \')\' expected.', $start));
      }

      // Exclude the closing bracket from the raw string body
      $raw = substr($this->data, $start, $this->offset - $start - 1);

      return new Zend_Pdf_Element_String(Zend_Pdf_Element_String::unescape($raw));
  }
  /**
   * Read a binary (hexadecimal) string PDF object.
   *
   * Expects the offset to be just after the opening '<'. Only hexadecimal
   * digits and white space are accepted inside; the closing '>' is
   * consumed as well.
   *
   * @return Zend_Pdf_Element_String_Binary
   * @throws Zend_Pdf_Exception  On an unexpected character, or when the
   *                             data ends before the closing '>'
   */
  private function _readBinaryString()
  {
      $start  = $this->offset;
      $length = strlen($this->data);

      while ($this->offset < $length) {
          $char = $this->data[$this->offset];

          if (self::isWhiteSpace(ord($char)) || ctype_xdigit($char)) {
              $this->offset++;
              continue;
          }

          if ($char != '>') {
              throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Unexpected character while binary string reading. Offset - 0x%X.', $this->offset));
          }

          // Consume the closing '>' but keep it out of the string body
          $this->offset++;
          $raw = substr($this->data, $start, $this->offset - $start - 1);

          return new Zend_Pdf_Element_String_Binary(
              Zend_Pdf_Element_String_Binary::unescape($raw)
          );
      }

      throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Unexpected end of file while binary string reading. Offset - 0x%X. \'>\' expected.', $start));
  }
  /**
   * Read an array PDF object.
   *
   * Expects the offset to be just after the opening '['. Elements are read
   * one by one until the closing ']' is found; the ']' is consumed as well.
   *
   * @return Zend_Pdf_Element_Array
   * @throws Zend_Pdf_Exception  When the data ends before the closing ']'
   */
  private function _readArray()
  {
      $items = array();

      for (;;) {
          $lexeme = $this->readLexeme();

          if (strlen($lexeme) == 0) {
              // readLexeme() only returns an empty string at the end of data
              throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Unexpected end of file while array reading. Offset - 0x%X. \']\' expected.', $this->offset));
          }

          if ($lexeme == ']') {
              return new Zend_Pdf_Element_Array($items);
          }

          $items[] = $this->readElement($lexeme);
      }
  }
  /**
   * Read a dictionary PDF object.
   *
   * Expects the offset to be just after the opening '<<'. Reads name/value
   * pairs until the closing '>>' is found; the '>>' is consumed as well.
   *
   * @return Zend_Pdf_Element_Dictionary
   * @throws Zend_Pdf_Exception  When a key is not a name object, or when
   *                             the data ends before the closing '>>'
   */
  private function _readDictionary()
  {
      $dictionary = new Zend_Pdf_Element_Dictionary();

      for (;;) {
          $lexeme = $this->readLexeme();

          if (strlen($lexeme) == 0) {
              // readLexeme() only returns an empty string at the end of data
              throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Unexpected end of file while dictionary reading. Offset - 0x%X. \'>>\' expected.', $this->offset));
          }

          if ($lexeme == '>>') {
              return $dictionary;
          }

          // Remember where the key started for error reporting
          $nameStart = $this->offset - strlen($lexeme);

          $name  = $this->readElement($lexeme);
          $value = $this->readElement();

          // The key is validated only after its value has been read
          if (!($name instanceof Zend_Pdf_Element_Name)) {
              throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Name object expected while dictionary reading. Offset - 0x%X.', $nameStart));
          }

          $dictionary->add($name, $value);
      }
  }
  /**
   * Try to read a reference PDF object ("<objNum> <genNum> R").
   *
   * When the lexemes do not form a reference, the offset is rewound to
   * where it was on entry and null is returned. Note that when $nextLexeme
   * is supplied, the entry offset is already past that lexeme.
   *
   * @param string|null $nextLexeme  Object number lexeme if it has already
   *                                 been read; null to read it here
   * @return Zend_Pdf_Element_Reference|null  The reference, or null when
   *                                          the input is not a reference
   */
  private function _readReference($nextLexeme = null)
  {
      $start = $this->offset;

      $objNum = ($nextLexeme === null) ? $this->readLexeme() : $nextLexeme;

      if (ctype_digit($objNum)) {
          $genNum = $this->readLexeme();

          if (ctype_digit($genNum)) {
              $rMark = $this->readLexeme();

              if ($rMark == 'R') {
                  return new Zend_Pdf_Element_Reference((int)$objNum, (int)$genNum, $this->_context, $this->_objFactory->resolve());
              }
          }
      }

      // it's not a reference
      $this->offset = $start;

      return null;
  }
  /**
   * Read a numeric PDF object.
   *
   * The lexeme is passed to the Zend_Pdf_Element_Numeric constructor as is.
   *
   * @param string|null $nextLexeme  Lexeme holding the number if it has
   *                                 already been read; null to read it here
   * @return Zend_Pdf_Element_Numeric
   */
  private function _readNumeric($nextLexeme = null)
  {
      $lexeme = ($nextLexeme === null) ? $this->readLexeme() : $nextLexeme;

      return new Zend_Pdf_Element_Numeric($lexeme);
  }
  /**
   * Read an indirect object from the PDF data.
   *
   * Parses "<objNum> <genNum> obj <value> endobj" or, for stream objects,
   * "<objNum> <genNum> obj <dictionary> stream <data> endstream endobj"
   * starting at $offset. Every element read while parsing the object value
   * gets the resulting object assigned as its parent.
   *
   * The parser offset that was current on entry is restored before
   * returning, so the method can be called while other parsing is in
   * progress. It is not restored when an exception is thrown.
   *
   * @param integer|null $offset  Position of the object in the data; null
   *                              yields a Zend_Pdf_Element_Null
   * @param Zend_Pdf_Element_Reference_Context $context  Context to use for
   *                              references found inside the object
   * @return Zend_Pdf_Element_Object|Zend_Pdf_Element_Null
   * @throws Zend_Pdf_Exception  On any syntax error in the object
   */
  public function getObject($offset, Zend_Pdf_Element_Reference_Context $context)
  {
      if ($offset === null ) {
          return new Zend_Pdf_Element_Null();
      }

      // Save current offset to make getObject() reentrant
      $offsetSave = $this->offset;

      $this->offset    = $offset;
      $this->_context  = $context;
      $this->_elements = array();

      $objNum = $this->readLexeme();
      if (!ctype_digit($objNum)) {
          throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. Object number expected.', $this->offset - strlen($objNum)));
      }

      $genNum = $this->readLexeme();
      if (!ctype_digit($genNum)) {
          throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. Object generation number expected.', $this->offset - strlen($genNum)));
      }

      $objKeyword = $this->readLexeme();
      if ($objKeyword != 'obj') {
          throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. \'obj\' keyword expected.', $this->offset - strlen($objKeyword)));
      }

      $objValue = $this->readElement();

      // Snapshot the elements that belong to THIS object right away.
      // Dereferencing the stream length below may re-enter getObject() on
      // this parser (when Length is an indirect reference), and a nested
      // call resets $this->_elements. Working from the snapshot keeps the
      // parent assignment correct in that case.
      $elements = $this->_elements;

      $nextLexeme = $this->readLexeme();
      if( $nextLexeme == 'endobj' ) {
          /**
           * Object is not generated by factory (thus it's not marked as modified object).
           * But factory is assigned to the object.
           */
          $obj = new Zend_Pdf_Element_Object($objValue, (int)$objNum, (int)$genNum, $this->_objFactory->resolve());

          foreach ($elements as $element) {
              $element->setParentObject($obj);
          }

          // Restore offset value
          $this->offset = $offsetSave;

          return $obj;
      }

      /**
       * It's a stream object
       */
      if ($nextLexeme != 'stream') {
          throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. \'endobj\' or \'stream\' keywords expected.', $this->offset - strlen($nextLexeme)));
      }

      if (!$objValue instanceof Zend_Pdf_Element_Dictionary) {
          throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. Stream extent must be preceded by stream dictionary.', $this->offset - strlen($nextLexeme)));
      }

      /**
       * References are automatically dereferenced at this moment.
       */
      $streamLength = $objValue->Length->value;

      /**
       * 'stream' keyword must be followed by either cr-lf sequence or lf character only.
       * This restriction gives the possibility to recognize all cases exactly
       */
      $dataLength = strlen($this->data);
      if ($this->offset + 1 < $dataLength         &&
          $this->data[$this->offset]     == "\r" &&
          $this->data[$this->offset + 1] == "\n"    ) {
          $this->offset += 2;
      } else if ($this->offset < $dataLength && $this->data[$this->offset] == "\n") {
          $this->offset++;
      } else {
          // Also reached when the data ends right after the 'stream' keyword
          throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. \'stream\' must be followed by either cr-lf sequence or lf character only.', $this->offset - strlen($nextLexeme)));
      }

      $dataOffset = $this->offset;

      // Jump over the raw stream bytes; they must not be tokenized
      $this->offset += $streamLength;

      $nextLexeme = $this->readLexeme();
      if ($nextLexeme != 'endstream') {
          throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. \'endstream\' keyword expected.', $this->offset - strlen($nextLexeme)));
      }

      $nextLexeme = $this->readLexeme();
      if ($nextLexeme != 'endobj') {
          throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. \'endobj\' keyword expected.', $this->offset - strlen($nextLexeme)));
      }

      $obj = new Zend_Pdf_Element_Object_Stream(substr($this->data,
                                                       $dataOffset,
                                                       $streamLength),
                                                (int)$objNum,
                                                (int)$genNum,
                                                $this->_objFactory->resolve(),
                                                $objValue);

      foreach ($elements as $element) {
          $element->setParentObject($obj);
      }

      // Restore offset value
      $this->offset = $offsetSave;

      return $obj;
  }
  /**
   * Get the length of the source string in bytes.
   *
   * @return integer
   */
  public function getLength()
  {
      $length = strlen($this->data);

      return $length;
  }
  /**
   * Get the source string the parser works on.
   *
   * @return string
   */
  public function getString()
  {
      $source = $this->data;

      return $source;
  }
  /**
   * Parse an unsigned big-endian integer from a binary string.
   *
   * Reads $size bytes starting at $offset; the first byte is the most
   * significant one. A $size of zero yields 0.
   *
   * @param string  $stream  Binary data
   * @param integer $offset  Index of the first byte to read
   * @param integer $size    Number of bytes to read
   * @return integer
   */
  public static function parseIntFromStream($stream, $offset, $size)
  {
      $result = 0;
      $end    = $offset + $size;

      for ($position = $offset; $position < $end; $position++) {
          $result = $result * 256 + ord($stream[$position]);
      }

      return $result;
  }
  /**
   * Set the reference context used for references created from now on.
   *
   * Note that getObject() replaces this context with the one passed to it.
   *
   * @param Zend_Pdf_Element_Reference_Context $context
   */
  public function setContext(Zend_Pdf_Element_Reference_Context $context)
  {
      $this->_context = $context;
  }
  /**
   * Object constructor
   *
   * Note: PHP duplicates a string that is passed by value only if it is
   * modified, so storing the source here causes no copying overhead.
   *
   * @param string $source  PDF source to parse
   * @param Zend_Pdf_ElementFactory_Interface $factory  Factory whose
   *        resolve() result is handed to created references and objects
   */
  public function __construct($source, Zend_Pdf_ElementFactory_Interface $factory)
  {
      $this->_objFactory = $factory;
      $this->data        = $source;
  }
  591. }