StringParser.php 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Pdf
  17. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  18. * @license http://framework.zend.com/license/new-bsd New BSD License
  19. * @version $Id$
  20. */
  21. /** Zend_Pdf_Element */
  22. require_once 'Zend/Pdf/Element.php';
  23. /** Zend_Pdf_Element_Array */
  24. require_once 'Zend/Pdf/Element/Array.php';
  25. /** Zend_Pdf_Element_String_Binary */
  26. require_once 'Zend/Pdf/Element/String/Binary.php';
  27. /** Zend_Pdf_Element_Boolean */
  28. require_once 'Zend/Pdf/Element/Boolean.php';
  29. /** Zend_Pdf_Element_Dictionary */
  30. require_once 'Zend/Pdf/Element/Dictionary.php';
  31. /** Zend_Pdf_Element_Name */
  32. require_once 'Zend/Pdf/Element/Name.php';
  33. /** Zend_Pdf_Element_Numeric */
  34. require_once 'Zend/Pdf/Element/Numeric.php';
  35. /** Zend_Pdf_Element_Object */
  36. require_once 'Zend/Pdf/Element/Object.php';
  37. /** Zend_Pdf_Element_Reference */
  38. require_once 'Zend/Pdf/Element/Reference.php';
  39. /** Zend_Pdf_Element_Object_Stream */
  40. require_once 'Zend/Pdf/Element/Object/Stream.php';
  41. /** Zend_Pdf_Element_String */
  42. require_once 'Zend/Pdf/Element/String.php';
  43. /** Zend_Pdf_Element_Null */
  44. require_once 'Zend/Pdf/Element/Null.php';
  45. /** Zend_Pdf_Element_Reference_Context */
  46. require_once 'Zend/Pdf/Element/Reference/Context.php';
  47. /** Zend_Pdf_Element_Reference_Table */
  48. require_once 'Zend/Pdf/Element/Reference/Table.php';
  49. /** Zend_Pdf_ElementFactory_Interface */
  50. require_once 'Zend/Pdf/ElementFactory/Interface.php';
  /**
   * PDF string parser
   *
   * Tokenizes a PDF document held in memory ($data) starting at the current
   * position ($offset) and builds Zend_Pdf_Element objects from the lexemes.
   *
   * @package Zend_Pdf
   * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
   * @license http://framework.zend.com/license/new-bsd New BSD License
   */
  class Zend_Pdf_StringParser
  {
      /**
       * Source PDF
       *
       * @var string
       */
      public $data = '';

      /**
       * Current position in the data
       *
       * @var integer
       */
      public $offset = 0;

      /**
       * Current reference context
       *
       * @var Zend_Pdf_Element_Reference_Context
       */
      private $_context = null;

      /**
       * Array of elements of the currently parsed object/trailer
       *
       * @var array
       */
      private $_elements = array();

      /**
       * PDF objects factory.
       *
       * @var Zend_Pdf_ElementFactory_Interface
       */
      private $_objFactory = null;


      /**
       * Clean up resources.
       *
       * Clear current state to remove cyclic object references
       */
      public function cleanUp()
      {
          $this->_context    = null;
          $this->_elements   = array();
          $this->_objFactory = null;
      }

      /**
       * Character with code $chCode is white space
       *
       * @param integer $chCode
       * @return boolean
       */
      public static function isWhiteSpace($chCode)
      {
          return $chCode == 0x00    // null character
              || $chCode == 0x09    // Tab
              || $chCode == 0x0A    // Line feed
              || $chCode == 0x0C    // Form Feed
              || $chCode == 0x0D    // Carriage return
              || $chCode == 0x20;   // Space
      }

      /**
       * Character with code $chCode is a delimiter character
       *
       * @param integer $chCode
       * @return boolean
       */
      public static function isDelimiter($chCode)
      {
          return $chCode == 0x28    // '('
              || $chCode == 0x29    // ')'
              || $chCode == 0x3C    // '<'
              || $chCode == 0x3E    // '>'
              || $chCode == 0x5B    // '['
              || $chCode == 0x5D    // ']'
              || $chCode == 0x7B    // '{'
              || $chCode == 0x7D    // '}'
              || $chCode == 0x2F    // '/'
              || $chCode == 0x25;   // '%'
      }

      /**
       * Skip white space
       *
       * Advances $offset past white space characters and, when $skipComment
       * is true, past '%' comments as well.
       *
       * @param boolean $skipComment
       */
      public function skipWhiteSpace($skipComment = true)
      {
          $length = strlen($this->data);

          while ($this->offset < $length) {
              $chCode = ord($this->data[$this->offset]);

              if (self::isWhiteSpace($chCode)) {
                  $this->offset++;
              } elseif ($skipComment && $chCode == 0x25) { // '%'
                  // Stops on the end-of-line character, which the next
                  // iteration then consumes as white space.
                  $this->skipComment();
              } else {
                  return;
              }
          }
      }

      /**
       * Skip comment
       *
       * Advances $offset to the first line feed or carriage return at or
       * after the current position, or to the end of the data if there is
       * none. The end-of-line character itself is not consumed.
       */
      public function skipComment()
      {
          // The previous check "!= LF || != CR" was always true and skipped
          // everything up to the end of the data.
          $this->offset += strcspn($this->data, "\r\n", $this->offset);
      }

      /**
       * Read comment line
       *
       * Skips leading white space and, if the next character is '%', returns
       * the comment text (including the leading '%') up to, but not
       * including, the end-of-line character.
       *
       * @return string Comment text or an empty string if there is no comment
       */
      public function readComment()
      {
          $this->skipWhiteSpace(false);

          // End of data reached or it's not a comment line
          if ($this->offset >= strlen($this->data) || $this->data[$this->offset] != '%') {
              return '';
          }

          $start         = $this->offset;
          $this->offset += strcspn($this->data, "\r\n", $start);

          return substr($this->data, $start, $this->offset - $start);
      }

      /**
       * Returns next lexeme from a pdf stream
       *
       * White space and comments before the lexeme are skipped. A lexeme is
       * either '<<', '>>', a single delimiter character, or a run of
       * characters up to the next delimiter or white space.
       *
       * @return string Lexeme or an empty string if the end of data is reached
       */
      public function readLexeme()
      {
          $length = strlen($this->data);

          // Skip white space and comments
          while (true) {
              $this->offset += strspn($this->data, "\x00\t\n\f\r ", $this->offset);

              if ($this->offset >= $length || $this->data[$this->offset] != '%') {
                  break;
              }

              // Move to the end-of-line character (or the end of the data);
              // the next iteration consumes it as white space.
              $this->offset += strcspn($this->data, "\r\n", $this->offset);
          }

          if ($this->offset >= $length) {
              return '';
          }

          $start = $this->offset;
          $first = $this->data[$start];

          if (self::isDelimiter(ord($first))) {
              // '<<' and '>>' are the only two-character delimiters
              if (($first === '<' || $first === '>')
                  && $start + 1 < $length
                  && $this->data[$start + 1] === $first
              ) {
                  $this->offset += 2;
                  return $first . $first;
              }

              $this->offset++;
              return $first;
          }

          // Regular lexeme: runs up to the next delimiter or white space
          while ($this->offset < $length) {
              $chCode = ord($this->data[$this->offset]);
              if (self::isDelimiter($chCode) || self::isWhiteSpace($chCode)) {
                  break;
              }
              $this->offset++;
          }

          return substr($this->data, $start, $this->offset - $start);
      }

      /**
       * Read elemental object from a PDF stream
       *
       * Note: readElement() method is a public method and could be invoked from other classes.
       * If readElement() is used not by Zend_Pdf_StringParser::getObject() method, then we should not care
       * about _elements member management.
       *
       * @param string|null $nextLexeme Already read lexeme to start from; read from the stream if null
       * @return Zend_Pdf_Element
       * @throws Zend_Pdf_Exception
       */
      public function readElement($nextLexeme = null)
      {
          if ($nextLexeme === null) {
              $nextLexeme = $this->readLexeme();
          }

          switch ($nextLexeme) {
              case '(':
                  return ($this->_elements[] = $this->_readString());

              case '<':
                  return ($this->_elements[] = $this->_readBinaryString());

              case '/':
                  return ($this->_elements[] = new Zend_Pdf_Element_Name(
                      Zend_Pdf_Element_Name::unescape($this->readLexeme())
                  ));

              case '[':
                  return ($this->_elements[] = $this->_readArray());

              case '<<':
                  return ($this->_elements[] = $this->_readDictionary());

              // A closing or unsupported delimiter can't start an element
              case ')':
              case '>':
              case ']':
              case '>>':
              case '{':
              case '}':
                  throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X.',
                                                       $this->offset));

              default:
                  if (strcasecmp($nextLexeme, 'true') == 0) {
                      return ($this->_elements[] = new Zend_Pdf_Element_Boolean(true));
                  } elseif (strcasecmp($nextLexeme, 'false') == 0) {
                      return ($this->_elements[] = new Zend_Pdf_Element_Boolean(false));
                  } elseif (strcasecmp($nextLexeme, 'null') == 0) {
                      return ($this->_elements[] = new Zend_Pdf_Element_Null());
                  }

                  // "<digits> <digits> R" is a reference, anything else is
                  // handed over to the numeric element.
                  $ref = $this->_readReference($nextLexeme);
                  if ($ref !== null) {
                      return ($this->_elements[] = $ref);
                  }

                  return ($this->_elements[] = $this->_readNumeric($nextLexeme));
          }
      }

      /**
       * Read string PDF object
       * Also reads trailing ')' from a pdf stream
       *
       * @return Zend_Pdf_Element_String
       * @throws Zend_Pdf_Exception
       */
      private function _readString()
      {
          $start          = $this->offset;
          $length         = strlen($this->data);
          $openedBrackets = 1;

          while ($this->offset < $length) {
              switch (ord($this->data[$this->offset])) {
                  case 0x28: // '(' - opened bracket in the string, needs balanced pair.
                      $openedBrackets++;
                      break;

                  case 0x29: // ')' - pair to the opened bracket
                      $openedBrackets--;
                      break;

                  case 0x5C: // '\\' - escape sequence, skip next char from a check
                      $this->offset++;
                      break;
              }

              $this->offset++;

              if ($openedBrackets == 0) {
                  break; // end of string
              }
          }

          if ($openedBrackets != 0) {
              throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Unexpected end of file while string reading. Offset - 0x%X. \')\' expected.', $start));
          }

          // The closing ')' is consumed but is not a part of the string value
          return new Zend_Pdf_Element_String(
              Zend_Pdf_Element_String::unescape(substr($this->data, $start, $this->offset - $start - 1))
          );
      }

      /**
       * Read binary string PDF object
       * Also reads trailing '>' from a pdf stream
       *
       * @return Zend_Pdf_Element_String_Binary
       * @throws Zend_Pdf_Exception
       */
      private function _readBinaryString()
      {
          $start  = $this->offset;
          $length = strlen($this->data);

          while ($this->offset < $length) {
              $char = $this->data[$this->offset];

              if (self::isWhiteSpace(ord($char)) || ctype_xdigit($char)) {
                  $this->offset++;
              } elseif ($char == '>') {
                  $this->offset++;

                  // The closing '>' is consumed but is not a part of the value
                  return new Zend_Pdf_Element_String_Binary(
                      Zend_Pdf_Element_String_Binary::unescape(substr($this->data, $start, $this->offset - $start - 1))
                  );
              } else {
                  throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Unexpected character while binary string reading. Offset - 0x%X.', $this->offset));
              }
          }

          throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Unexpected end of file while binary string reading. Offset - 0x%X. \'>\' expected.', $start));
      }

      /**
       * Read array PDF object
       * Also reads trailing ']' from a pdf stream
       *
       * @return Zend_Pdf_Element_Array
       * @throws Zend_Pdf_Exception
       */
      private function _readArray()
      {
          $elements = array();

          while (strlen($nextLexeme = $this->readLexeme()) != 0) {
              if ($nextLexeme == ']') {
                  return new Zend_Pdf_Element_Array($elements);
              }

              $elements[] = $this->readElement($nextLexeme);
          }

          throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Unexpected end of file while array reading. Offset - 0x%X. \']\' expected.', $this->offset));
      }

      /**
       * Read dictionary PDF object
       * Also reads trailing '>>' from a pdf stream
       *
       * @return Zend_Pdf_Element_Dictionary
       * @throws Zend_Pdf_Exception
       */
      private function _readDictionary()
      {
          $dictionary = new Zend_Pdf_Element_Dictionary();

          while (strlen($nextLexeme = $this->readLexeme()) != 0) {
              if ($nextLexeme == '>>') {
                  return $dictionary;
              }

              // Remember where the key starts for error reporting
              $nameStart = $this->offset - strlen($nextLexeme);

              $name  = $this->readElement($nextLexeme);
              $value = $this->readElement();

              if (!$name instanceof Zend_Pdf_Element_Name) {
                  throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Name object expected while dictionary reading. Offset - 0x%X.', $nameStart));
              }

              $dictionary->add($name, $value);
          }

          throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Unexpected end of file while dictionary reading. Offset - 0x%X. \'>>\' expected.', $this->offset));
      }

      /**
       * Read reference PDF object
       *
       * Expects "<object number> <generation number> R". If the lexemes do not
       * match this form, the offset is restored to the value it had on entry
       * and null is returned.
       *
       * @param string|null $nextLexeme Already read object number lexeme; read from the stream if null
       * @return Zend_Pdf_Element_Reference|null
       */
      private function _readReference($nextLexeme = null)
      {
          $start = $this->offset;

          $objNum = ($nextLexeme === null) ? $this->readLexeme() : $nextLexeme;
          if (!ctype_digit($objNum)) { // it's not a reference
              $this->offset = $start;
              return null;
          }

          $genNum = $this->readLexeme();
          if (!ctype_digit($genNum)) { // it's not a reference
              $this->offset = $start;
              return null;
          }

          $rMark = $this->readLexeme();
          if ($rMark != 'R') { // it's not a reference
              $this->offset = $start;
              return null;
          }

          return new Zend_Pdf_Element_Reference(
              (int)$objNum,
              (int)$genNum,
              $this->_context,
              $this->_objFactory->resolve()
          );
      }

      /**
       * Read numeric PDF object
       *
       * @param string|null $nextLexeme Already read lexeme; read from the stream if null
       * @return Zend_Pdf_Element_Numeric
       */
      private function _readNumeric($nextLexeme = null)
      {
          if ($nextLexeme === null) {
              $nextLexeme = $this->readLexeme();
          }

          return new Zend_Pdf_Element_Numeric($nextLexeme);
      }

      /**
       * Assign $obj as a parent object to all elements collected while parsing
       *
       * @param Zend_Pdf_Element_Object $obj
       */
      private function _assignParentObject($obj)
      {
          foreach ($this->_elements as $element) {
              $element->setParentObject($obj);
          }
      }

      /**
       * Read indirect object from a PDF stream
       *
       * The current offset is saved on entry and restored before a successful
       * return, so the method can be called while another parsing is in
       * progress. It is not restored if an exception is thrown.
       *
       * @param integer|null $offset Offset of the object; a null element is returned for null
       * @param Zend_Pdf_Element_Reference_Context $context
       * @return Zend_Pdf_Element_Object|Zend_Pdf_Element_Null
       * @throws Zend_Pdf_Exception
       */
      public function getObject($offset, Zend_Pdf_Element_Reference_Context $context)
      {
          if ($offset === null) {
              return new Zend_Pdf_Element_Null();
          }

          // Save current offset to make getObject() reentrant
          $offsetSave = $this->offset;

          $this->offset    = $offset;
          $this->_context  = $context;
          $this->_elements = array();

          $objNum = $this->readLexeme();
          if (!ctype_digit($objNum)) {
              throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. Object number expected.', $this->offset - strlen($objNum)));
          }

          $genNum = $this->readLexeme();
          if (!ctype_digit($genNum)) {
              throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. Object generation number expected.', $this->offset - strlen($genNum)));
          }

          $objKeyword = $this->readLexeme();
          if ($objKeyword != 'obj') {
              throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. \'obj\' keyword expected.', $this->offset - strlen($objKeyword)));
          }

          $objValue = $this->readElement();

          $nextLexeme = $this->readLexeme();

          if ($nextLexeme == 'endobj') {
              /**
               * Object is not generated by factory (thus it's not marked as modified object).
               * But factory is assigned to the object.
               */
              $obj = new Zend_Pdf_Element_Object($objValue, (int)$objNum, (int)$genNum, $this->_objFactory->resolve());

              $this->_assignParentObject($obj);

              // Restore offset value
              $this->offset = $offsetSave;

              return $obj;
          }

          /**
           * It's a stream object
           */
          if ($nextLexeme != 'stream') {
              throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. \'endobj\' or \'stream\' keywords expected.', $this->offset - strlen($nextLexeme)));
          }

          if (!$objValue instanceof Zend_Pdf_Element_Dictionary) {
              throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. Stream extent must be preceded by stream dictionary.', $this->offset - strlen($nextLexeme)));
          }

          /**
           * References are automatically dereferenced at this moment.
           */
          $streamLength = $objValue->Length->value;

          /**
           * 'stream' keyword must be followed by either cr-lf sequence or lf character only.
           * This restriction gives the possibility to recognize all cases exactly
           */
          $length = strlen($this->data);
          if ($this->offset + 1 < $length
              && $this->data[$this->offset] === "\r"
              && $this->data[$this->offset + 1] === "\n"
          ) {
              $this->offset += 2;
          } elseif ($this->offset < $length && $this->data[$this->offset] === "\n") {
              $this->offset++;
          } else {
              throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. \'stream\' must be followed by either cr-lf sequence or lf character only.', $this->offset - strlen($nextLexeme)));
          }

          // Jump over the stream data using the length declared in the dictionary
          $dataOffset    = $this->offset;
          $this->offset += $streamLength;

          $nextLexeme = $this->readLexeme();
          if ($nextLexeme != 'endstream') {
              throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. \'endstream\' keyword expected.', $this->offset - strlen($nextLexeme)));
          }

          $nextLexeme = $this->readLexeme();
          if ($nextLexeme != 'endobj') {
              throw new Zend_Pdf_Exception(sprintf('PDF file syntax error. Offset - 0x%X. \'endobj\' keyword expected.', $this->offset - strlen($nextLexeme)));
          }

          $obj = new Zend_Pdf_Element_Object_Stream(
              substr($this->data, $dataOffset, $streamLength),
              (int)$objNum,
              (int)$genNum,
              $this->_objFactory->resolve(),
              $objValue
          );

          $this->_assignParentObject($obj);

          // Restore offset value
          $this->offset = $offsetSave;

          return $obj;
      }

      /**
       * Get length of source string
       *
       * @return integer
       */
      public function getLength()
      {
          return strlen($this->data);
      }

      /**
       * Get source string
       *
       * @return string
       */
      public function getString()
      {
          return $this->data;
      }

      /**
       * Parse integer value from a binary stream
       *
       * Reads $size bytes starting at $offset and combines them with the
       * first byte as the most significant one.
       *
       * @param string $stream
       * @param integer $offset
       * @param integer $size
       * @return integer
       */
      public static function parseIntFromStream($stream, $offset, $size)
      {
          $value = 0;

          for ($count = 0; $count < $size; $count++) {
              $value = $value * 256 + ord($stream[$offset + $count]);
          }

          return $value;
      }

      /**
       * Set current context
       *
       * @param Zend_Pdf_Element_Reference_Context $context
       */
      public function setContext(Zend_Pdf_Element_Reference_Context $context)
      {
          $this->_context = $context;
      }

      /**
       * Object constructor
       *
       * Note: PHP duplicates string, which is sent by value, only if it's updated.
       * Thus we don't need to care about overhead
       *
       * @param string $source
       * @param Zend_Pdf_ElementFactory_Interface $factory
       */
      public function __construct($source, Zend_Pdf_ElementFactory_Interface $factory)
      {
          $this->data        = $source;
          $this->_objFactory = $factory;
      }
  }