2
0

FileParser.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @package Zend_Pdf
  16. * @subpackage FileParser
  17. * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
  18. * @license http://framework.zend.com/license/new-bsd New BSD License
  19. */
  20. /**
  21. * Abstract utility class for parsing binary files.
  22. *
  23. * Provides a library of methods to quickly navigate and extract various data
  24. * types (signed and unsigned integers, floating- and fixed-point numbers,
  25. * strings, etc.) from the file.
  26. *
  27. * File access is managed via a {@link Zend_Pdf_FileParserDataSource} object.
  28. * This allows the same parser code to work with many different data sources:
  29. * in-memory objects, filesystem files, etc.
  30. *
  31. * @package Zend_Pdf
  32. * @subpackage FileParser
  33. * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
  34. * @license http://framework.zend.com/license/new-bsd New BSD License
  35. */
  abstract class Zend_Pdf_FileParser
  {
    /**** Class Constants ****/

      /**
       * Little-endian byte order (0x04 0x03 0x02 0x01).
       */
      const BYTE_ORDER_LITTLE_ENDIAN = 0;

      /**
       * Big-endian byte order (0x01 0x02 0x03 0x04).
       */
      const BYTE_ORDER_BIG_ENDIAN = 1;


    /**** Instance Variables ****/

      /**
       * Flag indicating that the file has passed a cursory validation check.
       * @var boolean
       */
      protected $_isScreened = false;

      /**
       * Flag indicating that the file has been successfully parsed.
       * @var boolean
       */
      protected $_isParsed = false;

      /**
       * Object representing the data source to be parsed.
       * @var Zend_Pdf_FileParserDataSource
       */
      protected $_dataSource = null;


    /**** Public Interface ****/

    /* Abstract Methods */

      /**
       * Performs a cursory check to verify that the binary file is in the expected
       * format. Intended to quickly weed out obviously bogus files.
       *
       * Must set $this->_isScreened to true if successful.
       *
       * @throws Zend_Pdf_Exception
       */
      abstract public function screen();

      /**
       * Reads and parses the complete binary file.
       *
       * Must set $this->_isParsed to true if successful.
       *
       * @throws Zend_Pdf_Exception
       */
      abstract public function parse();


    /* Object Lifecycle */

      /**
       * Object constructor.
       *
       * Verifies that the data source has been properly initialized: a data
       * source reporting a size of zero is rejected.
       *
       * @param Zend_Pdf_FileParserDataSource $dataSource
       * @throws Zend_Pdf_Exception
       */
      public function __construct(Zend_Pdf_FileParserDataSource $dataSource)
      {
          if ($dataSource->getSize() == 0) {
              require_once 'Zend/Pdf/Exception.php';
              throw new Zend_Pdf_Exception('The data source has not been properly initialized',
                                           Zend_Pdf_Exception::BAD_DATA_SOURCE);
          }
          $this->_dataSource = $dataSource;
      }

      /**
       * Object destructor.
       *
       * Discards the data source object.
       */
      public function __destruct()
      {
          $this->_dataSource = null;
      }


    /* Accessors */

      /**
       * Returns true if the file has passed a cursory validation check.
       *
       * @return boolean
       */
      public function isScreened()
      {
          return $this->_isScreened;
      }

      /**
       * Returns true if the file has been successfully parsed.
       *
       * @return boolean
       */
      public function isParsed()
      {
          return $this->_isParsed;
      }

      /**
       * Returns the data source object representing the file being parsed.
       *
       * @return Zend_Pdf_FileParserDataSource
       */
      public function getDataSource()
      {
          return $this->_dataSource;
      }


    /* Primitive Methods */

      /**
       * Convenience wrapper for the data source object's moveToOffset() method.
       *
       * @param integer $offset Destination byte offset.
       * @throws Zend_Pdf_Exception
       */
      public function moveToOffset($offset)
      {
          $this->_dataSource->moveToOffset($offset);
      }

      /**
       * Convenience wrapper for the data source object's getOffset() method.
       *
       * @return mixed Whatever the data source's getOffset() returns.
       */
      public function getOffset()
      {
          return $this->_dataSource->getOffset();
      }

      /**
       * Convenience wrapper for the data source object's getSize() method.
       *
       * @return mixed Whatever the data source's getSize() returns.
       */
      public function getSize()
      {
          return $this->_dataSource->getSize();
      }

      /**
       * Convenience wrapper for the data source object's readBytes() method.
       *
       * @param integer $byteCount Number of bytes to read.
       * @return string
       * @throws Zend_Pdf_Exception
       */
      public function readBytes($byteCount)
      {
          return $this->_dataSource->readBytes($byteCount);
      }

      /**
       * Convenience wrapper for the data source object's skipBytes() method.
       *
       * @param integer $byteCount Number of bytes to skip.
       * @throws Zend_Pdf_Exception
       */
      public function skipBytes($byteCount)
      {
          $this->_dataSource->skipBytes($byteCount);
      }


    /* Parser Methods */

      /**
       * Reads the signed integer value from the binary file at the current byte
       * offset.
       *
       * Advances the offset by the number of bytes read. Throws an exception if
       * an error occurs. Both arguments are validated before anything is read,
       * so a rejected call does not move the offset.
       *
       * @param integer $size Size of integer in bytes: 1-4
       * @param integer $byteOrder (optional) Big- or little-endian byte order.
       *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
       *   If omitted, uses big-endian.
       * @return integer
       * @throws Zend_Pdf_Exception
       */
      public function readInt($size, $byteOrder = self::BYTE_ORDER_BIG_ENDIAN)
      {
          if (($size < 1) || ($size > 4)) {
              require_once 'Zend/Pdf/Exception.php';
              throw new Zend_Pdf_Exception("Invalid signed integer size: $size",
                                           Zend_Pdf_Exception::INVALID_INTEGER_SIZE);
          }
          $isBigEndian = $this->_isBigEndian($byteOrder);

          $bytes = $this->_dataSource->readBytes($size);

          /* unpack() will not work for this method because it always works in
           * the host byte order for signed integers. It also does not allow for
           * variable integer sizes.
           *
           * The sign lives in the high bit of the most significant byte, which
           * is the first byte for big-endian and the last for little-endian.
           */
          $isNegative = ((ord($bytes[$isBigEndian ? 0 : $size - 1]) & 0x80) == 0x80);

          /* Assemble the value most-significant byte first. For a negative
           * number, accumulate the bit-inverted bytes (the positive equivalent
           * minus one) and invert the whole result afterwards. Since one is
           * never added, none has to be subtracted, and the intermediate value
           * always fits in 31 bits, so this works reliably on both 32- and
           * 64-bit systems.
           */
          $number = 0;
          for ($i = 0; $i < $size; $i++) {
              $byte = ord($bytes[$isBigEndian ? $i : $size - 1 - $i]);
              if ($isNegative) {
                  $byte = (~ $byte) & 0xff;
              }
              $number = ($number << 8) | $byte;
          }

          return $isNegative ? ~$number : $number;
      }

      /**
       * Reads the unsigned integer value from the binary file at the current byte
       * offset.
       *
       * Advances the offset by the number of bytes read. Throws an exception if
       * an error occurs. Both arguments are validated before anything is read,
       * so a rejected call does not move the offset.
       *
       * NOTE: If you ask for a 4-byte unsigned integer on a 32-bit machine, the
       * resulting value WILL BE SIGNED because PHP uses signed integers internally
       * for everything. To guarantee portability, be sure to use bitwise
       * operators on large unsigned integers!
       *
       * @param integer $size Size of integer in bytes: 1-4
       * @param integer $byteOrder (optional) Big- or little-endian byte order.
       *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
       *   If omitted, uses big-endian.
       * @return integer
       * @throws Zend_Pdf_Exception
       */
      public function readUInt($size, $byteOrder = self::BYTE_ORDER_BIG_ENDIAN)
      {
          if (($size < 1) || ($size > 4)) {
              require_once 'Zend/Pdf/Exception.php';
              throw new Zend_Pdf_Exception("Invalid unsigned integer size: $size",
                                           Zend_Pdf_Exception::INVALID_INTEGER_SIZE);
          }
          $isBigEndian = $this->_isBigEndian($byteOrder);

          $bytes = $this->_dataSource->readBytes($size);

          /* unpack() is a bit heavyweight for this simple conversion. Just
           * work the bytes directly, most-significant byte first.
           */
          $number = 0;
          for ($i = 0; $i < $size; $i++) {
              $number = ($number << 8) | ord($bytes[$isBigEndian ? $i : $size - 1 - $i]);
          }
          return $number;
      }

      /**
       * Returns true if the specified bit is set in the integer bitfield.
       *
       * A bit number at or beyond the width of a PHP integer produces an empty
       * mask and is therefore reported as not set.
       *
       * @param integer $bit Bit number to test (i.e. - 0-31)
       * @param integer $bitField
       * @return boolean
       */
      public function isBitSet($bit, $bitField)
      {
          $bitMask = 1 << $bit;
          /* Compare against zero rather than against the mask: an out-of-range
           * shift yields a mask of 0, and (x & 0) == 0 would wrongly report
           * the bit as set.
           */
          return (($bitField & $bitMask) !== 0);
      }

      /**
       * Reads the signed fixed-point number from the binary file at the current
       * byte offset.
       *
       * Common fixed-point sizes are 2.14 and 16.16.
       *
       * Advances the offset by the number of bytes read. Throws an exception if
       * an error occurs.
       *
       * @param integer $mantissaBits Number of bits in the mantissa
       * @param integer $fractionBits Number of bits in the fraction
       * @param integer $byteOrder (optional) Big- or little-endian byte order.
       *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
       *   If omitted, uses big-endian.
       * @return float
       * @throws Zend_Pdf_Exception
       */
      public function readFixed($mantissaBits, $fractionBits,
                                $byteOrder = self::BYTE_ORDER_BIG_ENDIAN)
      {
          $bitsToRead = $mantissaBits + $fractionBits;
          if (($bitsToRead % 8) !== 0) {
              require_once 'Zend/Pdf/Exception.php';
              throw new Zend_Pdf_Exception('Fixed-point numbers are whole bytes',
                                           Zend_Pdf_Exception::BAD_FIXED_POINT_SIZE);
          }
          /* Read the raw bits as a signed integer, then scale by 2^fractionBits
           * to shift the binary point into place.
           */
          $number = $this->readInt(($bitsToRead >> 3), $byteOrder) / (1 << $fractionBits);
          return $number;
      }

      /**
       * Reads the Unicode UTF-16-encoded string from the binary file at the
       * current byte offset.
       *
       * The byte order of the UTF-16 string must be specified. You must also
       * supply the desired resulting character set.
       *
       * Advances the offset by the number of bytes read. Throws an exception if
       * an error occurs. The byte order is validated before anything is read,
       * so a rejected call does not move the offset.
       *
       * NOTE: The result of {@link iconv()} is returned unchecked, so a failed
       * character set conversion yields iconv()'s failure value rather than an
       * exception.
       *
       * @todo Consider changing $byteCount to a character count. They are not
       *   always equivalent (in the case of surrogates).
       * @todo Make $byteOrder optional if there is a byte-order mark (BOM) in the
       *   string being extracted.
       *
       * @param integer $byteCount Number of bytes (characters * 2) to return.
       * @param integer $byteOrder (optional) Big- or little-endian byte order.
       *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
       *   If omitted, uses big-endian.
       * @param string $characterSet (optional) Desired resulting character set.
       *   You may use any character set supported by {@link iconv()}. If omitted,
       *   uses 'current locale'.
       * @return string
       * @throws Zend_Pdf_Exception
       */
      public function readStringUTF16($byteCount,
                                      $byteOrder = self::BYTE_ORDER_BIG_ENDIAN,
                                      $characterSet = '')
      {
          if ($byteCount == 0) {
              return '';
          }
          $sourceCharacterSet = $this->_isBigEndian($byteOrder) ? 'UTF-16BE' : 'UTF-16LE';

          $bytes = $this->_dataSource->readBytes($byteCount);

          /* No conversion needed when the caller wants the native encoding. */
          if ($characterSet == $sourceCharacterSet) {
              return $bytes;
          }
          return iconv($sourceCharacterSet, $characterSet, $bytes);
      }

      /**
       * Reads the Mac Roman-encoded string from the binary file at the current
       * byte offset.
       *
       * You must supply the desired resulting character set.
       *
       * Advances the offset by the number of bytes read. Throws an exception if
       * an error occurs.
       *
       * NOTE: The result of {@link iconv()} is returned unchecked, so a failed
       * character set conversion yields iconv()'s failure value rather than an
       * exception.
       *
       * @param integer $byteCount Number of bytes (characters) to return.
       * @param string $characterSet (optional) Desired resulting character set.
       *   You may use any character set supported by {@link iconv()}. If omitted,
       *   uses 'current locale'.
       * @return string
       * @throws Zend_Pdf_Exception
       */
      public function readStringMacRoman($byteCount, $characterSet = '')
      {
          if ($byteCount == 0) {
              return '';
          }
          $bytes = $this->_dataSource->readBytes($byteCount);
          if ($characterSet == 'MacRoman') {
              return $bytes;
          }
          return iconv('MacRoman', $characterSet, $bytes);
      }

      /**
       * Reads the Pascal string from the binary file at the current byte offset.
       *
       * The length of the Pascal string is determined by reading the length bytes
       * which precede the character data (as a big-endian unsigned integer). The
       * character data itself is treated as ASCII. You must supply the desired
       * resulting character set.
       *
       * Advances the offset by the number of bytes read. Throws an exception if
       * an error occurs.
       *
       * NOTE: The result of {@link iconv()} is returned unchecked, so a failed
       * character set conversion yields iconv()'s failure value rather than an
       * exception.
       *
       * @param string $characterSet (optional) Desired resulting character set.
       *   You may use any character set supported by {@link iconv()}. If omitted,
       *   uses 'current locale'.
       * @param integer $lengthBytes (optional) Number of bytes that make up the
       *   length. Default is 1.
       * @return string
       * @throws Zend_Pdf_Exception
       */
      public function readStringPascal($characterSet = '', $lengthBytes = 1)
      {
          $byteCount = $this->readUInt($lengthBytes);
          if ($byteCount == 0) {
              return '';
          }
          $bytes = $this->_dataSource->readBytes($byteCount);
          if ($characterSet == 'ASCII') {
              return $bytes;
          }
          return iconv('ASCII', $characterSet, $bytes);
      }


    /**** Internal Methods ****/

      /**
       * Validates a byte order argument and reports whether it is big-endian.
       *
       * Uses loose comparison against the BYTE_ORDER_ constants, checking
       * big-endian first, exactly as the individual readers always have.
       *
       * @param integer $byteOrder One of the BYTE_ORDER_ constants.
       * @return boolean True for big-endian, false for little-endian.
       * @throws Zend_Pdf_Exception If the value matches neither constant.
       */
      private function _isBigEndian($byteOrder)
      {
          if ($byteOrder == self::BYTE_ORDER_BIG_ENDIAN) {
              return true;
          }
          if ($byteOrder == self::BYTE_ORDER_LITTLE_ENDIAN) {
              return false;
          }
          require_once 'Zend/Pdf/Exception.php';
          throw new Zend_Pdf_Exception("Invalid byte order: $byteOrder",
                                       Zend_Pdf_Exception::INVALID_BYTE_ORDER);
      }
  }