Pptx.php 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Document
  18. * @copyright Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id$
  21. */
  22. /** Zend_Xml_Security */
  23. require_once 'Zend/Xml/Security.php';
  24. /** Zend_Search_Lucene_Document_OpenXml */
  25. require_once 'Zend/Search/Lucene/Document/OpenXml.php';
  26. /**
  27. * Pptx document.
  28. *
  29. * @category Zend
  30. * @package Zend_Search_Lucene
  31. * @subpackage Document
  32. * @copyright Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
  33. * @license http://framework.zend.com/license/new-bsd New BSD License
  34. */
  35. class Zend_Search_Lucene_Document_Pptx extends Zend_Search_Lucene_Document_OpenXml
  36. {
  37. /**
  38. * Xml Schema - PresentationML
  39. *
  40. * @var string
  41. */
  42. const SCHEMA_PRESENTATIONML = 'http://schemas.openxmlformats.org/presentationml/2006/main';
  43. /**
  44. * Xml Schema - DrawingML
  45. *
  46. * @var string
  47. */
  48. const SCHEMA_DRAWINGML = 'http://schemas.openxmlformats.org/drawingml/2006/main';
  49. /**
  50. * Xml Schema - Slide relation
  51. *
  52. * @var string
  53. */
  54. const SCHEMA_SLIDERELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide';
  55. /**
  56. * Xml Schema - Slide notes relation
  57. *
  58. * @var string
  59. */
  60. const SCHEMA_SLIDENOTESRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide';
  61. /**
  62. * Object constructor
  63. *
  64. * @param string $fileName
  65. * @param boolean $storeContent
  66. * @throws Zend_Search_Lucene_Exception
  67. */
  68. private function __construct($fileName, $storeContent)
  69. {
  70. if (!class_exists('ZipArchive', false)) {
  71. require_once 'Zend/Search/Lucene/Exception.php';
  72. throw new Zend_Search_Lucene_Exception('MS Office documents processing functionality requires Zip extension to be loaded');
  73. }
  74. // Document data holders
  75. $slides = array();
  76. $slideNotes = array();
  77. $documentBody = array();
  78. $coreProperties = array();
  79. // Open OpenXML package
  80. $package = new ZipArchive();
  81. $package->open($fileName);
  82. // Read relations and search for officeDocument
  83. $relationsXml = $package->getFromName('_rels/.rels');
  84. if ($relationsXml === false) {
  85. require_once 'Zend/Search/Lucene/Exception.php';
  86. throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .pptx file.');
  87. }
  88. $relations = Zend_Xml_Security::scan($relationsXml);
  89. foreach ($relations->Relationship as $rel) {
  90. if ($rel["Type"] == Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) {
  91. // Found office document! Search for slides...
  92. $slideRelations = Zend_Xml_Security::scan($package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/_rels/" . basename($rel["Target"]) . ".rels")) );
  93. foreach ($slideRelations->Relationship as $slideRel) {
  94. if ($slideRel["Type"] == Zend_Search_Lucene_Document_Pptx::SCHEMA_SLIDERELATION) {
  95. // Found slide!
  96. $slides[ str_replace( 'rId', '', (string)$slideRel["Id"] ) ] = Zend_Xml_Security::scan(
  97. $package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/" . basename($slideRel["Target"])) )
  98. );
  99. // Search for slide notes
  100. $slideNotesRelations = Zend_Xml_Security::scan($package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/_rels/" . basename($slideRel["Target"]) . ".rels")) );
  101. foreach ($slideNotesRelations->Relationship as $slideNoteRel) {
  102. if ($slideNoteRel["Type"] == Zend_Search_Lucene_Document_Pptx::SCHEMA_SLIDENOTESRELATION) {
  103. // Found slide notes!
  104. $slideNotes[ str_replace( 'rId', '', (string)$slideRel["Id"] ) ] = Zend_Xml_Security::scan(
  105. $package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/" . dirname($slideNoteRel["Target"]) . "/" . basename($slideNoteRel["Target"])) )
  106. );
  107. break;
  108. }
  109. }
  110. }
  111. }
  112. break;
  113. }
  114. }
  115. // Sort slides
  116. ksort($slides);
  117. ksort($slideNotes);
  118. // Extract contents from slides
  119. foreach ($slides as $slideKey => $slide) {
  120. // Register namespaces
  121. $slide->registerXPathNamespace("p", Zend_Search_Lucene_Document_Pptx::SCHEMA_PRESENTATIONML);
  122. $slide->registerXPathNamespace("a", Zend_Search_Lucene_Document_Pptx::SCHEMA_DRAWINGML);
  123. // Fetch all text
  124. $textElements = $slide->xpath('//a:t');
  125. foreach ($textElements as $textElement) {
  126. $documentBody[] = (string)$textElement;
  127. }
  128. // Extract contents from slide notes
  129. if (isset($slideNotes[$slideKey])) {
  130. // Fetch slide note
  131. $slideNote = $slideNotes[$slideKey];
  132. // Register namespaces
  133. $slideNote->registerXPathNamespace("p", Zend_Search_Lucene_Document_Pptx::SCHEMA_PRESENTATIONML);
  134. $slideNote->registerXPathNamespace("a", Zend_Search_Lucene_Document_Pptx::SCHEMA_DRAWINGML);
  135. // Fetch all text
  136. $textElements = $slideNote->xpath('//a:t');
  137. foreach ($textElements as $textElement) {
  138. $documentBody[] = (string)$textElement;
  139. }
  140. }
  141. }
  142. // Read core properties
  143. $coreProperties = $this->extractMetaData($package);
  144. // Close file
  145. $package->close();
  146. // Store filename
  147. $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));
  148. // Store contents
  149. if ($storeContent) {
  150. $this->addField(Zend_Search_Lucene_Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
  151. } else {
  152. $this->addField(Zend_Search_Lucene_Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
  153. }
  154. // Store meta data properties
  155. foreach ($coreProperties as $key => $value)
  156. {
  157. $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
  158. }
  159. // Store title (if not present in meta data)
  160. if (!isset($coreProperties['title']))
  161. {
  162. $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
  163. }
  164. }
  165. /**
  166. * Load Pptx document from a file
  167. *
  168. * @param string $fileName
  169. * @param boolean $storeContent
  170. * @return Zend_Search_Lucene_Document_Pptx
  171. */
  172. public static function loadPptxFile($fileName, $storeContent = false)
  173. {
  174. return new Zend_Search_Lucene_Document_Pptx($fileName, $storeContent);
  175. }
  176. }