Docx.php 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Document
  18. * @copyright Copyright (c) 2005-2014 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id$
  21. */
  22. /** Zend_Search_Lucene_Document_OpenXml */
  23. require_once 'Zend/Search/Lucene/Document/OpenXml.php';
  24. /** Zend_Xml_Security */
  25. require_once 'Zend/Xml/Security.php';
  26. /**
  27. * Docx document.
  28. *
  29. * @category Zend
  30. * @package Zend_Search_Lucene
  31. * @subpackage Document
  32. * @copyright Copyright (c) 2005-2014 Zend Technologies USA Inc. (http://www.zend.com)
  33. * @license http://framework.zend.com/license/new-bsd New BSD License
  34. */
  35. class Zend_Search_Lucene_Document_Docx extends Zend_Search_Lucene_Document_OpenXml {
  36. /**
  37. * Xml Schema - WordprocessingML
  38. *
  39. * @var string
  40. */
  41. const SCHEMA_WORDPROCESSINGML = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';
  42. /**
  43. * Object constructor
  44. *
  45. * @param string $fileName
  46. * @param boolean $storeContent
  47. * @throws Zend_Search_Lucene_Exception
  48. */
  49. private function __construct($fileName, $storeContent) {
  50. if (!class_exists('ZipArchive', false)) {
  51. require_once 'Zend/Search/Lucene/Exception.php';
  52. throw new Zend_Search_Lucene_Exception('MS Office documents processing functionality requires Zip extension to be loaded');
  53. }
  54. // Document data holders
  55. $documentBody = array();
  56. $coreProperties = array();
  57. // Open OpenXML package
  58. $package = new ZipArchive();
  59. $package->open($fileName);
  60. // Read relations and search for officeDocument
  61. $relationsXml = $package->getFromName('_rels/.rels');
  62. if ($relationsXml === false) {
  63. require_once 'Zend/Search/Lucene/Exception.php';
  64. throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .docx file.');
  65. }
  66. $relations = Zend_Xml_Security::scan($relationsXml);
  67. foreach($relations->Relationship as $rel) {
  68. if ($rel ["Type"] == Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) {
  69. // Found office document! Read in contents...
  70. $contents = Zend_Xml_Security::scan($package->getFromName(
  71. $this->absoluteZipPath(dirname($rel['Target'])
  72. . '/'
  73. . basename($rel['Target']))
  74. ));
  75. $contents->registerXPathNamespace('w', Zend_Search_Lucene_Document_Docx::SCHEMA_WORDPROCESSINGML);
  76. $paragraphs = $contents->xpath('//w:body/w:p');
  77. foreach ($paragraphs as $paragraph) {
  78. $runs = $paragraph->xpath('.//w:r/*[name() = "w:t" or name() = "w:br"]');
  79. if ($runs === false) {
  80. // Paragraph doesn't contain any text or breaks
  81. continue;
  82. }
  83. foreach ($runs as $run) {
  84. if ($run->getName() == 'br') {
  85. // Break element
  86. $documentBody[] = ' ';
  87. } else {
  88. $documentBody[] = (string)$run;
  89. }
  90. }
  91. // Add space after each paragraph. So they are not bound together.
  92. $documentBody[] = ' ';
  93. }
  94. break;
  95. }
  96. }
  97. // Read core properties
  98. $coreProperties = $this->extractMetaData($package);
  99. // Close file
  100. $package->close();
  101. // Store filename
  102. $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));
  103. // Store contents
  104. if ($storeContent) {
  105. $this->addField(Zend_Search_Lucene_Field::Text('body', implode('', $documentBody), 'UTF-8'));
  106. } else {
  107. $this->addField(Zend_Search_Lucene_Field::UnStored('body', implode('', $documentBody), 'UTF-8'));
  108. }
  109. // Store meta data properties
  110. foreach ($coreProperties as $key => $value) {
  111. $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
  112. }
  113. // Store title (if not present in meta data)
  114. if (! isset($coreProperties['title'])) {
  115. $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
  116. }
  117. }
  118. /**
  119. * Load Docx document from a file
  120. *
  121. * @param string $fileName
  122. * @param boolean $storeContent
  123. * @return Zend_Search_Lucene_Document_Docx
  124. * @throws Zend_Search_Lucene_Document_Exception
  125. */
  126. public static function loadDocxFile($fileName, $storeContent = false) {
  127. if (!is_readable($fileName)) {
  128. require_once 'Zend/Search/Lucene/Document/Exception.php';
  129. throw new Zend_Search_Lucene_Document_Exception('Provided file \'' . $fileName . '\' is not readable.');
  130. }
  131. return new Zend_Search_Lucene_Document_Docx($fileName, $storeContent);
  132. }
  133. }