DocumentWriter.php 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Index
  18. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id$
  21. */
  22. /** Zend_Search_Lucene_Index_SegmentWriter */
  23. require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
  24. /**
  25. * @category Zend
  26. * @package Zend_Search_Lucene
  27. * @subpackage Index
  28. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  29. * @license http://framework.zend.com/license/new-bsd New BSD License
  30. */
  class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_Lucene_Index_SegmentWriter
  {
      /**
       * Term dictionary.
       *
       * Maps a term key (as returned by Zend_Search_Lucene_Index_Term::key())
       * to the Zend_Search_Lucene_Index_Term object it was produced from.
       *
       * @var array
       */
      protected $_termDictionary;

      /**
       * Postings collected for each term.
       *
       * Layout: term key => array(document number => array of term positions).
       * The document number used is the value of $this->_docCount at the time
       * the document is added.
       *
       * @var array
       */
      protected $_termDocs;

      /**
       * Object constructor.
       *
       * Delegates to the parent writer and starts with an empty term
       * dictionary and empty postings.
       *
       * @param Zend_Search_Lucene_Storage_Directory $directory
       * @param string $name
       */
      public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
      {
          parent::__construct($directory, $name);

          $this->_termDocs       = array();
          $this->_termDictionary = array();
      }

      /**
       * Adds a document to this segment.
       *
       * For every field of the document this method registers the field with
       * the writer (addField()), collects term positions for indexed fields,
       * computes the per-field norm byte, and finally hands the stored fields
       * to addStoredFields().
       *
       * All fields are validated before any writer state is modified, so a
       * rejected document leaves the segment untouched.
       *
       * @param Zend_Search_Lucene_Document $document
       * @throws Zend_Search_Lucene_Exception if any field requests term vector storing
       */
      public function addDocument(Zend_Search_Lucene_Document $document)
      {
          /** Zend_Search_Lucene_Search_Similarity */
          require_once 'Zend/Search/Lucene/Search/Similarity.php';

          // Resolve and validate every field up front. Postings below are keyed by
          // $this->_docCount, which this method never advances itself; throwing
          // half-way through a document would leave already-written postings and
          // field registrations behind for a document that was never completed.
          $fields = array();
          foreach ($document->getFieldNames() as $fieldName) {
              $field = $document->getField($fieldName);

              if ($field->storeTermVector) {
                  /**
                   * @todo term vector storing support
                   */
                  require_once 'Zend/Search/Lucene/Exception.php';
                  throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
              }

              $fields[] = $field;
          }

          $storedFields = array();
          $docNorms     = array();
          $similarity   = Zend_Search_Lucene_Search_Similarity::getDefault();

          foreach ($fields as $field) {
              $this->addField($field);

              if ($field->isIndexed) {
                  if ($field->isTokenized) {
                      /** Zend_Search_Lucene_Analysis_Analyzer */
                      require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

                      $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
                      $analyzer->setInput($field->value, $field->encoding);

                      $position  = 0;
                      $termCount = 0;
                      while (($token = $analyzer->nextToken()) !== null) {
                          $termCount++;

                          $term      = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                          $position += $token->getPositionIncrement();

                          $this->_recordTermPosition($term, $position);
                      }
                  } else {
                      // An untokenized field contributes its whole value as a single term at position 0.
                      $termCount = 1;

                      $this->_recordTermPosition(
                          new Zend_Search_Lucene_Index_Term($field->getUtf8Value(), $field->name),
                          0
                      );
                  }

                  // One norm byte per indexed field: length norm scaled by document and field boosts.
                  $docNorms[$field->name] = chr($similarity->encodeNorm(
                      $similarity->lengthNorm($field->name, $termCount) * $document->boost * $field->boost
                  ));
              }

              if ($field->isStored) {
                  $storedFields[] = $field;
              }
          }

          // $this->_fields and $this->_norms are not declared in this class
          // (presumably inherited from the parent writer - confirm there).
          foreach ($this->_fields as $fieldName => $registeredField) {
              if (!$registeredField->isIndexed) {
                  continue;
              }

              if (!isset($this->_norms[$fieldName])) {
                  // First time this field gets norms: pad with the "zero terms" norm,
                  // one byte for each of the $this->_docCount earlier documents.
                  $this->_norms[$fieldName] = str_repeat(
                      chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0))),
                      $this->_docCount
                  );
              }

              if (isset($docNorms[$fieldName])) {
                  $this->_norms[$fieldName] .= $docNorms[$fieldName];
              } else {
                  // Field is known to the segment but not indexed in this document.
                  $this->_norms[$fieldName] .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0)));
              }
          }

          $this->addStoredFields($storedFields);
      }

      /**
       * Records one occurrence of a term in the document currently being added.
       *
       * Registers the term in the dictionary on first sight, opens a position
       * list for the current document number ($this->_docCount) if there is
       * none yet, and appends the position to it.
       *
       * @param Zend_Search_Lucene_Index_Term $term
       * @param integer $position
       */
      private function _recordTermPosition(Zend_Search_Lucene_Index_Term $term, $position)
      {
          $termKey = $term->key();

          if (!isset($this->_termDictionary[$termKey])) {
              // New term
              $this->_termDictionary[$termKey] = $term;
              $this->_termDocs[$termKey]       = array();
          }

          if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
              // First occurrence of this term in the current document
              $this->_termDocs[$termKey][$this->_docCount] = array();
          }

          $this->_termDocs[$termKey][$this->_docCount][] = $position;
      }

      /**
       * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
       *
       * Terms are written in string order of their keys; each term is passed
       * to addTerm() together with its collected postings.
       */
      protected function _dumpDictionary()
      {
          ksort($this->_termDictionary, SORT_STRING);

          $this->initializeDictionaryFiles();

          foreach ($this->_termDictionary as $termKey => $term) {
              $this->addTerm($term, $this->_termDocs[$termKey]);
          }

          $this->closeDictionaryFiles();
      }

      /**
       * Close segment, write it to disk and return segment info
       *
       * Nothing is written when no documents have been added.
       *
       * @return Zend_Search_Lucene_Index_SegmentInfo|null  null if the segment contains no documents
       */
      public function close()
      {
          if ($this->_docCount == 0) {
              return null;
          }

          $this->_dumpFNM();
          $this->_dumpDictionary();
          $this->_generateCFS();

          /** Zend_Search_Lucene_Index_SegmentInfo */
          require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';

          // The trailing arguments are passed positionally; their meaning is defined by
          // Zend_Search_Lucene_Index_SegmentInfo::__construct(), which is not visible here.
          return new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
                                                          $this->_name,
                                                          $this->_docCount,
                                                          -1,
                                                          null,
                                                          true,
                                                          true);
      }
  }