DocumentWriter.php 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Index
  18. * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. */
  21. /** Zend_Search_Lucene_Analysis_Analyzer */
  22. require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
  23. /** Zend_Search_Lucene_Index_SegmentWriter */
  24. require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
  /**
   * Segment writer that accumulates a term dictionary and per-document term
   * positions in memory as documents are added one at a time, and writes the
   * segment out when close() is called.
   *
   * @category   Zend
   * @package    Zend_Search_Lucene
   * @subpackage Index
   * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
   * @license    http://framework.zend.com/license/new-bsd     New BSD License
   */
  class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_Lucene_Index_SegmentWriter
  {
      /**
       * Term dictionary.
       *
       * Maps a term key (the value of Zend_Search_Lucene_Index_Term::key()) to
       * the corresponding Zend_Search_Lucene_Index_Term object. The documents
       * and positions for each term are kept under the same key in $_termDocs.
       *
       * @var array
       */
      protected $_termDictionary;

      /**
       * Documents which contain each term.
       *
       * Structure: term key => document number => list of term positions
       * within that document.
       *
       * @var array
       */
      protected $_termDocs;

      /**
       * Object constructor.
       *
       * Delegates to the parent writer and starts with an empty term
       * dictionary and an empty term/document map.
       *
       * @param Zend_Search_Lucene_Storage_Directory $directory
       * @param string $name
       */
      public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
      {
          parent::__construct($directory, $name);

          $this->_termDocs       = array();
          $this->_termDictionary = array();
      }

      /**
       * Adds a document to this segment.
       *
       * Every field is registered with the writer. Indexed fields contribute
       * their terms and positions to the in-memory dictionary and a one-byte
       * norm; stored fields are handed to addStoredFields().
       *
       * Fields requesting term vector storage are not supported. All fields
       * are checked for this before any writer state is touched, so a rejected
       * document does not leave the segment partially updated.
       *
       * @param Zend_Search_Lucene_Document $document
       * @throws Zend_Search_Lucene_Exception
       */
      public function addDocument(Zend_Search_Lucene_Document $document)
      {
          /** Zend_Search_Lucene_Search_Similarity */
          require_once 'Zend/Search/Lucene/Search/Similarity.php';
          /** Zend_Search_Lucene_Index_Term */
          require_once 'Zend/Search/Lucene/Index/Term.php';

          // Validation pass: collect the fields and reject unsupported ones
          // up front, before anything is registered or indexed.
          $fields = array();
          foreach ($document->getFieldNames() as $fieldName) {
              $field = $document->getField($fieldName);

              if ($field->storeTermVector) {
                  /**
                   * @todo term vector storing support
                   */
                  require_once 'Zend/Search/Lucene/Exception.php';
                  throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
              }

              $fields[] = $field;
          }

          $storedFields = array();
          $docNorms     = array();
          $similarity   = Zend_Search_Lucene_Search_Similarity::getDefault();

          foreach ($fields as $field) {
              $this->addField($field);

              if ($field->isIndexed) {
                  if ($field->isTokenized) {
                      $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
                      $analyzer->setInput($field->value, $field->encoding);

                      // Positions are the running sum of the position
                      // increments reported by the analyzer's tokens.
                      $position   = 0;
                      $tokenCount = 0;
                      while (($token = $analyzer->nextToken()) !== null) {
                          $tokenCount++;

                          $term      = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                          $position += $token->getPositionIncrement();

                          $this->_recordTermPosition($term, $position);
                      }
                  } else {
                      // An untokenized field is indexed as one term holding
                      // the whole value, at position 0.
                      $tokenCount = 1;
                      $term       = new Zend_Search_Lucene_Index_Term($field->getUtf8Value(), $field->name);

                      $this->_recordTermPosition($term, 0);
                  }

                  $docNorms[$field->name] = chr($similarity->encodeNorm(
                      $similarity->lengthNorm($field->name, $tokenCount)
                      * $document->boost
                      * $field->boost
                  ));
              }

              if ($field->isStored) {
                  $storedFields[] = $field;
              }
          }

          // Append one norm byte per indexed field known to the writer, so
          // every norm string gains exactly one byte for this document.
          foreach ($this->_fields as $fieldName => $field) {
              if (!$field->isIndexed) {
                  continue;
              }

              if (!isset($this->_norms[$fieldName])) {
                  // Field seen for the first time: back-fill the documents
                  // added earlier with the norm of a zero-length field.
                  $this->_norms[$fieldName] = str_repeat(
                      chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0))),
                      $this->_docCount
                  );
              }

              if (isset($docNorms[$fieldName])) {
                  $this->_norms[$fieldName] .= $docNorms[$fieldName];
              } else {
                  // This document has no such indexed field.
                  $this->_norms[$fieldName] .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0)));
              }
          }

          $this->addStoredFields($storedFields);
      }

      /**
       * Records one occurrence of a term in the document currently being added.
       *
       * Registers the term in the dictionary on first sight, makes sure the
       * term has a position list for the current document number, and appends
       * the position to it.
       *
       * @param Zend_Search_Lucene_Index_Term $term
       * @param integer $position
       */
      private function _recordTermPosition(Zend_Search_Lucene_Index_Term $term, $position)
      {
          $termKey = $term->key();

          if (!isset($this->_termDictionary[$termKey])) {
              // New term
              $this->_termDictionary[$termKey] = $term;
              $this->_termDocs[$termKey]       = array();
          }

          if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
              // First occurrence of the term in the current document
              $this->_termDocs[$termKey][$this->_docCount] = array();
          }

          $this->_termDocs[$termKey][$this->_docCount][] = $position;
      }

      /**
       * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
       *
       * Terms are written in string order of their keys, each together with
       * the documents and positions collected for it.
       */
      protected function _dumpDictionary()
      {
          ksort($this->_termDictionary, SORT_STRING);

          $this->initializeDictionaryFiles();

          foreach ($this->_termDictionary as $termId => $term) {
              $this->addTerm($term, $this->_termDocs[$termId]);
          }

          $this->closeDictionaryFiles();
      }

      /**
       * Close segment, write it to disk and return segment info
       *
       * Nothing is written and null is returned when no documents were added.
       *
       * @return Zend_Search_Lucene_Index_SegmentInfo|null
       */
      public function close()
      {
          if ($this->_docCount == 0) {
              return null;
          }

          $this->_dumpFNM();
          $this->_dumpDictionary();
          $this->_generateCFS();

          // NOTE(review): the trailing positional arguments (-1, null, true, true)
          // are defined by the Zend_Search_Lucene_Index_SegmentInfo constructor,
          // which is not visible in this file - confirm their meaning there.
          return new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
                                                          $this->_name,
                                                          $this->_docCount,
                                                          -1,
                                                          null,
                                                          true,
                                                          true);
      }
  }