SegmentMerger.php 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Index
  18. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id$
  21. */
  22. /** Zend_Search_Lucene_Index_SegmentInfo */
  23. require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
  24. /** Zend_Search_Lucene_Index_SegmentWriter_StreamWriter */
  25. require_once 'Zend/Search/Lucene/Index/SegmentWriter/StreamWriter.php';
  26. /** Zend_Search_Lucene_Index_TermsPriorityQueue */
  27. require_once 'Zend/Search/Lucene/Index/TermsPriorityQueue.php';
  28. /**
  29. * @category Zend
  30. * @package Zend_Search_Lucene
  31. * @subpackage Index
  32. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  33. * @license http://framework.zend.com/license/new-bsd New BSD License
  34. */
  35. class Zend_Search_Lucene_Index_SegmentMerger
  36. {
  37. /**
  38. * Target segment writer
  39. *
  40. * @var Zend_Search_Lucene_Index_SegmentWriter_StreamWriter
  41. */
  42. private $_writer;
  43. /**
  44. * Number of docs in a new segment
  45. *
  46. * @var integer
  47. */
  48. private $_docCount;
  49. /**
  50. * A set of segments to be merged
  51. *
  52. * @var array Zend_Search_Lucene_Index_SegmentInfo
  53. */
  54. private $_segmentInfos = array();
  55. /**
  56. * Flag to signal, that merge is already done
  57. *
  58. * @var boolean
  59. */
  60. private $_mergeDone = false;
  61. /**
  62. * Field map
  63. * [<segment_name>][<field_number>] => <target_field_number>
  64. *
  65. * @var array
  66. */
  67. private $_fieldsMap = array();
  68. /**
  69. * Object constructor.
  70. *
  71. * Creates new segment merger with $directory as target to merge segments into
  72. * and $name as a name of new segment
  73. *
  74. * @param Zend_Search_Lucene_Storage_Directory $directory
  75. * @param string $name
  76. */
  77. public function __construct($directory, $name)
  78. {
  79. $this->_writer = new Zend_Search_Lucene_Index_SegmentWriter_StreamWriter($directory, $name);
  80. }
  81. /**
  82. * Add segmnet to a collection of segments to be merged
  83. *
  84. * @param Zend_Search_Lucene_Index_SegmentInfo $segment
  85. */
  86. public function addSource(Zend_Search_Lucene_Index_SegmentInfo $segmentInfo)
  87. {
  88. $this->_segmentInfos[$segmentInfo->getName()] = $segmentInfo;
  89. }
  90. /**
  91. * Do merge.
  92. *
  93. * Returns number of documents in newly created segment
  94. *
  95. * @return Zend_Search_Lucene_Index_SegmentInfo
  96. * @throws Zend_Search_Lucene_Exception
  97. */
  98. public function merge()
  99. {
  100. if ($this->_mergeDone) {
  101. require_once 'Zend/Search/Lucene/Exception.php';
  102. throw new Zend_Search_Lucene_Exception('Merge is already done.');
  103. }
  104. if (count($this->_segmentInfos) < 1) {
  105. require_once 'Zend/Search/Lucene/Exception.php';
  106. throw new Zend_Search_Lucene_Exception('Wrong number of segments to be merged ('
  107. . count($this->_segmentInfos)
  108. . ').');
  109. }
  110. $this->_mergeFields();
  111. $this->_mergeNorms();
  112. $this->_mergeStoredFields();
  113. $this->_mergeTerms();
  114. $this->_mergeDone = true;
  115. return $this->_writer->close();
  116. }
  117. /**
  118. * Merge fields information
  119. */
  120. private function _mergeFields()
  121. {
  122. foreach ($this->_segmentInfos as $segName => $segmentInfo) {
  123. foreach ($segmentInfo->getFieldInfos() as $fieldInfo) {
  124. $this->_fieldsMap[$segName][$fieldInfo->number] = $this->_writer->addFieldInfo($fieldInfo);
  125. }
  126. }
  127. }
  128. /**
  129. * Merge field's normalization factors
  130. */
  131. private function _mergeNorms()
  132. {
  133. foreach ($this->_writer->getFieldInfos() as $fieldInfo) {
  134. if ($fieldInfo->isIndexed) {
  135. foreach ($this->_segmentInfos as $segName => $segmentInfo) {
  136. if ($segmentInfo->hasDeletions()) {
  137. $srcNorm = $segmentInfo->normVector($fieldInfo->name);
  138. $norm = '';
  139. $docs = $segmentInfo->count();
  140. for ($count = 0; $count < $docs; $count++) {
  141. if (!$segmentInfo->isDeleted($count)) {
  142. $norm .= $srcNorm[$count];
  143. }
  144. }
  145. $this->_writer->addNorm($fieldInfo->name, $norm);
  146. } else {
  147. $this->_writer->addNorm($fieldInfo->name, $segmentInfo->normVector($fieldInfo->name));
  148. }
  149. }
  150. }
  151. }
  152. }
  153. /**
  154. * Merge fields information
  155. */
  156. private function _mergeStoredFields()
  157. {
  158. $this->_docCount = 0;
  159. foreach ($this->_segmentInfos as $segName => $segmentInfo) {
  160. $fdtFile = $segmentInfo->openCompoundFile('.fdt');
  161. for ($count = 0; $count < $segmentInfo->count(); $count++) {
  162. $fieldCount = $fdtFile->readVInt();
  163. $storedFields = array();
  164. for ($count2 = 0; $count2 < $fieldCount; $count2++) {
  165. $fieldNum = $fdtFile->readVInt();
  166. $bits = $fdtFile->readByte();
  167. $fieldInfo = $segmentInfo->getField($fieldNum);
  168. if (!($bits & 2)) { // Text data
  169. $storedFields[] =
  170. new Zend_Search_Lucene_Field($fieldInfo->name,
  171. $fdtFile->readString(),
  172. 'UTF-8',
  173. true,
  174. $fieldInfo->isIndexed,
  175. $bits & 1 );
  176. } else { // Binary data
  177. $storedFields[] =
  178. new Zend_Search_Lucene_Field($fieldInfo->name,
  179. $fdtFile->readBinary(),
  180. '',
  181. true,
  182. $fieldInfo->isIndexed,
  183. $bits & 1,
  184. true);
  185. }
  186. }
  187. if (!$segmentInfo->isDeleted($count)) {
  188. $this->_docCount++;
  189. $this->_writer->addStoredFields($storedFields);
  190. }
  191. }
  192. }
  193. }
  194. /**
  195. * Merge fields information
  196. */
  197. private function _mergeTerms()
  198. {
  199. $segmentInfoQueue = new Zend_Search_Lucene_Index_TermsPriorityQueue();
  200. $segmentStartId = 0;
  201. foreach ($this->_segmentInfos as $segName => $segmentInfo) {
  202. $segmentStartId = $segmentInfo->resetTermsStream($segmentStartId, Zend_Search_Lucene_Index_SegmentInfo::SM_MERGE_INFO);
  203. // Skip "empty" segments
  204. if ($segmentInfo->currentTerm() !== null) {
  205. $segmentInfoQueue->put($segmentInfo);
  206. }
  207. }
  208. $this->_writer->initializeDictionaryFiles();
  209. $termDocs = array();
  210. while (($segmentInfo = $segmentInfoQueue->pop()) !== null) {
  211. // Merge positions array
  212. $termDocs += $segmentInfo->currentTermPositions();
  213. if ($segmentInfoQueue->top() === null ||
  214. $segmentInfoQueue->top()->currentTerm()->key() !=
  215. $segmentInfo->currentTerm()->key()) {
  216. // We got new term
  217. ksort($termDocs, SORT_NUMERIC);
  218. // Add term if it's contained in any document
  219. if (count($termDocs) > 0) {
  220. $this->_writer->addTerm($segmentInfo->currentTerm(), $termDocs);
  221. }
  222. $termDocs = array();
  223. }
  224. $segmentInfo->nextTerm();
  225. // check, if segment dictionary is finished
  226. if ($segmentInfo->currentTerm() !== null) {
  227. // Put segment back into the priority queue
  228. $segmentInfoQueue->put($segmentInfo);
  229. }
  230. }
  231. $this->_writer->closeDictionaryFiles();
  232. }
  233. }