| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130 |
- <?php
- /**
- * Zend Framework
- *
- * LICENSE
- *
- * This source file is subject to the new BSD license that is bundled
- * with this package in the file LICENSE.txt.
- * It is also available through the world-wide-web at this URL:
- * http://framework.zend.com/license/new-bsd
- * If you did not receive a copy of the license and are unable to
- * obtain it through the world-wide-web, please send an email
- * to license@zend.com so we can send you a copy immediately.
- *
- * @category Zend
- * @package Zend_Search_Lucene
- * @subpackage Index
- * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
- * @license http://framework.zend.com/license/new-bsd New BSD License
- * @version $Id$
- */
- /** Zend_Search_Lucene_Index_TermsStream_Interface */
- require_once 'Zend/Search/Lucene/Index/TermsStream/Interface.php';
- /** Zend_Search_Lucene_Search_Similarity */
- require_once 'Zend/Search/Lucene/Search/Similarity.php';
- /** Zend_Search_Lucene_Index_FieldInfo */
- require_once 'Zend/Search/Lucene/Index/FieldInfo.php';
- /** Zend_Search_Lucene_Index_Term */
- require_once 'Zend/Search/Lucene/Index/Term.php';
- /** Zend_Search_Lucene_Index_TermInfo */
- require_once 'Zend/Search/Lucene/Index/TermInfo.php';
- /**
- * @category Zend
- * @package Zend_Search_Lucene
- * @subpackage Index
- * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
- * @license http://framework.zend.com/license/new-bsd New BSD License
- */
- class Zend_Search_Lucene_Index_SegmentInfo implements Zend_Search_Lucene_Index_TermsStream_Interface
- {
- /**
- * "Full scan vs fetch" boundary.
- *
- * If filter selectivity is less than this value, then full scan is performed
- * (since term entries fetching has some additional overhead).
- */
- const FULL_SCAN_VS_FETCH_BOUNDARY = 5;
- /**
- * Number of docs in a segment
- *
- * @var integer
- */
- private $_docCount;
- /**
- * Segment name
- *
- * @var string
- */
- private $_name;
- /**
- * Term Dictionary Index
- *
- * Array of arrays (Zend_Search_Lucene_Index_Term objects are represented as arrays because
- * of performance considerations)
- * [0] -> $termFieldNum
- * [1] -> $termValue
- *
- * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
- *
- * @var array
- */
- private $_termDictionary;
- /**
- * Term Dictionary Index TermInfos
- *
- * Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because
- * of performance considerations)
- * [0] -> $docFreq
- * [1] -> $freqPointer
- * [2] -> $proxPointer
- * [3] -> $skipOffset
- * [4] -> $indexPointer
- *
- * @var array
- */
- private $_termDictionaryInfos;
- /**
- * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
- *
- * @var array
- */
- private $_fields;
- /**
- * Field positions in a dictionary.
- * (Term dictionary contains fields ordered by names)
- *
- * @var array
- */
- private $_fieldsDicPositions;
- /**
- * Associative array where the key is the file name and the value is data offset
- * in a compound segment file (.cfs).
- *
- * @var array
- */
- private $_segFiles;
- /**
- * Associative array where the key is the file name and the value is file size (.cfs).
- *
- * @var array
- */
- private $_segFileSizes;
- /**
- * Delete file generation number
- *
- * -2 means autodetect latest delete generation
- * -1 means 'there is no delete file'
- * 0 means pre-2.1 format delete file
- * X specifies used delete file
- *
- * @var integer
- */
- private $_delGen;
- /**
- * Segment has single norms file
- *
- * If true then one .nrm file is used for all fields
- * Otherwise .fN files are used
- *
- * @var boolean
- */
- private $_hasSingleNormFile;
- /**
- * Use compound segment file (*.cfs) to collect all other segment files
- * (excluding .del files)
- *
- * @var boolean
- */
- private $_isCompound;
- /**
- * File system adapter.
- *
- * @var Zend_Search_Lucene_Storage_Directory_Filesystem
- */
- private $_directory;
- /**
- * Normalization factors.
- * An array fieldName => normVector
- * normVector is a binary string.
- * Each byte corresponds to an indexed document in a segment and
- * encodes normalization factor (float value, encoded by
- * Zend_Search_Lucene_Search_Similarity::encodeNorm())
- *
- * @var array
- */
- private $_norms = array();
- /**
- * List of deleted documents.
- * bitset if bitset extension is loaded or array otherwise.
- *
- * @var mixed
- */
- private $_deleted = null;
- /**
- * $this->_deleted update flag
- *
- * @var boolean
- */
- private $_deletedDirty = false;
- /**
- * True if segment uses shared doc store
- *
- * @var boolean
- */
- private $_usesSharedDocStore;
- /*
- * Shared doc store options.
- * It's an associative array with the following items:
- * - 'offset' => $docStoreOffset The starting document in the shared doc store files where this segment's documents begin
- * - 'segment' => $docStoreSegment The name of the segment that has the shared doc store files.
- * - 'isCompound' => $docStoreIsCompoundFile True, if compound file format is used for the shared doc store files (.cfx file).
- */
- private $_sharedDocStoreOptions;
- /**
- * Zend_Search_Lucene_Index_SegmentInfo constructor
- *
- * Performs all segment initialisation up front:
- * - if shared doc store options are given and flagged as compound, parses the
- *   '.cfx' file table into $_sharedDocStoreOptions['files'] / ['fileSizes'];
- * - if $isCompound is null, probes for '<name>.cfs' to decide whether the
- *   segment is stored in a compound file;
- * - for compound segments, parses the '.cfs' file table into
- *   $_segFiles / $_segFileSizes;
- * - reads field infos from the '.fnm' file and builds the
- *   field number => dictionary position map;
- * - resolves the delete generation (when -2 is passed) and loads deletions.
- *
- * @param Zend_Search_Lucene_Storage_Directory $directory
- * @param string $name
- * @param integer $docCount
- * @param integer $delGen -2 = autodetect, -1 = no delete file, 0 = pre-2.1 delete file, otherwise the generation number
- * @param array|null $docStoreOptions shared doc store options ('offset', 'segment', 'isCompound') or null
- * @param boolean $hasSingleNormFile
- * @param boolean $isCompound null means 'unknown': detect by probing for the '.cfs' file
- * @throws Zend_Search_Lucene_Exception
- */
- public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name, $docCount, $delGen = 0, $docStoreOptions = null, $hasSingleNormFile = false, $isCompound = null)
- {
- $this->_directory = $directory;
- $this->_name = $name;
- $this->_docCount = $docCount;
- if ($docStoreOptions !== null) {
- $this->_usesSharedDocStore = true;
- $this->_sharedDocStoreOptions = $docStoreOptions;
- if ($docStoreOptions['isCompound']) {
- // Parse the file table of the shared doc store compound file:
- // a VInt entry count followed by (long data offset, string file name) pairs
- $cfxFile = $this->_directory->getFileObject($docStoreOptions['segment'] . '.cfx');
- $cfxFilesCount = $cfxFile->readVInt();
- $cfxFiles = array();
- $cfxFileSizes = array();
- for ($count = 0; $count < $cfxFilesCount; $count++) {
- $dataOffset = $cfxFile->readLong();
- if ($count != 0) {
- // $fileName still holds the previous entry's name here:
- // its size is the distance between the two consecutive data offsets
- $cfxFileSizes[$fileName] = $dataOffset - end($cfxFiles);
- }
- $fileName = $cfxFile->readString();
- $cfxFiles[$fileName] = $dataOffset;
- }
- if ($count != 0) {
- // The last entry extends to the end of the compound file
- $cfxFileSizes[$fileName] = $this->_directory->fileLength($docStoreOptions['segment'] . '.cfx') - $dataOffset;
- }
- $this->_sharedDocStoreOptions['files'] = $cfxFiles;
- $this->_sharedDocStoreOptions['fileSizes'] = $cfxFileSizes;
- }
- }
- $this->_hasSingleNormFile = $hasSingleNormFile;
- $this->_delGen = $delGen;
- // Term dictionary index is loaded lazily (see getTermInfo())
- $this->_termDictionary = null;
- if ($isCompound !== null) {
- $this->_isCompound = $isCompound;
- } else {
- // It's a pre-2.1 segment or isCompound is set to 'unknown'
- // Detect if segment uses compound file
- require_once 'Zend/Search/Lucene/Exception.php';
- try {
- // Try to open compound file
- $this->_directory->getFileObject($name . '.cfs');
- // Compound file is found
- $this->_isCompound = true;
- } catch (Zend_Search_Lucene_Exception $e) {
- // NOTE(review): detection depends on the wording of the exception message
- if (strpos($e->getMessage(), 'is not readable') !== false) {
- // Compound file is not found or is not readable
- $this->_isCompound = false;
- } else {
- throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
- }
- }
- }
- $this->_segFiles = array();
- if ($this->_isCompound) {
- // Parse the segment compound file table (same layout as the '.cfx' table above)
- $cfsFile = $this->_directory->getFileObject($name . '.cfs');
- $segFilesCount = $cfsFile->readVInt();
- for ($count = 0; $count < $segFilesCount; $count++) {
- $dataOffset = $cfsFile->readLong();
- if ($count != 0) {
- // Size of the previous entry = current offset - previous offset
- $this->_segFileSizes[$fileName] = $dataOffset - end($this->_segFiles);
- }
- $fileName = $cfsFile->readString();
- $this->_segFiles[$fileName] = $dataOffset;
- }
- if ($count != 0) {
- // The last entry extends to the end of the compound file
- $this->_segFileSizes[$fileName] = $this->_directory->fileLength($name . '.cfs') - $dataOffset;
- }
- }
- // Read field infos: a VInt field count followed by (string name, byte flags) pairs
- $fnmFile = $this->openCompoundFile('.fnm');
- $fieldsCount = $fnmFile->readVInt();
- $fieldNames = array();
- $fieldNums = array();
- $this->_fields = array();
- for ($count=0; $count < $fieldsCount; $count++) {
- $fieldName = $fnmFile->readString();
- $fieldBits = $fnmFile->readByte();
- $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
- $fieldBits & 0x01 /* field is indexed */,
- $count,
- $fieldBits & 0x02 /* termvectors are stored */,
- $fieldBits & 0x10 /* norms are omitted */,
- $fieldBits & 0x20 /* payloads are stored */);
- if ($fieldBits & 0x10) {
- // norms are omitted for the indexed field
- // Pre-fill a norm vector with encodeNorm(1.0) for each of the $docCount documents
- $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
- }
- $fieldNums[$count] = $count;
- $fieldNames[$count] = $fieldName;
- }
- // Reorder field numbers by field name, then flip the result so that it maps
- // field number => position in the name-ordered list
- array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums);
- $this->_fieldsDicPositions = array_flip($fieldNums);
- if ($this->_delGen == -2) {
- // SegmentInfo constructor is invoked from index writer
- // Autodetect current delete file generation number
- $this->_delGen = $this->_detectLatestDelGen();
- }
- // Load deletions
- $this->_deleted = $this->_loadDelFile();
- }
- /**
-  * Load deletions file
-  *
-  * Dispatches on the delete file generation number ($this->_delGen) to the
-  * matching loader.
-  * Returns bitset or an array depending on bitset extension availability,
-  * or null when the segment has no delete file.
-  *
-  * @return mixed
-  * @throws Zend_Search_Lucene_Exception
-  */
- private function _loadDelFile()
- {
-     // -1: there is no delete file for this segment
-     if ($this->_delGen == -1) {
-         return null;
-     }
-
-     // 0: segment with a pre-2.1 format delete file
-     if ($this->_delGen == 0) {
-         return $this->_loadPre21DelFile();
-     }
-
-     // Anything else: 2.1+ format deletions file
-     return $this->_load21DelFile();
- }
- /**
-  * Load pre-2.1 deletions file
-  *
-  * Reads '<segment name>.del' (always stored as a separate file, never inside
-  * the segment compound file) and returns the deletions.
-  * Returns bitset or an array depending on bitset extension availability.
-  * If the file is not readable, the delete generation is reset to -1 and
-  * null is returned.
-  *
-  * @return mixed
-  * @throws Zend_Search_Lucene_Exception
-  */
- private function _loadPre21DelFile()
- {
-     require_once 'Zend/Search/Lucene/Exception.php';
-     try {
-         // '.del' files always stored in a separate file
-         // Segment compound is not used
-         $delFile = $this->_directory->getFileObject($this->_name . '.del');
-
-         // The first int is divided by 8 (rounded up) to get the byte length of the bit vector
-         $byteCount = $delFile->readInt();
-         $byteCount = ceil($byteCount/8);
-         $bitCount  = $delFile->readInt();
-
-         if ($bitCount == 0) {
-             $delBytes = '';
-         } else {
-             $delBytes = $delFile->readBytes($byteCount);
-         }
-
-         if (extension_loaded('bitset')) {
-             return $delBytes;
-         }
-
-         // Expand the bit vector into a docId => 1 map.
-         // Iterate over the bytes actually read: $delBytes is empty when
-         // $bitCount == 0, so looping up to $byteCount would read
-         // uninitialized string offsets.
-         $deletions      = array();
-         $availableBytes = strlen($delBytes);
-         for ($count = 0; $count < $availableBytes; $count++) {
-             $byte = ord($delBytes[$count]);
-             for ($bit = 0; $bit < 8; $bit++) {
-                 if ($byte & (1<<$bit)) {
-                     $deletions[$count*8 + $bit] = 1;
-                 }
-             }
-         }
-
-         return $deletions;
-     } catch(Zend_Search_Lucene_Exception $e) {
-         // NOTE(review): "file is missing" is recognised by the exception message text
-         if (strpos($e->getMessage(), 'is not readable') === false) {
-             throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
-         }
-         // There is no deletion file
-         $this->_delGen = -1;
-         return null;
-     }
- }
- /**
-  * Load 2.1+ format deletions file
-  *
-  * Reads '<segment name>_<delGen in base 36>.del'. Two layouts are handled:
-  *  - format marker 0xFFFFFFFF: sparse list of (VInt byte gap, non-zero byte) entries;
-  *  - otherwise the first int is used to size a plain bit vector.
-  *
-  * Returns bitset or an array depending on bitset extension availability.
-  * Without the bitset extension null is returned when no document is deleted.
-  *
-  * @return mixed
-  */
- private function _load21DelFile()
- {
-     $delFileName = $this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del';
-     $delFile     = $this->_directory->getFileObject($delFileName);
-     $useBitset   = extension_loaded('bitset');
-
-     $format = $delFile->readInt();
-
-     if ($format == (int)0xFFFFFFFF) {
-         $deletions = $useBitset ? bitset_empty() : array();
-
-         // Two ints follow the marker; their values are not needed here, but
-         // they must be consumed to reach the entry list
-         $delFile->readInt(); // byte count
-         $delFile->readInt(); // bit count
-
-         $delFileSize = $this->_directory->fileLength($delFileName);
-         $byteNum     = 0;
-
-         // Decode ALL entries up to the end of the file. (Returning from
-         // inside the loop would stop after the first entry and lose every
-         // later deletion; a do/while would read past the end of a file
-         // that has no entries.)
-         while ($delFile->tell() < $delFileSize) {
-             $dgap        = $delFile->readVInt();
-             $nonZeroByte = $delFile->readByte();
-
-             // Gaps are relative to the previously decoded byte
-             $byteNum += $dgap;
-
-             for ($bit = 0; $bit < 8; $bit++) {
-                 if ($nonZeroByte & (1<<$bit)) {
-                     if ($useBitset) {
-                         bitset_incl($deletions, $byteNum*8 + $bit);
-                     } else {
-                         $deletions[$byteNum*8 + $bit] = 1;
-                     }
-                 }
-             }
-         }
-
-         if ($useBitset) {
-             return $deletions;
-         }
-
-         return (count($deletions) > 0) ? $deletions : null;
-     }
-
-     // $format is actually byte count
-     $byteCount = ceil($format/8);
-     $bitCount  = $delFile->readInt();
-
-     if ($bitCount == 0) {
-         $delBytes = '';
-     } else {
-         $delBytes = $delFile->readBytes($byteCount);
-     }
-
-     if ($useBitset) {
-         return $delBytes;
-     }
-
-     // Iterate over the bytes actually read ($delBytes is empty when
-     // $bitCount == 0) to avoid reading uninitialized string offsets
-     $deletions      = array();
-     $availableBytes = strlen($delBytes);
-     for ($count = 0; $count < $availableBytes; $count++) {
-         $byte = ord($delBytes[$count]);
-         for ($bit = 0; $bit < 8; $bit++) {
-             if ($byte & (1<<$bit)) {
-                 $deletions[$count*8 + $bit] = 1;
-             }
-         }
-     }
-
-     return (count($deletions) > 0) ? $deletions : null;
- }
- /**
-  * Opens index file stored within compound index file
-  *
-  * The returned file object is positioned at the start of the requested data:
-  *  - '.fdx' / '.fdt' of a segment using a shared doc store are taken from the
-  *    doc store segment (separate files or its '.cfx' compound file) and
-  *    positioned at this segment's documents section;
-  *  - any other file is taken either from a separate file or from the
-  *    segment's '.cfs' compound file.
-  *
-  * @param string $extension
-  * @param boolean $shareHandler
-  * @throws Zend_Search_Lucene_Exception
-  * @return Zend_Search_Lucene_Storage_File
-  */
- public function openCompoundFile($extension, $shareHandler = true)
- {
-     if (($extension == '.fdx' || $extension == '.fdt') && $this->_usesSharedDocStore) {
-         $fdxFName = $this->_sharedDocStoreOptions['segment'] . '.fdx';
-         $fdtFName = $this->_sharedDocStoreOptions['segment'] . '.fdt';
-
-         if (!$this->_sharedDocStoreOptions['isCompound']) {
-             $fdxFile = $this->_directory->getFileObject($fdxFName, $shareHandler);
-             // Skip the 8-byte '.fdx' entries that precede this segment's documents
-             $fdxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR);
-
-             if ($extension == '.fdx') {
-                 // '.fdx' file is requested
-                 return $fdxFile;
-             } else {
-                 // '.fdt' file is requested
-                 $fdtStartOffset = $fdxFile->readLong();
-
-                 $fdtFile = $this->_directory->getFileObject($fdtFName, $shareHandler);
-                 $fdtFile->seek($fdtStartOffset, SEEK_CUR);
-
-                 return $fdtFile;
-             }
-         }
-
-         if( !isset($this->_sharedDocStoreOptions['files'][$fdxFName]) ) {
-             require_once 'Zend/Search/Lucene/Exception.php';
-             throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
-                                                  . $fdxFName . ' file.' );
-         }
-         if( !isset($this->_sharedDocStoreOptions['files'][$fdtFName]) ) {
-             require_once 'Zend/Search/Lucene/Exception.php';
-             throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
-                                                  . $fdtFName . ' file.' );
-         }
-
-         // Open shared docstore segment file
-         $cfxFile = $this->_directory->getFileObject($this->_sharedDocStoreOptions['segment'] . '.cfx', $shareHandler);
-         // Seek to the start of '.fdx' file within compound file
-         $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdxFName]);
-         // Seek to the start of current segment documents section
-         $cfxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR);
-
-         if ($extension == '.fdx') {
-             // '.fdx' file is requested
-             return $cfxFile;
-         } else {
-             // '.fdt' file is requested
-             $fdtStartOffset = $cfxFile->readLong();
-
-             // Seek to the start of '.fdt' file within compound file
-             $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdtFName]);
-             // Seek to the start of current segment documents section
-             $cfxFile->seek($fdtStartOffset, SEEK_CUR);
-
-             // The '.fdt' data lives inside the compound file, so the positioned
-             // compound file object is what must be returned (no separate
-             // $fdtFile object exists in this branch)
-             return $cfxFile;
-         }
-     }
-
-     $filename = $this->_name . $extension;
-
-     if (!$this->_isCompound) {
-         return $this->_directory->getFileObject($filename, $shareHandler);
-     }
-
-     if( !isset($this->_segFiles[$filename]) ) {
-         require_once 'Zend/Search/Lucene/Exception.php';
-         throw new Zend_Search_Lucene_Exception('Segment compound file doesn\'t contain '
-                                              . $filename . ' file.' );
-     }
-
-     $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler);
-     $file->seek($this->_segFiles[$filename]);
-     return $file;
- }
- /**
-  * Get compound file length
-  *
-  * Resolves the size of a segment file identified by its extension, looking
-  * in turn at the shared doc store (for '.fdx' / '.fdt'), at a separate file
-  * in the directory and finally at the sizes recorded for the compound file.
-  *
-  * @param string $extension
-  * @return integer
-  * @throws Zend_Search_Lucene_Exception
-  */
- public function compoundFileLength($extension)
- {
-     $isDocStoreFile = ($extension == '.fdx' || $extension == '.fdt');
-
-     if ($isDocStoreFile && $this->_usesSharedDocStore) {
-         $sharedName = $this->_sharedDocStoreOptions['segment'] . $extension;
-
-         // Doc store kept as separate files
-         if (!$this->_sharedDocStoreOptions['isCompound']) {
-             return $this->_directory->fileLength($sharedName);
-         }
-
-         // Doc store kept in a compound file: use the recorded size
-         if (isset($this->_sharedDocStoreOptions['fileSizes'][$sharedName])) {
-             return $this->_sharedDocStoreOptions['fileSizes'][$sharedName];
-         }
-
-         require_once 'Zend/Search/Lucene/Exception.php';
-         throw new Zend_Search_Lucene_Exception('Shared doc store compound file doesn\'t contain '
-                                              . $sharedName . ' file.' );
-     }
-
-     $segmentFile = $this->_name . $extension;
-
-     // Try to get common file first
-     if ($this->_directory->fileExists($segmentFile)) {
-         return $this->_directory->fileLength($segmentFile);
-     }
-
-     if (isset($this->_segFileSizes[$segmentFile])) {
-         return $this->_segFileSizes[$segmentFile];
-     }
-
-     require_once 'Zend/Search/Lucene/Exception.php';
-     throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
-                                          . $segmentFile . ' file.' );
- }
- /**
-  * Returns field index or -1 if field is not found
-  *
-  * Scans the segment's field infos and reports the number of the first
-  * field whose name equals $fieldName.
-  *
-  * @param string $fieldName
-  * @return integer
-  */
- public function getFieldNum($fieldName)
- {
-     $fieldNum = -1;
-
-     foreach ($this->_fields as $fieldInfo) {
-         if ($fieldInfo->name == $fieldName) {
-             $fieldNum = $fieldInfo->number;
-             break;
-         }
-     }
-
-     return $fieldNum;
- }
- /**
-  * Returns field info for specified field
-  *
-  * @param integer $fieldNum  field number (key of the segment fields array)
-  * @return Zend_Search_Lucene_Index_FieldInfo
-  */
- public function getField($fieldNum)
- {
-     $fieldInfo = $this->_fields[$fieldNum];
-
-     return $fieldInfo;
- }
- /**
-  * Returns array of fields.
-  * If $indexed parameter is true, then returns only indexed fields.
-  *
-  * The result maps each field name to itself.
-  *
-  * @param boolean $indexed
-  * @return array
-  */
- public function getFields($indexed = false)
- {
-     $names = array();
-
-     foreach ($this->_fields as $fieldInfo) {
-         // Skip non-indexed fields when only indexed ones are requested
-         if ($indexed && !$fieldInfo->isIndexed) {
-             continue;
-         }
-
-         $names[$fieldInfo->name] = $fieldInfo->name;
-     }
-
-     return $names;
- }
- /**
-  * Returns array of FieldInfo objects.
-  *
-  * @return array  Zend_Search_Lucene_Index_FieldInfo objects of this segment
-  */
- public function getFieldInfos()
- {
-     $fieldInfos = $this->_fields;
-
-     return $fieldInfos;
- }
- /**
-  * Returns actual deletions file generation number.
-  *
-  * @return integer  current value of the delete file generation number
-  */
- public function getDelGen()
- {
-     $generation = $this->_delGen;
-
-     return $generation;
- }
- /**
-  * Returns the total number of documents in this segment (including deleted documents).
-  *
-  * @return integer  document count the segment was constructed with
-  */
- public function count()
- {
-     $total = $this->_docCount;
-
-     return $total;
- }
- /**
-  * Returns number of deleted documents.
-  *
-  * @return integer  0 when no deletions are loaded
-  */
- private function _deletedCount()
- {
-     if ($this->_deleted === null) {
-         return 0;
-     }
-
-     // With the bitset extension the deletions are a bitset and have to be
-     // converted to an array before counting
-     $deletedDocs = extension_loaded('bitset')
-                  ? bitset_to_array($this->_deleted)
-                  : $this->_deleted;
-
-     return count($deletedDocs);
- }
- /**
-  * Returns the total number of non-deleted documents in this segment.
-  *
-  * @return integer
-  */
- public function numDocs()
- {
-     $total = $this->_docCount;
-
-     if (!$this->hasDeletions()) {
-         return $total;
-     }
-
-     return $total - $this->_deletedCount();
- }
- /**
-  * Get field position in a fields dictionary
-  *
-  * Field numbers missing from the translation table are returned unchanged.
-  *
-  * @param integer $fieldNum
-  * @return integer
-  */
- private function _getFieldPosition($fieldNum) {
-     if (isset($this->_fieldsDicPositions[$fieldNum])) {
-         return $this->_fieldsDicPositions[$fieldNum];
-     }
-
-     // Treat values which are not in a translation table as a 'direct value'
-     return $fieldNum;
- }
- /**
-  * Return segment name
-  *
-  * @return string  name the segment was constructed with
-  */
- public function getName()
- {
-     $segmentName = $this->_name;
-
-     return $segmentName;
- }
- /**
- * TermInfo cache
- *
- * Size is 1024.
- * Numbers are used instead of class constants because of performance considerations
- *
- * @var array
- */
- private $_termInfoCache = array();
- /**
-  * Shrinks the TermInfo cache.
-  *
-  * Entries are removed in array order (oldest insertion first) until exactly
-  * 768 entries are left.
-  */
- private function _cleanUpTermInfoCache()
- {
-     // Clean 256 term infos
-     foreach (array_keys($this->_termInfoCache) as $cacheKey) {
-         unset($this->_termInfoCache[$cacheKey]);
-
-         // leave 768 last used term infos
-         if (count($this->_termInfoCache) == 768) {
-             break;
-         }
-     }
- }
- /**
-  * Load terms dictionary index
-  *
-  * Uses the serialized copy in '<segment name>.sti' when it exists and can be
-  * unserialized; otherwise parses the '.tii' data and writes a fresh '.sti' file.
-  *
-  * @throws Zend_Search_Lucene_Exception
-  */
- private function _loadDictionaryIndex()
- {
-     $stiFileName = $this->_name . '.sti';
-
-     // Check, if index is already serialized
-     if ($this->_directory->fileExists($stiFileName)) {
-         // Load serialized dictionary index data
-         $stiFile     = $this->_directory->getFileObject($stiFileName);
-         $stiFileData = $stiFile->readBytes($this->_directory->fileLength($stiFileName));
-
-         // NOTE(review): unserialize() is applied to raw file contents;
-         // confirm the index directory is never writable by untrusted parties
-         $unserializedData = @unserialize($stiFileData);
-         if ($unserializedData !== false) {
-             list($this->_termDictionary, $this->_termDictionaryInfos) = $unserializedData;
-             return;
-         }
-     }
-
-     // Load data from .tii file and generate .sti file
-
-     // Prefetch dictionary index data
-     $tiiFile     = $this->openCompoundFile('.tii');
-     $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));
-
-     /** Zend_Search_Lucene_Index_DictionaryLoader */
-     require_once 'Zend/Search/Lucene/Index/DictionaryLoader.php';
-
-     // Load dictionary index data
-     $loadedIndex = Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData);
-     list($this->_termDictionary, $this->_termDictionaryInfos) = $loadedIndex;
-
-     // Store a serialized copy for subsequent loads
-     $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
-     $stiFile     = $this->_directory->createFile($stiFileName);
-     $stiFile->writeBytes($stiFileData);
- }
- /**
-  * Scans terms dictionary and returns term info
-  *
-  * Lookup order:
-  *  1. the TermInfo cache (a hit is moved to the end of the cache);
-  *  2. binary search in the in-memory dictionary index (loaded on first use);
-  *  3. sequential scan of the '.tis' file starting from the closest preceding
-  *     index entry.
-  *
-  * Returns null if the term's field is unknown to the segment or the term is
-  * not present in the dictionary.
-  *
-  * @param Zend_Search_Lucene_Index_Term $term
-  * @return Zend_Search_Lucene_Index_TermInfo|null
-  * @throws Zend_Search_Lucene_Exception
-  */
- public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
- {
-     $termKey = $term->key();
-     if (isset($this->_termInfoCache[$termKey])) {
-         $termInfo = $this->_termInfoCache[$termKey];
-
-         // Move termInfo to the end of cache
-         unset($this->_termInfoCache[$termKey]);
-         $this->_termInfoCache[$termKey] = $termInfo;
-
-         return $termInfo;
-     }
-
-     if ($this->_termDictionary === null) {
-         $this->_loadDictionaryIndex();
-     }
-
-     $searchField = $this->getFieldNum($term->field);
-
-     if ($searchField == -1) {
-         return null;
-     }
-     $searchDicField = $this->_getFieldPosition($searchField);
-
-     // search for appropriate value in dictionary
-     $lowIndex  = 0;
-     $highIndex = count($this->_termDictionary)-1;
-     while ($highIndex >= $lowIndex) {
-         // $mid = ($highIndex - $lowIndex)/2;
-         $mid = ($highIndex + $lowIndex) >> 1;
-         $midTerm = $this->_termDictionary[$mid];
-
-         $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
-         $delta = $searchDicField - $fieldNum;
-         if ($delta == 0) {
-             $delta = strcmp($term->text, $midTerm[1] /* text */);
-         }
-
-         if ($delta < 0) {
-             $highIndex = $mid-1;
-         } elseif ($delta > 0) {
-             $lowIndex  = $mid+1;
-         } else {
-             // return $this->_termDictionaryInfos[$mid]; // We got it!
-             $a = $this->_termDictionaryInfos[$mid];
-             $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);
-
-             // Put loaded termInfo into cache
-             $this->_termInfoCache[$termKey] = $termInfo;
-             // This path also grows the cache, so it has to enforce the size limit too
-             if (count($this->_termInfoCache) >= 1024) {
-                 $this->_cleanUpTermInfoCache();
-             }
-
-             return $termInfo;
-         }
-     }
-
-     if ($highIndex == -1) {
-         // Term is out of the dictionary range
-         return null;
-     }
-
-     // $highIndex now points at the closest index entry preceding the term
-     $prevPosition = $highIndex;
-     $prevTerm     = $this->_termDictionary[$prevPosition];
-     $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];
-
-     $tisFile = $this->openCompoundFile('.tis');
-     $tiVersion = $tisFile->readInt();
-     if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */  &&
-         $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
-         require_once 'Zend/Search/Lucene/Exception.php';
-         throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
-     }
-
-     $termCount     = $tisFile->readLong();
-     $indexInterval = $tisFile->readInt();
-     $skipInterval  = $tisFile->readInt();
-     if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
-         // Value is not used here, but it is part of the header and must be consumed
-         $maxSkipLevels = $tisFile->readInt();
-     }
-
-     // The header has just been read, so the relative seek below lands on
-     // indexPointer counted from the start of the '.tis' data
-     $tisFile->seek($prevTermInfo[4] /* indexPointer */ - (($tiVersion == (int)0xFFFFFFFD)? 24 : 20) /* header size*/, SEEK_CUR);
-
-     $termValue    = $prevTerm[1] /* text */;
-     $termFieldNum = $prevTerm[0] /* field */;
-     $freqPointer  = $prevTermInfo[1] /* freqPointer */;
-     $proxPointer  = $prevTermInfo[2] /* proxPointer */;
-     // Defined up front so they exist even if the scan loop never executes
-     $docFreq      = 0;
-     $skipOffset   = 0;
-     for ($count = $prevPosition*$indexInterval + 1;
-          $count <= $termCount &&
-          ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
-           ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
-            strcmp($termValue, $term->text) < 0) );
-          $count++) {
-         $termPrefixLength = $tisFile->readVInt();
-         $termSuffix       = $tisFile->readString();
-         $termFieldNum     = $tisFile->readVInt();
-         $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;
-
-         $docFreq      = $tisFile->readVInt();
-         $freqPointer += $tisFile->readVInt();
-         $proxPointer += $tisFile->readVInt();
-         if( $docFreq >= $skipInterval ) {
-             $skipOffset = $tisFile->readVInt();
-         } else {
-             $skipOffset = 0;
-         }
-     }
-
-     // Compare term texts with strcmp(): a loose '==' treats numeric strings
-     // as numbers (e.g. "0123" == "123") and could return another term's info
-     if ($termFieldNum == $searchField && strcmp($termValue, $term->text) == 0) {
-         $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
-     } else {
-         $termInfo = null;
-     }
-
-     // Put loaded termInfo into cache
-     $this->_termInfoCache[$termKey] = $termInfo;
-
-     // '>=' rather than '==': once the size passes 1024 an exact-equality
-     // test would never fire again and the cache would grow without bound
-     if (count($this->_termInfoCache) >= 1024) {
-         $this->_cleanUpTermInfoCache();
-     }
-
-     return $termInfo;
- }
- /**
- * Collects the IDs of all documents in this segment that contain the given term.
- *
- * Every returned ID is the segment-local document number increased by $shift.
- * If a documents filter is passed, only documents allowed by the filter entry
- * for this segment are returned (all documents when no entry exists yet), and
- * that entry is then replaced by the set of segment-local IDs that were returned.
- *
- * @param Zend_Search_Lucene_Index_Term $term
- * @param integer $shift
- * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
- * @throws Zend_Search_Lucene_Exception if $docsFilter is neither null nor a DocsFilter object
- * @return array
- */
- public function termDocs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
- {
-     $info = $this->getTermInfo($term);
-     if (!$info instanceof Zend_Search_Lucene_Index_TermInfo) {
-         // The term is unknown to this segment, so nothing can match here
-         if ($docsFilter !== null && $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
-             $docsFilter->segmentFilters[$this->_name] = array();
-         }
-         return array();
-     }
-     $postings = $this->openCompoundFile('.frq');
-     $postings->seek($info->freqPointer,SEEK_CUR);
-     $useFilter = ($docsFilter !== null);
-     if ($useFilter && !$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
-         require_once 'Zend/Search/Lucene/Exception.php';
-         throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
-     }
-     // "Narrowing" means the filter already carries data for this segment
-     $narrowing = $useFilter && isset($docsFilter->segmentFilters[$this->_name]);
-     if ($narrowing) {
-         // Short name for the segment filter (avoids repeated dereferencing)
-         $allowed = &$docsFilter->segmentFilters[$this->_name];
-         if (count($allowed) == 0) {
-             // An empty filter can not match anything
-             return array();
-         }
-     }
-     $hits = array();
-     $kept = array();
-     $docId = 0;
-     $remaining = $info->docFreq;
-     while ($remaining-- > 0) {
-         $delta = $postings->readVInt();
-         if ($delta % 2 == 1) {
-             // Odd delta: no separate frequency value follows
-             $docId += ($delta-1)/2;
-         } else {
-             $docId += $delta/2;
-             // Even delta: a frequency value follows, consume and discard it
-             $postings->readVInt();
-         }
-         if ($narrowing && !isset($allowed[$docId])) {
-             continue;
-         }
-         $hits[] = $shift + $docId;
-         $kept[$docId] = 1; // the value is irrelevant, only the key matters
-     }
-     if ($useFilter) {
-         $docsFilter->segmentFilters[$this->_name] = $kept;
-     }
-     return $hits;
- }
- /**
- * Returns term frequencies for the documents of this segment that contain the term.
- * Result array structure: array(docId => freq, ...)
- *
- * Keys are segment-local document numbers increased by $shift.
- * If a documents filter is passed, only documents allowed by the filter entry
- * for this segment are reported (all documents when no entry exists yet), and
- * that entry is then replaced by the set of segment-local IDs that were reported.
- *
- * @param Zend_Search_Lucene_Index_Term $term
- * @param integer $shift
- * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
- * @throws Zend_Search_Lucene_Exception if $docsFilter is neither null nor a DocsFilter object
- * @return array
- */
- public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
- {
-     $termInfo = $this->getTermInfo($term);
-     if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
-         // The term is unknown to this segment, so nothing can match here
-         if ($docsFilter !== null && $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
-             $docsFilter->segmentFilters[$this->_name] = array();
-         }
-         return array();
-     }
-     $frqFile = $this->openCompoundFile('.frq');
-     $frqFile->seek($termInfo->freqPointer,SEEK_CUR);
-     $useFilter = ($docsFilter !== null);
-     if ($useFilter && !$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
-         require_once 'Zend/Search/Lucene/Exception.php';
-         throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
-     }
-     // "Narrowing" means the filter already carries data for this segment
-     $narrowing = $useFilter && isset($docsFilter->segmentFilters[$this->_name]);
-     if ($narrowing) {
-         // Short name for the segment filter (avoids repeated dereferencing)
-         $filter = &$docsFilter->segmentFilters[$this->_name];
-         if (count($filter) == 0) {
-             // An empty filter can not match anything
-             return array();
-         }
-     }
-     $result = array();
-     $filterData = array();
-     $docId = 0;
-     for ($count = 0; $count < $termInfo->docFreq; $count++) {
-         $docDelta = $frqFile->readVInt();
-         if ($docDelta % 2 == 1) {
-             // Odd delta: frequency is 1 and is not stored separately
-             $docId += ($docDelta-1)/2;
-             $freq = 1;
-         } else {
-             // Even delta: the frequency follows. It must ALWAYS be consumed,
-             // even when the document is rejected by the filter, otherwise the
-             // next read would interpret the frequency as a document delta.
-             $docId += $docDelta/2;
-             $freq = $frqFile->readVInt();
-         }
-         if ($narrowing && !isset($filter[$docId])) {
-             continue;
-         }
-         $result[$shift + $docId] = $freq;
-         $filterData[$docId] = 1; // the value is irrelevant, only the key matters
-     }
-     if ($useFilter) {
-         $docsFilter->segmentFilters[$this->_name] = $filterData;
-     }
-     return $result;
- }
- /**
- * Returns term positions for the documents of this segment that contain the term.
- * Result array structure: array(docId => array(pos1, pos2, ...), ...)
- *
- * Keys are segment-local document numbers increased by $shift.
- * If a documents filter is passed, only documents allowed by the filter entry
- * for this segment are reported (all documents when no entry exists yet), and
- * that entry is then replaced by the set of segment-local IDs that were reported.
- *
- * @param Zend_Search_Lucene_Index_Term $term
- * @param integer $shift
- * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
- * @throws Zend_Search_Lucene_Exception if $docsFilter is neither null nor a DocsFilter object
- * @return array
- */
- public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
- {
-     $info = $this->getTermInfo($term);
-     if (!$info instanceof Zend_Search_Lucene_Index_TermInfo) {
-         // The term is unknown to this segment, so nothing can match here
-         if ($docsFilter !== null && $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
-             $docsFilter->segmentFilters[$this->_name] = array();
-         }
-         return array();
-     }
-     $frq = $this->openCompoundFile('.frq');
-     $frq->seek($info->freqPointer,SEEK_CUR);
-     $useFilter = ($docsFilter !== null);
-     if ($useFilter && !$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
-         require_once 'Zend/Search/Lucene/Exception.php';
-         throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
-     }
-     // "Narrowing" means the filter already carries data for this segment
-     $narrowing = $useFilter && isset($docsFilter->segmentFilters[$this->_name]);
-     if ($narrowing) {
-         // Short name for the segment filter (avoids repeated dereferencing)
-         $allowed = &$docsFilter->segmentFilters[$this->_name];
-         if (count($allowed) == 0) {
-             // An empty filter can not match anything
-             return array();
-         }
-     }
-     // Pass 1: collect the term frequency of every document from the .frq data
-     $freqByDoc = array();
-     $docId = 0;
-     $remaining = $info->docFreq;
-     while ($remaining-- > 0) {
-         $delta = $frq->readVInt();
-         if ($delta % 2 == 1) {
-             $docId += ($delta-1)/2;
-             $freqByDoc[$docId] = 1;
-         } else {
-             $docId += $delta/2;
-             $freqByDoc[$docId] = $frq->readVInt();
-         }
-     }
-     // Pass 2: read the positions of every document from the .prx data
-     $prx = $this->openCompoundFile('.prx');
-     $prx->seek($info->proxPointer, SEEK_CUR);
-     $result = array();
-     $kept = array();
-     foreach ($freqByDoc as $docId => $freq) {
-         // Positions have to be read even for documents rejected by the filter,
-         // otherwise the file pointer would be wrong for the next document
-         $position = 0;
-         $positions = array();
-         for ($n = 0; $n < $freq; $n++) {
-             $position += $prx->readVInt();
-             $positions[] = $position;
-         }
-         if ($narrowing && !isset($allowed[$docId])) {
-             continue;
-         }
-         $kept[$docId] = 1; // the value is irrelevant, only the key matters
-         $result[$shift + $docId] = $positions;
-     }
-     if ($useFilter) {
-         $docsFilter->segmentFilters[$this->_name] = $kept;
-     }
-     return $result;
- }
- /**
- * Reads normalization factors from the index into $this->_norms.
- *
- * With a single norms file ('.nrm') the norms of all indexed fields are loaded
- * at once; otherwise only the '.f<fieldNum>' file of the requested field is read.
- *
- * @param integer $fieldNum
- * @throws Zend_Search_Lucene_Exception
- */
- private function _loadNorm($fieldNum)
- {
-     if (!$this->_hasSingleNormFile) {
-         // One norms file per field
-         $perFieldFile = $this->openCompoundFile('.f' . $fieldNum);
-         $this->_norms[$fieldNum] = $perFieldFile->readBytes($this->_docCount);
-         return;
-     }
-     $nrmFile = $this->openCompoundFile('.nrm');
-     $signature = $nrmFile->readBytes(3);
-     $formatVersion = $nrmFile->readByte();
-     $headerIsValid = ($signature == 'NRM' && $formatVersion == (int)0xFF);
-     if (!$headerIsValid) {
-         require_once 'Zend/Search/Lucene/Exception.php';
-         throw new Zend_Search_Lucene_Exception('Wrong norms file format.');
-     }
-     // One byte per document is read for each indexed field, in field order
-     foreach ($this->_fields as $num => $info) {
-         if (!$info->isIndexed) {
-             continue;
-         }
-         $this->_norms[$num] = $nrmFile->readBytes($this->_docCount);
-     }
- }
- /**
- * Returns normalization factor for the specified document and field
- *
- * @param integer $id
- * @param string $fieldName
- * @return float|null null if the field is unknown to this segment or is not indexed
- */
- public function norm($id, $fieldName)
- {
-     $fieldNum = $this->getFieldNum($fieldName);
-     // getFieldNum() reports -1 for a field that is not present in this segment;
-     // guard it (as normVector() does) instead of indexing $this->_fields[-1]
-     if ($fieldNum == -1 || !($this->_fields[$fieldNum]->isIndexed)) {
-         return null;
-     }
-     if (!isset($this->_norms[$fieldNum])) {
-         // Norms are loaded lazily on first access
-         $this->_loadNorm($fieldNum);
-     }
-     // Norms are kept as a byte string with one encoded byte per document
-     return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) );
- }
- /**
- * Returns the norm vector of a field, encoded as a byte string
- * (one byte per document of the segment).
- *
- * For a field that is unknown to this segment or is not indexed, a vector
- * filled with the default similarity's encoded norm for length 0 is returned.
- *
- * @param string $fieldName
- * @return string
- */
- public function normVector($fieldName)
- {
-     $fieldNum = $this->getFieldNum($fieldName);
-     $hasStoredNorms = ($fieldNum != -1) && $this->_fields[$fieldNum]->isIndexed;
-     if (!$hasStoredNorms) {
-         $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();
-         $defaultNormByte = chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) ));
-         return str_repeat($defaultNormByte, $this->_docCount);
-     }
-     if (!isset($this->_norms[$fieldNum])) {
-         // Norms are loaded lazily on first access
-         $this->_loadNorm($fieldNum);
-     }
-     return $this->_norms[$fieldNum];
- }
- /**
- * Tells whether this segment has a deletions set
- * (i.e. $this->_deleted is not null).
- *
- * @return boolean
- */
- public function hasDeletions()
- {
-     $noDeletions = ($this->_deleted === null);
-     return !$noDeletions;
- }
- /**
- * Tells whether the segment keeps norms in a single norms file.
- *
- * @return boolean
- */
- public function hasSingleNormFile()
- {
-     // Normalize the internal flag to a strict boolean
-     return (bool) $this->_hasSingleNormFile;
- }
- /**
- * Tells whether the segment is stored using a compound segment file.
- *
- * @return boolean
- */
- public function isCompound()
- {
-     $compound = $this->_isCompound;
-     return $compound;
- }
- /**
- * Marks a document of this segment as deleted.
- * $id is an internal (segment-local) document id.
- *
- * The change is only recorded in memory and flagged as dirty here.
- *
- * @param integer $id
- */
- public function delete($id)
- {
-     $this->_deletedDirty = true;
-     if (!extension_loaded('bitset')) {
-         // Plain PHP fallback: deleted ids are the keys of an array
-         if ($this->_deleted === null) {
-             $this->_deleted = array();
-         }
-         $this->_deleted[$id] = 1;
-         return;
-     }
-     // bitset extension: deleted ids are kept as a bit set
-     if ($this->_deleted === null) {
-         $this->_deleted = bitset_empty($id);
-     }
-     bitset_incl($this->_deleted, $id);
- }
- /**
- * Checks whether a document is marked as deleted
- *
- * @param integer $id
- * @return boolean
- */
- public function isDeleted($id)
- {
-     if ($this->_deleted === null) {
-         // No deletions set at all
-         return false;
-     }
-     // The storage format depends on availability of the bitset extension
-     return extension_loaded('bitset')
-         ? bitset_in($this->_deleted, $id)
-         : isset($this->_deleted[$id]);
- }
- /**
- * Detect latest delete generation
- *
- * Is actually used from writeChanges() method or from the constructor if it's invoked from
- * Index writer. In both cases index write lock is already obtained, so we shouldn't care
- * about it
- *
- * Scans the directory for '<segment_name>.del' (generation 0) and
- * '<segment_name>_<base36 generation>.del' files.
- *
- * @return integer highest generation found, or -1 if the segment has no deletions file
- */
- private function _detectLatestDelGen()
- {
-     $plainDelFile = $this->_name . '.del';
-     // The segment name is quoted so that it is always matched literally
-     $generationPattern = '/^' . preg_quote($this->_name, '/') . '_([a-zA-Z0-9]+)\.del$/i';
-     // -1 means "no deletions file for this segment in the directory"
-     $latestGen = -1;
-     foreach ($this->_directory->fileList() as $file) {
-         if ($file == $plainDelFile) {
-             // Matches <segment_name>.del file name
-             $gen = 0;
-         } else if (preg_match($generationPattern, $file, $matches)) {
-             // Matches <segment_name>_NNN.del file names, NNN is a base 36 number
-             $gen = (int)base_convert($matches[1], 36, 10);
-         } else {
-             continue;
-         }
-         if ($gen > $latestGen) {
-             $latestGen = $gen;
-         }
-     }
-     return $latestGen;
- }
- /**
- * Write changes if it's necessary.
- *
- * This method must be invoked only from the Writer _updateSegments() method,
- * so index Write lock has to be already obtained.
- *
- * Workflow:
- * - if this process made no deletions, the deletions data is reloaded when a newer
- * deletions file generation is found in the directory, and nothing is written;
- * - otherwise in-memory deletions are merged with the latest deletions file (if it is
- * newer than the loaded one) and written into a new '<segment_name>_<generation>.del'
- * file, where generation is a base 36 number.
- *
- * @internal
- * @throws Zend_Search_Lucene_Exception
- */
- public function writeChanges()
- {
- // Find the latest deletions file generation present in the directory
- $latestDelGen = $this->_detectLatestDelGen();
- if (!$this->_deletedDirty) {
- // There were no deletions made by the current process
- if ($latestDelGen == $this->_delGen) {
- // Delete file hasn't been updated by any concurrent process
- return;
- } else if ($latestDelGen > $this->_delGen) {
- // Delete file has been updated by some concurrent process
- // Reload deletions file
- $this->_delGen = $latestDelGen;
- $this->_deleted = $this->_loadDelFile();
- return;
- } else {
- require_once 'Zend/Search/Lucene/Exception.php';
- throw new Zend_Search_Lucene_Exception('Delete file processing workflow is corrupted for the segment \'' . $this->_name . '\'.');
- }
- }
- if ($latestDelGen > $this->_delGen) {
- // Merge current deletions with latest deletions file
- $this->_delGen = $latestDelGen;
- $latestDelete = $this->_loadDelFile();
- if (extension_loaded('bitset')) {
- $this->_deleted = bitset_union($this->_deleted, $latestDelete);
- } else {
- $this->_deleted += $latestDelete;
- }
- }
- if (extension_loaded('bitset')) {
- $delBytes = $this->_deleted; // the bit set is written as is
- $bitCount = count(bitset_to_array($delBytes)); // number of deleted documents
- } else {
- $byteCount = floor($this->_docCount/8)+1; // one bit per document
- $delBytes = str_repeat(chr(0), $byteCount);
- for ($count = 0; $count < $byteCount; $count++) {
- $byte = 0;
- for ($bit = 0; $bit < 8; $bit++) {
- if (isset($this->_deleted[$count*8 + $bit])) {
- $byte |= (1<<$bit); // lowest bit of a byte stands for the lowest document id
- }
- }
- $delBytes[$count] = chr($byte);
- }
- $bitCount = count($this->_deleted); // number of deleted documents
- }
- if ($this->_delGen == -1) {
- // Set delete file generation number to 1
- $this->_delGen = 1;
- } else {
- // Increase delete file generation number by 1
- $this->_delGen++;
- }
- $delFile = $this->_directory->createFile($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del');
- $delFile->writeInt($this->_docCount);
- $delFile->writeInt($bitCount);
- $delFile->writeBytes($delBytes);
- $this->_deletedDirty = false; // in-memory deletions are persisted now
- }
- /**
- * Term Dictionary File object for stream-like terms reading
- *
- * Is null while the terms stream is closed.
- *
- * @var Zend_Search_Lucene_Storage_File
- */
- private $_tisFile = null;
- /**
- * Actual offset of the .tis file data
- *
- * Is taken from the file position right after the .tis file is opened
- * by resetTermsStream().
- *
- * @var integer
- */
- private $_tisFileOffset;
- /**
- * Frequencies File object for stream-like terms reading
- *
- * @var Zend_Search_Lucene_Storage_File
- */
- private $_frqFile = null;
- /**
- * Actual offset of the .frq file data
- *
- * @var integer
- */
- private $_frqFileOffset;
- /**
- * Positions File object for stream-like terms reading
- *
- * @var Zend_Search_Lucene_Storage_File
- */
- private $_prxFile = null;
- /**
- * Actual offset of the .prx file in the compound file
- *
- * @var integer
- */
- private $_prxFileOffset;
- /**
- * Number of terms which are still left in the terms stream
- *
- * Is decreased by each successful nextTerm() call.
- *
- * @var integer
- */
- private $_termCount = 0;
- /**
- * Overall number of terms in the terms stream
- *
- * @var integer
- */
- private $_termNum = 0;
- /**
- * Segment index interval
- *
- * @var integer
- */
- private $_indexInterval;
- /**
- * Segment skip interval
- *
- * @var integer
- */
- private $_skipInterval;
- /**
- * Last TermInfo in a terms stream
- *
- * @var Zend_Search_Lucene_Index_TermInfo
- */
- private $_lastTermInfo = null;
- /**
- * Last Term in a terms stream
- *
- * @var Zend_Search_Lucene_Index_Term
- */
- private $_lastTerm = null;
- /**
- * Map of the document IDs
- * Used to get new docID after removing deleted documents.
- * It's not very efficient from the memory usage point of view,
- * but it's much faster than other methods
- *
- * @var array|null
- */
- private $_docMap = null;
- /**
- * An array of all term positions in the documents.
- * Array structure: array( docId => array( pos1, pos2, ...), ...)
- *
- * Is set to null if term positions loading has to be skipped
- *
- * @var array|null
- */
- private $_lastTermPositions;
- /**
- * Terms scan mode
- *
- * Values:
- *
- * self::SM_TERMS_ONLY - terms are scanned, no additional info is retrieved
- * self::SM_FULL_INFO - terms are scanned, frequency and position info is retrieved
- * self::SM_MERGE_INFO - terms are scanned, frequency and position info is retrieved
- * document numbers are compacted (shifted if segment has deleted documents)
- *
- * @var integer
- */
- private $_termsScanMode;
- /** Scan modes */
- const SM_TERMS_ONLY = 0; // terms are scanned, no additional info is retrieved
- const SM_FULL_INFO = 1; // terms are scanned, frequency and position info is retrieved
- const SM_MERGE_INFO = 2; // terms are scanned, frequency and position info is retrieved
- // document numbers are compacted (shifted if segment contains deleted documents)
- /**
- * Reset terms stream
- *
- * $startId - id for the first document of this segment
- * $mode - terms scan mode (one of the self::SM_* constants); with
- * self::SM_MERGE_INFO deleted documents are removed from the numbering
- *
- * Returns start document id for the next segment
- *
- * @param integer $startId
- * @param integer $mode
- * @throws Zend_Search_Lucene_Exception
- * @return integer
- */
- public function resetTermsStream(/** $startId = 0, $mode = self::SM_TERMS_ONLY */)
- {
-     // The method really accepts two optional parameters, which are fetched manually:
-     //   $startId (default value is 0)
-     //   $mode    (default value is self::SM_TERMS_ONLY)
-     $args = func_get_args();
-     $argc = count($args);
-     if ($argc > 2) {
-         require_once 'Zend/Search/Lucene/Exception.php';
-         throw new Zend_Search_Lucene_Exception('Wrong number of arguments');
-     }
-     $startId = ($argc >= 1) ? $args[0] : 0;
-     $mode = ($argc == 2) ? $args[1] : self::SM_TERMS_ONLY;
-     // Release the previous dictionary file object (if any) before reopening
-     $this->_tisFile = null;
-     $this->_tisFile = $this->openCompoundFile('.tis', false);
-     $this->_tisFileOffset = $this->_tisFile->tell();
-     $formatVersion = $this->_tisFile->readInt();
-     $isPre21Format = ($formatVersion == (int)0xFFFFFFFE);
-     $is21Format = ($formatVersion == (int)0xFFFFFFFD);
-     if (!$isPre21Format && !$is21Format) {
-         require_once 'Zend/Search/Lucene/Exception.php';
-         throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
-     }
-     $termsTotal = $this->_tisFile->readLong(); // Read terms count
-     $this->_termNum = $termsTotal;
-     $this->_termCount = $termsTotal;
-     $this->_indexInterval = $this->_tisFile->readInt(); // Read index interval
-     $this->_skipInterval = $this->_tisFile->readInt(); // Read skip interval
-     if ($is21Format) {
-         // The 2.1+ header carries one more integer (max skip levels); it is read and ignored
-         $this->_tisFile->readInt();
-     }
-     $this->_frqFile = null;
-     $this->_prxFile = null;
-     $this->_docMap = array();
-     $this->_lastTerm = new Zend_Search_Lucene_Index_Term('', -1);
-     $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);
-     $this->_lastTermPositions = null;
-     $this->_termsScanMode = $mode;
-     $compact = ($mode == self::SM_MERGE_INFO);
-     if ($mode == self::SM_TERMS_ONLY) {
-         // Nothing else has to be prepared
-     } else if ($mode == self::SM_FULL_INFO || $compact) {
-         $this->_frqFile = $this->openCompoundFile('.frq', false);
-         $this->_frqFileOffset = $this->_frqFile->tell();
-         $this->_prxFile = $this->openCompoundFile('.prx', false);
-         $this->_prxFileOffset = $this->_prxFile->tell();
-         // Map every non-deleted document to its id in the resulting numbering
-         $liveDocs = 0;
-         for ($docNum = 0; $docNum < $this->_docCount; $docNum++) {
-             if ($this->isDeleted($docNum)) {
-                 continue;
-             }
-             $this->_docMap[$docNum] = $startId + ($compact ? $liveDocs : $docNum);
-             $liveDocs++;
-         }
-     } else {
-         require_once 'Zend/Search/Lucene/Exception.php';
-         throw new Zend_Search_Lucene_Exception('Wrong terms scaning mode specified.');
-     }
-     // Calculate next segment start id now (the $this->_docMap structure may be cleaned by $this->nextTerm() call)
-     $nextSegmentStartId = $startId + ($compact ? count($this->_docMap) : $this->_docCount);
-     $this->nextTerm();
-     return $nextSegmentStartId;
- }
- /**
- * Skip terms stream up to the specified term prefix.
- *
- * Prefix contains fully specified field info and portion of searched term
- *
- * The dictionary index is binary-searched for the closest entry which is not
- * greater than the prefix, the stream is repositioned to that entry and then
- * read forward until the current term is equal to or greater than the prefix.
- * If the field is not present in this segment, or the prefix is out of the
- * dictionary range, the stream is closed (currentTerm() gives null).
- *
- * @param Zend_Search_Lucene_Index_Term $prefix
- * @throws Zend_Search_Lucene_Exception
- */
- public function skipTo(Zend_Search_Lucene_Index_Term $prefix)
- {
-     if ($this->_termDictionary === null) {
-         $this->_loadDictionaryIndex();
-     }
-     $searchField = $this->getFieldNum($prefix->field);
-     if ($searchField == -1) {
-         // Field is not present in this segment
-         // Go to the end of dictionary
-         $this->_tisFile = null;
-         $this->_frqFile = null;
-         $this->_prxFile = null;
-         $this->_lastTerm = null;
-         $this->_lastTermInfo = null;
-         $this->_lastTermPositions = null;
-         return;
-     }
-     $searchDicField = $this->_getFieldPosition($searchField);
-     // search for appropriate value in dictionary
-     $lowIndex = 0;
-     $highIndex = count($this->_termDictionary)-1;
-     while ($highIndex >= $lowIndex) {
-         $mid = ($highIndex + $lowIndex) >> 1;
-         $midTerm = $this->_termDictionary[$mid];
-         $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
-         // Entries are compared by field position first, then by term text
-         $delta = $searchDicField - $fieldNum;
-         if ($delta == 0) {
-             $delta = strcmp($prefix->text, $midTerm[1] /* text */);
-         }
-         if ($delta < 0) {
-             $highIndex = $mid-1;
-         } elseif ($delta > 0) {
-             $lowIndex = $mid+1;
-         } else {
-             // We have reached term we are looking for.
-             // Anchor the stream at this very entry: without it $highIndex would
-             // keep a stale upper bound, which may point to a later index entry
-             // and make the stream jump over the requested term.
-             $highIndex = $mid;
-             break;
-         }
-     }
-     if ($highIndex == -1) {
-         // Term is out of the dictionary range
-         $this->_tisFile = null;
-         $this->_frqFile = null;
-         $this->_prxFile = null;
-         $this->_lastTerm = null;
-         $this->_lastTermInfo = null;
-         $this->_lastTermPositions = null;
-         return;
-     }
-     $prevPosition = $highIndex;
-     $prevTerm = $this->_termDictionary[$prevPosition];
-     $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];
-     if ($this->_tisFile === null) {
-         // The end of terms stream is reached and terms dictionary file is closed
-         // Perform mini-reset operation
-         $this->_tisFile = $this->openCompoundFile('.tis', false);
-         if ($this->_termsScanMode == self::SM_FULL_INFO || $this->_termsScanMode == self::SM_MERGE_INFO) {
-             $this->_frqFile = $this->openCompoundFile('.frq', false);
-             $this->_prxFile = $this->openCompoundFile('.prx', false);
-         }
-     }
-     // Reposition the stream to the index entry and restore the reading state from it
-     $this->_tisFile->seek($this->_tisFileOffset + $prevTermInfo[4], SEEK_SET);
-     $this->_lastTerm = new Zend_Search_Lucene_Index_Term($prevTerm[1] /* text */,
-         ($prevTerm[0] == -1) ? '' : $this->_fields[$prevTerm[0] /* field */]->name);
-     $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($prevTermInfo[0] /* docFreq */,
-         $prevTermInfo[1] /* freqPointer */,
-         $prevTermInfo[2] /* proxPointer */,
-         $prevTermInfo[3] /* skipOffset */);
-     $this->_termCount = $this->_termNum - $prevPosition*$this->_indexInterval;
-     if ($highIndex == 0) {
-         // skip start entry
-         $this->nextTerm();
-     } else if ($prefix->field == $this->_lastTerm->field && $prefix->text == $this->_lastTerm->text) {
-         // We got exact match in the dictionary index
-         if ($this->_termsScanMode == self::SM_FULL_INFO || $this->_termsScanMode == self::SM_MERGE_INFO) {
-             $this->_lastTermPositions = array();
-             // Load frequencies of the matched term
-             $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
-             $freqs = array(); $docId = 0;
-             for( $count = 0; $count < $this->_lastTermInfo->docFreq; $count++ ) {
-                 $docDelta = $this->_frqFile->readVInt();
-                 if( $docDelta % 2 == 1 ) {
-                     $docId += ($docDelta-1)/2;
-                     $freqs[ $docId ] = 1;
-                 } else {
-                     $docId += $docDelta/2;
-                     $freqs[ $docId ] = $this->_frqFile->readVInt();
-                 }
-             }
-             // Load positions of the matched term
-             $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
-             foreach ($freqs as $docId => $freq) {
-                 $termPosition = 0; $positions = array();
-                 for ($count = 0; $count < $freq; $count++ ) {
-                     $termPosition += $this->_prxFile->readVInt();
-                     $positions[] = $termPosition;
-                 }
-                 // Documents without a map entry are left out
-                 if (isset($this->_docMap[$docId])) {
-                     $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
-                 }
-             }
-         }
-         return;
-     }
-     // Search term matching specified prefix
-     while ($this->_lastTerm !== null) {
-         if ( strcmp($this->_lastTerm->field, $prefix->field) > 0 ||
-             ($prefix->field == $this->_lastTerm->field && strcmp($this->_lastTerm->text, $prefix->text) >= 0) ) {
-             // Current term matches or is greater than the pattern
-             return;
-         }
-         $this->nextTerm();
-     }
- }
- /**
- * Reads the next term from the terms dictionary stream and makes it current.
- *
- * In the SM_FULL_INFO and SM_MERGE_INFO scan modes the positions of the term are
- * loaded as well. When the stream is exhausted, all stream resources are released.
- *
- * @return Zend_Search_Lucene_Index_Term|null null if there are no more terms
- */
- public function nextTerm()
- {
-     $streamExhausted = ($this->_tisFile === null || $this->_termCount == 0);
-     if ($streamExhausted) {
-         $this->_lastTerm = null;
-         $this->_lastTermInfo = null;
-         $this->_lastTermPositions = null;
-         $this->_docMap = null;
-         // may be necessary for "empty" segment
-         $this->_tisFile = null;
-         $this->_frqFile = null;
-         $this->_prxFile = null;
-         return null;
-     }
-     // A term is stored as: length of the prefix shared with the previous term,
-     // the remaining suffix and the field number
-     $sharedPrefixLength = $this->_tisFile->readVInt();
-     $suffix = $this->_tisFile->readString();
-     $fieldNum = $this->_tisFile->readVInt();
-     $text = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $sharedPrefixLength) . $suffix;
-     $this->_lastTerm = new Zend_Search_Lucene_Index_Term($text, $this->_fields[$fieldNum]->name);
-     // Frequency and proximity pointers are stored as deltas to the previous term info
-     $docFreq = $this->_tisFile->readVInt();
-     $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
-     $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();
-     // Skip offset is present only if docFreq is not less than the skip interval
-     $skipOffset = ($docFreq >= $this->_skipInterval) ? $this->_tisFile->readVInt() : 0;
-     $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
-     $needPositions = ($this->_termsScanMode == self::SM_FULL_INFO || $this->_termsScanMode == self::SM_MERGE_INFO);
-     if ($needPositions) {
-         $this->_lastTermPositions = array();
-         // Step 1: frequencies of the term for each document
-         $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
-         $freqByDoc = array();
-         $docId = 0;
-         for ($n = 0; $n < $this->_lastTermInfo->docFreq; $n++) {
-             $delta = $this->_frqFile->readVInt();
-             if ($delta % 2 == 1) {
-                 $docId += ($delta-1)/2;
-                 $freqByDoc[$docId] = 1;
-             } else {
-                 $docId += $delta/2;
-                 $freqByDoc[$docId] = $this->_frqFile->readVInt();
-             }
-         }
-         // Step 2: positions of the term for each document
-         $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
-         foreach ($freqByDoc as $docId => $freq) {
-             $position = 0;
-             $positions = array();
-             for ($n = 0; $n < $freq; $n++) {
-                 $position += $this->_prxFile->readVInt();
-                 $positions[] = $position;
-             }
-             // Documents without a map entry are left out
-             if (!isset($this->_docMap[$docId])) {
-                 continue;
-             }
-             $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
-         }
-     }
-     $this->_termCount--;
-     if ($this->_termCount == 0) {
-         // It was the last term, release the files
-         $this->_tisFile = null;
-         $this->_frqFile = null;
-         $this->_prxFile = null;
-     }
-     return $this->_lastTerm;
- }
- /**
- * Close terms stream
- *
- * Releases the stream files and forgets the current term state.
- * Should be used for resources clean up if stream is not read up to the end
- */
- public function closeTermsStream()
- {
-     // Properties are reset to null one by one, in this order
-     $streamState = array('_tisFile',
-                          '_frqFile',
-                          '_prxFile',
-                          '_lastTerm',
-                          '_lastTermInfo',
-                          '_lastTermPositions',
-                          '_docMap');
-     foreach ($streamState as $property) {
-         $this->$property = null;
-     }
- }
- /**
- * Gives the term the stream is currently positioned at
- *
- * @return Zend_Search_Lucene_Index_Term|null
- */
- public function currentTerm()
- {
-     $current = $this->_lastTerm;
-     return $current;
- }
- /**
- * Gives all positions of the current term in the documents.
- * Return array structure: array( docId => array( pos1, pos2, ...), ...)
- *
- * @return array
- */
- public function currentTermPositions()
- {
-     $positions = $this->_lastTermPositions;
-     return $positions;
- }
- }
|