Browse Source

Zend_Search_Lucene: fixed problem with indexing empty documents. ZF-6088.

git-svn-id: http://framework.zend.com/svn/framework/standard/trunk@19204 44c647ce-9c0f-0410-b52a-842ac1e357ba
alexander 16 years ago
parent
commit
2da755e1ee

+ 4 - 2
library/Zend/Search/Lucene/Index/SegmentInfo.php

@@ -1850,9 +1850,11 @@ class Zend_Search_Lucene_Index_SegmentInfo implements Zend_Search_Lucene_Index_T
                 break;
         }
 
-
+        // Calculate the next segment start id up front, since the $this->_docMap structure may be cleaned by the $this->nextTerm() call below
+        $nextSegmentStartId = $startId + (($mode == self::SM_MERGE_INFO) ? count($this->_docMap) : $this->_docCount);
         $this->nextTerm();
-        return $startId + (($mode == self::SM_MERGE_INFO) ? count($this->_docMap) : $this->_docCount);
+
+        return $nextSegmentStartId;
     }
 
 

+ 17 - 7
library/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php

@@ -80,7 +80,6 @@ class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_
 
         foreach ($document->getFieldNames() as $fieldName) {
             $field = $document->getField($fieldName);
-            $this->addField($field);
 
             if ($field->storeTermVector) {
                 /**
@@ -119,12 +118,22 @@ class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_
                         $this->_termDocs[$termKey][$this->_docCount][] = $position;
                     }
 
-                    $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name,
-                                                                                                   $tokenCounter)*
-                                                                           $document->boost*
-                                                                           $field->boost ));
+                    if ($tokenCounter == 0) {
+                        // Field contains empty value. Treat it as non-indexed and non-tokenized
+                        $field = clone($field);
+                        $field->isIndexed = $field->isTokenized = false;
+                    } else {
+                        $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name,
+                                                                                                       $tokenCounter)*
+                                                                               $document->boost*
+                                                                               $field->boost ));
+                    }
+                } else if (($fieldUtf8Value = $field->getUtf8Value()) == '') {
+                    // Field contains empty value. Treat it as non-indexed and non-tokenized
+                    $field = clone($field);
+                    $field->isIndexed = $field->isTokenized = false;
                 } else {
-                    $term = new Zend_Search_Lucene_Index_Term($field->getUtf8Value(), $field->name);
+                    $term = new Zend_Search_Lucene_Index_Term($fieldUtf8Value, $field->name);
                     $termKey = $term->key();
 
                     if (!isset($this->_termDictionary[$termKey])) {
@@ -147,8 +156,9 @@ class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_
             if ($field->isStored) {
                 $storedFields[] = $field;
             }
-        }
 
+            $this->addField($field);
+        }
 
         foreach ($this->_fields as $fieldName => $field) {
             if (!$field->isIndexed) {