Sfoglia il codice sorgente

Zend_Search_Lucene: Fix for ZF-4252 (Zend_Search_Lucene_Document_Html add spaces for html tags). Applying patch provided by Christopher Thomas.

git-svn-id: http://framework.zend.com/svn/framework/standard/trunk@21939 44c647ce-9c0f-0410-b52a-842ac1e357ba
alexander 15 anni fa
parent
commit
c0aa412a71

+ 15 - 2
library/Zend/Search/Lucene/Document/Html.php

@@ -68,6 +68,17 @@ class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
     private static $_excludeNoFollowLinks = false;
 
     /**
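+     * Inline tag names: a text node directly inside one of these tags gets no
+     * trailing space appended during text extraction (see _retrieveNodeText()).
+     *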
+     * @var array
+     */
+    private $_inlineTags = array('a', 'abbr', 'acronym', 'dfn', 'em', 'strong', 'code',
+                                'samp', 'kbd', 'var', 'b', 'i', 'big', 'small', 'strike',
+                                'tt', 'u', 'font', 'span', 'bdo', 'cite', 'del', 'ins',
+                                'q', 'sub', 'sup');
+
+    /**
      * Object constructor
      *
      * @param string  $data         HTML string (may be an HTML fragment)
@@ -197,8 +208,10 @@ class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
     private function _retrieveNodeText(DOMNode $node, &$text)
     {
         if ($node->nodeType == XML_TEXT_NODE) {
-            $text .= $node->nodeValue ;
-            $text .= ' ';
+            $text .= $node->nodeValue;
+            if(!in_array($node->parentNode->tagName, $this->_inlineTags)) {
+                $text .= ' ';
+            }
         } else if ($node->nodeType == XML_ELEMENT_NODE  &&  $node->nodeName != 'script') {
             foreach ($node->childNodes as $childNode) {
                 $this->_retrieveNodeText($childNode, $text);

+ 43 - 0
tests/Zend/Search/Lucene/DocumentTest.php

@@ -55,6 +55,23 @@ require_once 'PHPUnit/Framework/TestCase.php';
  */
 class Zend_Search_Lucene_DocumentTest extends PHPUnit_Framework_TestCase
 {
+
+    /** Remove every non-directory entry directly inside $dirName; a missing directory is ignored. */
+    private function _clearDirectory($dirName)
+    {
+        // is_dir() is already false for missing paths; also bail out if the handle cannot be opened.
+        if (!is_dir($dirName) || ($dir = opendir($dirName)) === false) {
+            return;
+        }
+        while (($file = readdir($dir)) !== false) {
+            if (!is_dir($dirName . '/' . $file)) {
+                // Best-effort cleanup: a file that cannot be removed must not fail the test.
+                @unlink($dirName . '/' . $file);
+            }
+        }
+        closedir($dir);
+    }
+
     public function testCreate()
     {
         $document =  new Zend_Search_Lucene_Document();
@@ -171,6 +188,32 @@ class Zend_Search_Lucene_DocumentTest extends PHPUnit_Framework_TestCase
                                 'contributing.html'));
     }
 
+
+    /**
+     * Text adjoining an inline tag is indexed as one term; a block-level tag still splits terms.
+     *
+     * @group ZF-4252
+     */
+    public function testHtmlInlineTagsIndexing()
+    {
+        $indexDir = dirname(__FILE__) . '/_index/_files';
+        $index    = Zend_Search_Lucene::create($indexDir);
+
+        $htmlString = '<html><head><title>Hello World</title></head>'
+                    . '<body><b>Zend</b>Framework' . "\n" . ' <div>Foo</div>Bar ' . "\n"
+                    . ' <strong>Test</strong></body></html>';
+        $index->addDocument(Zend_Search_Lucene_Document_Html::loadHTML($htmlString));
+
+        // Collect the counts first so the index files are removed even when an assertion fails.
+        $fooBarHits        = count($index->find('FooBar'));
+        $zendFrameworkHits = count($index->find('ZendFramework'));
+        unset($index);
+        $this->_clearDirectory($indexDir);
+
+        $this->assertEquals(0, $fooBarHits);
+        $this->assertEquals(1, $zendFrameworkHits);
+    }
+
     public function testHtmlNoFollowLinks()
     {
         $html = '<HTML>'