DocumentTest.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage UnitTests
  18. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id$
  21. */
  22. /**
  23. * Zend_Search_Lucene_Document
  24. */
  25. require_once 'Zend/Search/Lucene/Document.php';
  26. /**
  27. * Zend_Search_Lucene_Document_Docx
  28. */
  29. require_once 'Zend/Search/Lucene/Document/Docx.php';
  30. /**
  31. * Zend_Search_Lucene_Document_Pptx
  32. */
  33. require_once 'Zend/Search/Lucene/Document/Pptx.php';
  34. /**
  35. * Zend_Search_Lucene_Document_Xlsx
  36. */
  37. require_once 'Zend/Search/Lucene/Document/Xlsx.php';
  38. /**
  39. * PHPUnit test case
  40. */
  41. require_once 'PHPUnit/Framework/TestCase.php';
  42. /**
  43. * @category Zend
  44. * @package Zend_Search_Lucene
  45. * @subpackage UnitTests
  46. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  47. * @license http://framework.zend.com/license/new-bsd New BSD License
  48. * @group Zend_Search_Lucene
  49. */
  50. class Zend_Search_Lucene_DocumentTest extends PHPUnit_Framework_TestCase
  51. {
  52. private function _clearDirectory($dirName)
  53. {
  54. if (!file_exists($dirName) || !is_dir($dirName)) {
  55. return;
  56. }
  57. // remove files from temporary direcytory
  58. $dir = opendir($dirName);
  59. while (($file = readdir($dir)) !== false) {
  60. if (!is_dir($dirName . '/' . $file)) {
  61. @unlink($dirName . '/' . $file);
  62. }
  63. }
  64. closedir($dir);
  65. }
  66. public function testCreate()
  67. {
  68. $document = new Zend_Search_Lucene_Document();
  69. $this->assertEquals($document->boost, 1);
  70. }
  71. public function testFields()
  72. {
  73. $document = new Zend_Search_Lucene_Document();
  74. $document->addField(Zend_Search_Lucene_Field::Text('title', 'Title'));
  75. $document->addField(Zend_Search_Lucene_Field::Text('annotation', 'Annotation'));
  76. $document->addField(Zend_Search_Lucene_Field::Text('body', 'Document body, document body, document body...'));
  77. $fieldnamesDiffArray = array_diff($document->getFieldNames(), array('title', 'annotation', 'body'));
  78. $this->assertTrue(is_array($fieldnamesDiffArray));
  79. $this->assertEquals(count($fieldnamesDiffArray), 0);
  80. $this->assertEquals($document->title, 'Title');
  81. $this->assertEquals($document->annotation, 'Annotation');
  82. $this->assertEquals($document->body, 'Document body, document body, document body...');
  83. $this->assertEquals($document->getField('title')->value, 'Title');
  84. $this->assertEquals($document->getField('annotation')->value, 'Annotation');
  85. $this->assertEquals($document->getField('body')->value, 'Document body, document body, document body...');
  86. $this->assertEquals($document->getFieldValue('title'), 'Title');
  87. $this->assertEquals($document->getFieldValue('annotation'), 'Annotation');
  88. $this->assertEquals($document->getFieldValue('body'), 'Document body, document body, document body...');
  89. if (PHP_OS == 'AIX') {
  90. return; // tests below here not valid on AIX
  91. }
  92. $wordsWithUmlautsIso88591 = iconv('UTF-8', 'ISO-8859-1', 'Words with umlauts: åãü...');
  93. $document->addField(Zend_Search_Lucene_Field::Text('description', $wordsWithUmlautsIso88591, 'ISO-8859-1'));
  94. $this->assertEquals($document->description, $wordsWithUmlautsIso88591);
  95. $this->assertEquals($document->getFieldUtf8Value('description'), 'Words with umlauts: åãü...');
  96. }
  97. public function testAddFieldMethodChaining()
  98. {
  99. $document = new Zend_Search_Lucene_Document();
  100. $this->assertTrue($document->addField(Zend_Search_Lucene_Field::Text('title', 'Title')) instanceof Zend_Search_Lucene_Document);
  101. $document = new Zend_Search_Lucene_Document();
  102. $document->addField(Zend_Search_Lucene_Field::Text('title', 'Title'))
  103. ->addField(Zend_Search_Lucene_Field::Text('annotation', 'Annotation'))
  104. ->addField(Zend_Search_Lucene_Field::Text('body', 'Document body, document body, document body...'));
  105. }
  106. public function testHtmlHighlighting()
  107. {
  108. $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
  109. $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);
  110. $doc->highlight('document', '#66ffff');
  111. $this->assertTrue(strpos($doc->getHTML(), '<b style="color:black;background-color:#66ffff">Document</b> body.') !== false);
  112. }
  113. public function testHtmlExtendedHighlighting()
  114. {
  115. $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
  116. $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);
  117. $doc->highlightExtended('document',
  118. array('Zend_Search_Lucene_DocumentTest_DocHighlightingContainer',
  119. 'extendedHighlightingCallback'),
  120. array('style="color:black;background-color:#ff66ff"',
  121. '(!!!)'));
  122. $this->assertTrue(strpos($doc->getHTML(), '<b style="color:black;background-color:#ff66ff">Document</b>(!!!) body.') !== false);
  123. }
  124. public function testHtmlWordsHighlighting()
  125. {
  126. $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
  127. $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);
  128. $doc->highlight(array('document', 'body'), '#66ffff');
  129. $highlightedHTML = $doc->getHTML();
  130. $this->assertTrue(strpos($highlightedHTML, '<b style="color:black;background-color:#66ffff">Document</b>') !== false);
  131. $this->assertTrue(strpos($highlightedHTML, '<b style="color:black;background-color:#66ffff">body</b>') !== false);
  132. }
  133. public function testHtmlExtendedHighlightingCorrectWrongHtml()
  134. {
  135. $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
  136. $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);
  137. $doc->highlightExtended('document',
  138. array('Zend_Search_Lucene_DocumentTest_DocHighlightingContainer',
  139. 'extendedHighlightingCallback'),
  140. array('style="color:black;background-color:#ff66ff"',
  141. '<h3>(!!!)' /* Wrong HTML here, <h3> tag is not closed */));
  142. $this->assertTrue(strpos($doc->getHTML(), '<b style="color:black;background-color:#ff66ff">Document</b><h3>(!!!)</h3> body.') !== false);
  143. }
  144. public function testHtmlLinksProcessing()
  145. {
  146. $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile(dirname(__FILE__) . '/_indexSource/_files/contributing.documentation.html', true);
  147. $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);
  148. $this->assertTrue(array_values($doc->getHeaderLinks()) ==
  149. array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html'));
  150. $this->assertTrue(array_values($doc->getLinks()) ==
  151. array('contributing.bugs.html',
  152. 'contributing.wishlist.html',
  153. 'developers.documentation.html',
  154. 'faq.translators-revision-tracking.html',
  155. 'index.html',
  156. 'contributing.html'));
  157. }
  158. /**
  159. * @group ZF-4252
  160. */
  161. public function testHtmlInlineTagsIndexing()
  162. {
  163. $index = Zend_Search_Lucene::create(dirname(__FILE__) . '/_index/_files');
  164. $htmlString = '<html><head><title>Hello World</title></head>'
  165. . '<body><b>Zend</b>Framework' . "\n" . ' <div>Foo</div>Bar ' . "\n"
  166. . ' <strong>Test</strong></body></html>';
  167. $doc = Zend_Search_Lucene_Document_Html::loadHTML($htmlString);
  168. $index->addDocument($doc);
  169. $hits = $index->find('FooBar');
  170. $this->assertEquals(count($hits), 0);
  171. $hits = $index->find('ZendFramework');
  172. $this->assertEquals(count($hits), 1);
  173. unset($index);
  174. $this->_clearDirectory(dirname(__FILE__) . '/_index/_files');
  175. }
  176. /**
  177. * @group ZF-8740
  178. */
  179. public function testHtmlAreaTags()
  180. {
  181. $html = '<HTML>'
  182. . '<HEAD><TITLE>Page title</TITLE></HEAD>'
  183. . '<BODY>'
  184. . 'Document body.'
  185. . '<img src="img.png" width="640" height="480" alt="some image" usemap="#some_map" />'
  186. . '<map name="some_map">'
  187. . '<area shape="rect" coords="0,0,100,100" href="link3.html" alt="Link 3" />'
  188. . '<area shape="rect" coords="200,200,300,300" href="link4.html" alt="Link 4" />'
  189. . '</map>'
  190. . '<a href="link1.html">Link 1</a>.'
  191. . '<a href="link2.html" rel="nofollow">Link 1</a>.'
  192. . '</BODY>'
  193. . '</HTML>';
  194. $oldNoFollowValue = Zend_Search_Lucene_Document_Html::getExcludeNoFollowLinks();
  195. Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(false);
  196. $doc1 = Zend_Search_Lucene_Document_Html::loadHTML($html);
  197. $this->assertTrue($doc1 instanceof Zend_Search_Lucene_Document_Html);
  198. $links = array('link1.html', 'link2.html', 'link3.html', 'link4.html');
  199. $this->assertTrue(array_values($doc1->getLinks()) == $links);
  200. }
  201. public function testHtmlNoFollowLinks()
  202. {
  203. $html = '<HTML>'
  204. . '<HEAD><TITLE>Page title</TITLE></HEAD>'
  205. . '<BODY>'
  206. . 'Document body.'
  207. . '<a href="link1.html">Link 1</a>.'
  208. . '<a href="link2.html" rel="nofollow">Link 1</a>.'
  209. . '</BODY>'
  210. . '</HTML>';
  211. $oldNoFollowValue = Zend_Search_Lucene_Document_Html::getExcludeNoFollowLinks();
  212. Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(false);
  213. $doc1 = Zend_Search_Lucene_Document_Html::loadHTML($html);
  214. $this->assertTrue($doc1 instanceof Zend_Search_Lucene_Document_Html);
  215. $this->assertTrue(array_values($doc1->getLinks()) == array('link1.html', 'link2.html'));
  216. Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);
  217. $doc2 = Zend_Search_Lucene_Document_Html::loadHTML($html);
  218. $this->assertTrue($doc2 instanceof Zend_Search_Lucene_Document_Html);
  219. $this->assertTrue(array_values($doc2->getLinks()) == array('link1.html'));
  220. }
  221. public function testDocx()
  222. {
  223. if (!class_exists('ZipArchive')) {
  224. $this->markTestSkipped('ZipArchive class (Zip extension) is not loaded');
  225. }
  226. $docxDocument = Zend_Search_Lucene_Document_Docx::loadDocxFile(dirname(__FILE__) . '/_openXmlDocuments/test.docx', true);
  227. $this->assertTrue($docxDocument instanceof Zend_Search_Lucene_Document_Docx);
  228. $this->assertEquals($docxDocument->getFieldValue('title'), 'Test document');
  229. $this->assertEquals($docxDocument->getFieldValue('description'), 'This is a test document which can be used to demonstrate something.');
  230. $this->assertTrue($docxDocument->getFieldValue('body') != '');
  231. try {
  232. $docxDocument1 = Zend_Search_Lucene_Document_Docx::loadDocxFile(dirname(__FILE__) . '/_openXmlDocuments/dummy.docx', true);
  233. $this->fail('File not readable exception is expected.');
  234. } catch (Zend_Search_Lucene_Document_Exception $e) {
  235. if (strpos($e->getMessage(), 'is not readable') === false) {
  236. // Passthrough exception
  237. throw $e;
  238. }
  239. }
  240. }
  241. public function testPptx()
  242. {
  243. if (!class_exists('ZipArchive')) {
  244. $this->markTestSkipped('ZipArchive class (Zip extension) is not loaded');
  245. }
  246. $pptxDocument = Zend_Search_Lucene_Document_Pptx::loadPptxFile(dirname(__FILE__) . '/_openXmlDocuments/test.pptx', true);
  247. $this->assertTrue($pptxDocument instanceof Zend_Search_Lucene_Document_Pptx);
  248. $this->assertEquals($pptxDocument->getFieldValue('title'), 'Test document');
  249. $this->assertEquals($pptxDocument->getFieldValue('description'), 'This is a test document which can be used to demonstrate something.');
  250. $this->assertTrue($pptxDocument->getFieldValue('body') != '');
  251. }
  252. public function testXlsx()
  253. {
  254. if (!class_exists('ZipArchive')) {
  255. $this->markTestSkipped('ZipArchive class (Zip extension) is not loaded');
  256. }
  257. $xlsxDocument = Zend_Search_Lucene_Document_Xlsx::loadXlsxFile(dirname(__FILE__) . '/_openXmlDocuments/test.xlsx', true);
  258. $this->assertTrue($xlsxDocument instanceof Zend_Search_Lucene_Document_Xlsx);
  259. $this->assertEquals($xlsxDocument->getFieldValue('title'), 'Test document');
  260. $this->assertEquals($xlsxDocument->getFieldValue('description'), 'This is a test document which can be used to demonstrate something.');
  261. $this->assertTrue($xlsxDocument->getFieldValue('body') != '');
  262. $this->assertTrue( strpos($xlsxDocument->getFieldValue('body'), 'ipsum') !== false );
  263. }
  264. /**
  265. * @group ZF-10686
  266. */
  267. public function testLoadHtmlWithAttributesInTagHTML()
  268. {
  269. $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML lang="en_US"><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
  270. $this->assertEquals('Page title ', $doc->title);
  271. }
  272. }
  273. class Zend_Search_Lucene_DocumentTest_DocHighlightingContainer {
  274. public static function extendedHighlightingCallback($stringToHighlight, $param1, $param2)
  275. {
  276. return '<b ' . $param1 . '>' . $stringToHighlight . '</b>' . $param2;
  277. }
  278. }