2
0

Phrase.php 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Search
  18. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. */
  21. /** Zend_Search_Lucene_Search_Query_Preprocessing */
  22. require_once 'Zend/Search/Lucene/Search/Query/Preprocessing.php';
  23. /** Zend_Search_Lucene_Search_Query_Phrase */
  24. require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';
  25. /** Zend_Search_Lucene_Search_Query_Insignificant */
  26. require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
  27. /** Zend_Search_Lucene_Search_Query_Empty */
  28. require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
  29. /** Zend_Search_Lucene_Search_Query_Term */
  30. require_once 'Zend/Search/Lucene/Search/Query/Term.php';
  31. /** Zend_Search_Lucene_Index_Term */
  32. require_once 'Zend/Search/Lucene/Index/Term.php';
  /**
   * Preprocessing query for a phrase.
   *
   * It's an internal class intended to finalize query processing after query parsing.
   * This type of query is not actually involved in query execution: rewrite()
   * replaces it with Boolean, Term, Phrase or Insignificant queries.
   *
   * @category   Zend
   * @package    Zend_Search_Lucene
   * @subpackage Search
   * @internal
   * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
   * @license    http://framework.zend.com/license/new-bsd     New BSD License
   */
  class Zend_Search_Lucene_Search_Query_Preprocessing_Phrase extends Zend_Search_Lucene_Search_Query_Preprocessing
  {
      /**
       * Phrase to find.
       *
       * @var string
       */
      private $_phrase;

      /**
       * Phrase encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
       *
       * @var string
       */
      private $_phraseEncoding;

      /**
       * Field name (null means "search through all fields / the default search field").
       *
       * @var string|null
       */
      private $_field;

      /**
       * Sets the number of other words permitted between words in query phrase.
       * If zero, then this is an exact phrase search. For larger values this works
       * like a WITHIN or NEAR operator.
       *
       * The slop is in fact an edit-distance, where the units correspond to
       * moves of terms in the query phrase out of position. For example, to switch
       * the order of two words requires two moves (the first move places the words
       * atop one another), so to permit re-orderings of phrases, the slop must be
       * at least two.
       * More exact matches are scored higher than sloppier matches, thus search
       * results are sorted by exactness.
       *
       * The slop is zero by default, requiring exact matches.
       *
       * @var integer
       */
      private $_slop = 0;

      /**
       * Class constructor. Create a new preprocessing object for a phrase query.
       *
       * @param string      $phrase         Phrase to search.
       * @param string      $phraseEncoding Phrase encoding.
       * @param string|null $fieldName      Field name, or null if the field is not specified.
       */
      public function __construct($phrase, $phraseEncoding, $fieldName)
      {
          $this->_phrase         = $phrase;
          $this->_phraseEncoding = $phraseEncoding;
          $this->_field          = $fieldName;
      }

      /**
       * Set slop
       *
       * @param integer $slop
       */
      public function setSlop($slop)
      {
          $this->_slop = $slop;
      }

      /**
       * Get slop
       *
       * @return integer
       */
      public function getSlop()
      {
          return $this->_slop;
      }

      /**
       * Re-write query into primitive queries in the context of specified index
       *
       * The result depends on the phrase and the field:
       *  - no field specified: a Boolean query with one rewritten phrase subquery per search field;
       *  - the whole phrase is a term of the field in the index: a Term query (binary match);
       *  - the analyzer produces no tokens: an Insignificant query;
       *  - the analyzer produces one token: a Term query;
       *  - otherwise: a Phrase query carrying this query's slop and boost.
       *
       * @param Zend_Search_Lucene_Interface $index
       * @return Zend_Search_Lucene_Search_Query
       */
      public function rewrite(Zend_Search_Lucene_Interface $index)
      {
          // Wildcards are intentionally allowed within phrases: they are either removed
          // by the text analyzer or used as a part of the keyword for keyword fields.

          // Split query into subqueries if field name is not specified
          if ($this->_field === null) {
              require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
              require_once 'Zend/Search/Lucene.php';

              $query = new Zend_Search_Lucene_Search_Query_Boolean();
              $query->setBoost($this->getBoost());

              $defaultField = Zend_Search_Lucene::getDefaultSearchField();
              if ($defaultField === null) {
                  $searchFields = $index->getFieldNames(true);
              } else {
                  $searchFields = array($defaultField);
              }

              foreach ($searchFields as $fieldName) {
                  $subquery = new Zend_Search_Lucene_Search_Query_Preprocessing_Phrase($this->_phrase,
                                                                                       $this->_phraseEncoding,
                                                                                       $fieldName);
                  $subquery->setSlop($this->getSlop());

                  $query->addSubquery($subquery->rewrite($index));
              }

              $this->_matches = $query->getQueryTerms();
              return $query;
          }

          // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
          // encoding is not used since we expect binary matching
          $term = new Zend_Search_Lucene_Index_Term($this->_phrase, $this->_field);
          if ($index->hasTerm($term)) {
              return $this->_rewriteToTermQuery($term);
          }

          // tokenize phrase using current analyzer and process it as a phrase query
          $tokens     = $this->_tokenizePhrase();
          $tokenCount = count($tokens);

          if ($tokenCount == 0) {
              $this->_matches = array();
              return new Zend_Search_Lucene_Search_Query_Insignificant();
          }

          if ($tokenCount == 1) {
              $term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
              return $this->_rewriteToTermQuery($term);
          }

          // It's a non-trivial phrase query
          $query = new Zend_Search_Lucene_Search_Query_Phrase();
          // Propagate the boost like every other branch does; otherwise a boost
          // given to a multi-word phrase is silently lost after rewriting.
          $query->setBoost($this->getBoost());

          // Positions are accumulated from the analyzer's position increments so that
          // gaps left by removed tokens are preserved in the phrase query.
          $position = -1;
          foreach ($tokens as $token) {
              $position += $token->getPositionIncrement();
              $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field);
              $query->addTerm($term, $position);
          }
          $query->setSlop($this->getSlop());

          $this->_matches = $query->getQueryTerms();
          return $query;
      }

      /**
       * Query specific matches highlighting
       *
       * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
       */
      protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
      {
          // Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them
          // Skip exact term matching recognition, keyword fields highlighting is not supported
          // Skip wildcard queries recognition. Supported wildcards are removed by text analyzer

          // tokenize phrase using current analyzer and process it as a phrase query
          $tokens     = $this->_tokenizePhrase();
          $tokenCount = count($tokens);

          if ($tokenCount == 0) {
              // Do nothing
              return;
          }

          if ($tokenCount == 1) {
              $highlighter->highlight($tokens[0]->getTermText());
              return;
          }

          // It's a non-trivial phrase query
          $words = array();
          foreach ($tokens as $token) {
              $words[] = $token->getTermText();
          }
          $highlighter->highlight($words);
      }

      /**
       * Print a query
       *
       * @return string
       */
      public function __toString()
      {
          // It's used only for query visualisation, so we don't care about characters escaping
          if ($this->_field !== null) {
              $query = $this->_field . ':';
          } else {
              $query = '';
          }

          $query .= '"' . $this->_phrase . '"';

          if ($this->_slop != 0) {
              $query .= '~' . $this->_slop;
          }

          if ($this->getBoost() != 1) {
              $query .= '^' . round($this->getBoost(), 4);
          }

          return $query;
      }

      /**
       * Tokenize the phrase with the default analyzer using the phrase encoding.
       *
       * @return array Tokens produced by the analyzer for the phrase
       */
      private function _tokenizePhrase()
      {
          require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

          return Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding);
      }

      /**
       * Build a boosted single-term query for $term and record its terms as matches.
       *
       * @param Zend_Search_Lucene_Index_Term $term
       * @return Zend_Search_Lucene_Search_Query_Term
       */
      private function _rewriteToTermQuery(Zend_Search_Lucene_Index_Term $term)
      {
          $query = new Zend_Search_Lucene_Search_Query_Term($term);
          $query->setBoost($this->getBoost());

          $this->_matches = $query->getQueryTerms();
          return $query;
      }
  }