Term.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Search
  18. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. */
  21. /** Zend_Search_Lucene_Search_Query_Preprocessing */
  22. require_once 'Zend/Search/Lucene/Search/Query/Preprocessing.php';
  23. /** Zend_Search_Lucene_Search_Query_Phrase */
  24. require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';
  25. /** Zend_Search_Lucene_Search_Query_Insignificant */
  26. require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
  27. /** Zend_Search_Lucene_Search_Query_Empty */
  28. require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
  29. /** Zend_Search_Lucene_Search_Query_Term */
  30. require_once 'Zend/Search/Lucene/Search/Query/Term.php';
  31. /** Zend_Search_Lucene_Index_Term */
  32. require_once 'Zend/Search/Lucene/Index/Term.php';
  33. /**
  34. * It's an internal class intended to finalize query processing after query parsing.
  35. * This type of query is not actually involved into query execution.
  36. *
  37. * @category Zend
  38. * @package Zend_Search_Lucene
  39. * @subpackage Search
  40. * @internal
  41. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  42. * @license http://framework.zend.com/license/new-bsd New BSD License
  43. */
  44. class Zend_Search_Lucene_Search_Query_Preprocessing_Term extends Zend_Search_Lucene_Search_Query_Preprocessing
  45. {
  46. /**
  47. * word (query parser lexeme) to find.
  48. *
  49. * @var string
  50. */
  51. private $_word;
  52. /**
  53. * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
  54. *
  55. * @var string
  56. */
  57. private $_encoding;
  58. /**
  59. * Field name.
  60. *
  61. * @var string
  62. */
  63. private $_field;
  64. /**
  65. * Class constructor. Create a new preprocessing object for prase query.
  66. *
  67. * @param string $word Non-tokenized word (query parser lexeme) to search.
  68. * @param string $encoding Word encoding.
  69. * @param string $fieldName Field name.
  70. */
  71. public function __construct($word, $encoding, $fieldName)
  72. {
  73. $this->_word = $word;
  74. $this->_encoding = $encoding;
  75. $this->_field = $fieldName;
  76. }
  77. /**
  78. * Re-write query into primitive queries in the context of specified index
  79. *
  80. * @param Zend_Search_Lucene_Interface $index
  81. * @return Zend_Search_Lucene_Search_Query
  82. */
  83. public function rewrite(Zend_Search_Lucene_Interface $index)
  84. {
  85. if ($this->_field === null) {
  86. $query = new Zend_Search_Lucene_Search_Query_MultiTerm();
  87. $query->setBoost($this->getBoost());
  88. $hasInsignificantSubqueries = false;
  89. if (Zend_Search_Lucene::getDefaultSearchField() === null) {
  90. $searchFields = $index->getFieldNames(true);
  91. } else {
  92. $searchFields = array(Zend_Search_Lucene::getDefaultSearchField());
  93. }
  94. foreach ($searchFields as $fieldName) {
  95. $subquery = new Zend_Search_Lucene_Search_Query_Preprocessing_Term($this->_word,
  96. $this->_encoding,
  97. $fieldName);
  98. $rewrittenSubquery = $subquery->rewrite($index);
  99. foreach ($rewrittenSubquery->getQueryTerms() as $term) {
  100. $query->addTerm($term);
  101. }
  102. if ($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Insignificant) {
  103. $hasInsignificantSubqueries = true;
  104. }
  105. }
  106. if (count($query->getTerms()) == 0) {
  107. $this->_matches = array();
  108. if ($hasInsignificantSubqueries) {
  109. return new Zend_Search_Lucene_Search_Query_Insignificant();
  110. } else {
  111. return new Zend_Search_Lucene_Search_Query_Empty();
  112. }
  113. }
  114. $this->_matches = $query->getQueryTerms();
  115. return $query;
  116. }
  117. // -------------------------------------
  118. // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
  119. // encoding is not used since we expect binary matching
  120. $term = new Zend_Search_Lucene_Index_Term($this->_word, $this->_field);
  121. if ($index->hasTerm($term)) {
  122. $query = new Zend_Search_Lucene_Search_Query_Term($term);
  123. $query->setBoost($this->getBoost());
  124. $this->_matches = $query->getQueryTerms();
  125. return $query;
  126. }
  127. // -------------------------------------
  128. // Recognize wildcard queries
  129. /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
  130. if (@preg_match('/\pL/u', 'a') == 1) {
  131. $word = iconv($this->_encoding, 'UTF-8', $this->_word);
  132. $wildcardsPattern = '/[*?]/u';
  133. $subPatternsEncoding = 'UTF-8';
  134. } else {
  135. $word = $this->_word;
  136. $wildcardsPattern = '/[*?]/';
  137. $subPatternsEncoding = $this->_encoding;
  138. }
  139. $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);
  140. if (count($subPatterns) > 1) {
  141. // Wildcard query is recognized
  142. $pattern = '';
  143. foreach ($subPatterns as $id => $subPattern) {
  144. // Append corresponding wildcard character to the pattern before each sub-pattern (except first)
  145. if ($id != 0) {
  146. $pattern .= $word[ $subPattern[1] - 1 ];
  147. }
  148. // Check if each subputtern is a single word in terms of current analyzer
  149. $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($subPattern[0], $subPatternsEncoding);
  150. if (count($tokens) > 1) {
  151. require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
  152. throw new Zend_Search_Lucene_Search_QueryParserException('Wildcard search is supported only for non-multiple word terms');
  153. }
  154. foreach ($tokens as $token) {
  155. $pattern .= $token->getTermText();
  156. }
  157. }
  158. $term = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);
  159. $query = new Zend_Search_Lucene_Search_Query_Wildcard($term);
  160. $query->setBoost($this->getBoost());
  161. // Get rewritten query. Important! It also fills terms matching container.
  162. $rewrittenQuery = $query->rewrite($index);
  163. $this->_matches = $query->getQueryTerms();
  164. return $rewrittenQuery;
  165. }
  166. // -------------------------------------
  167. // Recognize one-term multi-term and "insignificant" queries
  168. $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
  169. if (count($tokens) == 0) {
  170. $this->_matches = array();
  171. return new Zend_Search_Lucene_Search_Query_Insignificant();
  172. }
  173. if (count($tokens) == 1) {
  174. $term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
  175. $query = new Zend_Search_Lucene_Search_Query_Term($term);
  176. $query->setBoost($this->getBoost());
  177. $this->_matches = $query->getQueryTerms();
  178. return $query;
  179. }
  180. //It's not insignificant or one term query
  181. $query = new Zend_Search_Lucene_Search_Query_MultiTerm();
  182. /**
  183. * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
  184. * analizer design features
  185. */
  186. foreach ($tokens as $token) {
  187. $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field);
  188. $query->addTerm($term, true); // all subterms are required
  189. }
  190. $query->setBoost($this->getBoost());
  191. $this->_matches = $query->getQueryTerms();
  192. return $query;
  193. }
  194. /**
  195. * Query specific matches highlighting
  196. *
  197. * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
  198. */
  199. protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
  200. {
  201. /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */
  202. /** Skip exact term matching recognition, keyword fields highlighting is not supported */
  203. // -------------------------------------
  204. // Recognize wildcard queries
  205. /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
  206. if (@preg_match('/\pL/u', 'a') == 1) {
  207. $word = iconv($this->_encoding, 'UTF-8', $this->_word);
  208. $wildcardsPattern = '/[*?]/u';
  209. $subPatternsEncoding = 'UTF-8';
  210. } else {
  211. $word = $this->_word;
  212. $wildcardsPattern = '/[*?]/';
  213. $subPatternsEncoding = $this->_encoding;
  214. }
  215. $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);
  216. if (count($subPatterns) > 1) {
  217. // Wildcard query is recognized
  218. $pattern = '';
  219. foreach ($subPatterns as $id => $subPattern) {
  220. // Append corresponding wildcard character to the pattern before each sub-pattern (except first)
  221. if ($id != 0) {
  222. $pattern .= $word[ $subPattern[1] - 1 ];
  223. }
  224. // Check if each subputtern is a single word in terms of current analyzer
  225. $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($subPattern[0], $subPatternsEncoding);
  226. if (count($tokens) > 1) {
  227. // Do nothing (nothing is highlighted)
  228. return;
  229. }
  230. foreach ($tokens as $token) {
  231. $pattern .= $token->getTermText();
  232. }
  233. }
  234. $term = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);
  235. $query = new Zend_Search_Lucene_Search_Query_Wildcard($term);
  236. $query->_highlightMatches($highlighter);
  237. return;
  238. }
  239. // -------------------------------------
  240. // Recognize one-term multi-term and "insignificant" queries
  241. $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
  242. if (count($tokens) == 0) {
  243. // Do nothing
  244. return;
  245. }
  246. if (count($tokens) == 1) {
  247. $highlighter->highlight($tokens[0]->getTermText());
  248. return;
  249. }
  250. //It's not insignificant or one term query
  251. $words = array();
  252. foreach ($tokens as $token) {
  253. $words[] = $token->getTermText();
  254. }
  255. $highlighter->highlight($words);
  256. }
  257. /**
  258. * Print a query
  259. *
  260. * @return string
  261. */
  262. public function __toString()
  263. {
  264. // It's used only for query visualisation, so we don't care about characters escaping
  265. if ($this->_field !== null) {
  266. $query = $this->_field . ':';
  267. } else {
  268. $query = '';
  269. }
  270. $query .= $this->_word;
  271. if ($this->getBoost() != 1) {
  272. $query .= '^' . round($this->getBoost(), 4);
  273. }
  274. return $query;
  275. }
  276. }