Wildcard.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Search
  18. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. */
  21. /** Zend_Search_Lucene_Search_Query */
  22. require_once 'Zend/Search/Lucene/Search/Query.php';
  23. /** Zend_Search_Lucene_Search_Query_MultiTerm */
  24. require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
  /**
   * Wildcard search query.
   *
   * Holds a term pattern whose text may contain '?' (exactly one character) and
   * '*' (any run of characters, including none). The query is never searched
   * directly: rewrite() expands the pattern against the index dictionary into an
   * Empty, Term or MultiTerm query, while optimize(), createWeight(), execute(),
   * matchedDocs() and score() all throw.
   *
   * @category   Zend
   * @package    Zend_Search_Lucene
   * @subpackage Search
   * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
   * @license    http://framework.zend.com/license/new-bsd     New BSD License
   */
  class Zend_Search_Lucene_Search_Query_Wildcard extends Zend_Search_Lucene_Search_Query
  {
      /**
       * Search pattern.
       *
       * Field has to be fully specified or has to be null
       * Text may contain '*' or '?' symbols
       *
       * @var Zend_Search_Lucene_Index_Term
       */
      private $_pattern;

      /**
       * Matched terms.
       *
       * Stays null until rewrite() runs; afterwards it is the list of
       * Zend_Search_Lucene_Index_Term objects that matched the pattern and may
       * be used for search result post-processing.
       *
       * @var array|null
       */
      private $_matches = null;

      /**
       * Minimum term prefix length (number of non-wildcard bytes required
       * before the first wildcard), shared by all wildcard queries.
       *
       * @var integer
       */
      private static $_minPrefixLength = 3;

      /**
       * Zend_Search_Lucene_Search_Query_Wildcard constructor.
       *
       * @param Zend_Search_Lucene_Index_Term $pattern
       */
      public function __construct(Zend_Search_Lucene_Index_Term $pattern)
      {
          $this->_pattern = $pattern;
      }

      /**
       * Get minimum prefix length
       *
       * @return integer
       */
      public static function getMinPrefixLength()
      {
          return self::$_minPrefixLength;
      }

      /**
       * Set minimum prefix length
       *
       * The value is global: it applies to every wildcard query rewritten
       * afterwards.
       *
       * @param integer $minPrefixLength
       */
      public static function setMinPrefixLength($minPrefixLength)
      {
          self::$_minPrefixLength = $minPrefixLength;
      }

      /**
       * Get terms prefix
       *
       * Returns the part of $word that precedes the first '?' or '*', or the
       * whole word when it contains no wildcard.
       *
       * @param string $word
       * @return string
       */
      private static function _getPrefix($word)
      {
          // strcspn() yields the length of the leading run that is free of
          // wildcard characters, i.e. the offset of the first '?' or '*'.
          // The cast covers PHP versions where substr('', 0, 0) returns false.
          return (string) substr($word, 0, strcspn($word, '?*'));
      }

      /**
       * Build the anchored regular expression equivalent to the pattern text.
       *
       * The text is quoted first, then the quoted wildcards are turned back into
       * regex operators: '?' becomes '.', '*' becomes '.*'.
       *
       * @return string
       */
      private function _getMatchExpression()
      {
          $quotedText = preg_quote($this->_pattern->text, '/');
          $expression = '/^'
                      . str_replace(array('\\?', '\\*'), array('.', '.*'), $quotedText)
                      . '$/';

          /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
          // The probe is silenced on purpose: without PCRE Unicode support the
          // test pattern itself fails to compile and raises a warning.
          if (@preg_match('/\pL/u', 'a') == 1) {
              // PCRE unicode support is turned on
              // add Unicode modifier to the match expression
              $expression .= 'u';
          }

          return $expression;
      }

      /**
       * Re-write query into primitive queries in the context of specified index
       *
       * Walks the term dictionary of every candidate field, collects the terms
       * matching the pattern into the matched terms list and returns an Empty,
       * Term or MultiTerm query depending on how many terms were found.
       *
       * @param Zend_Search_Lucene_Interface $index
       * @return Zend_Search_Lucene_Search_Query
       * @throws Zend_Search_Lucene_Exception if the pattern prefix is shorter than the
       *         configured minimum or the terms per query limit is exceeded
       */
      public function rewrite(Zend_Search_Lucene_Interface $index)
      {
          $this->_matches = array();

          if ($this->_pattern->field === null) {
              // Search through all fields
              $fields = $index->getFieldNames(true /* indexed fields list */);
          } else {
              $fields = array($this->_pattern->field);
          }

          $prefix       = self::_getPrefix($this->_pattern->text);
          $prefixLength = strlen($prefix);

          if ($prefixLength < self::$_minPrefixLength) {
              throw new Zend_Search_Lucene_Exception('At least ' . self::$_minPrefixLength . ' non-wildcard characters are required at the beginning of pattern.');
          }

          $matchExpression = $this->_getMatchExpression();
          $maxTerms        = Zend_Search_Lucene::getTermsPerQueryLimit();

          foreach ($fields as $field) {
              // Field names are compared as strings with ===. A loose == treats
              // two numeric strings as numbers ('100' == '1e2'), which would let
              // the scan run on into a neighbouring field.
              $fieldName = (string) $field;

              $index->resetTermsStream();
              $index->skipTo(new Zend_Search_Lucene_Index_Term($prefix, $field));

              // One loop serves both cases: with an empty prefix the strncmp()
              // test is always true and the scan covers the whole field.
              while (($term = $index->currentTerm()) !== null
                     && (string) $term->field === $fieldName
                     && strncmp($term->text, $prefix, $prefixLength) === 0) {
                  if (preg_match($matchExpression, $term->text) === 1) {
                      $this->_matches[] = $term;

                      if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                          // Release the terms stream before bailing out, as the
                          // regular path does at the end of each field.
                          $index->closeTermsStream();
                          throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
                      }
                  }

                  $index->nextTerm();
              }

              $index->closeTermsStream();
          }

          $matchCount = count($this->_matches);

          if ($matchCount == 0) {
              return new Zend_Search_Lucene_Search_Query_Empty();
          }

          if ($matchCount == 1) {
              return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));
          }

          $rewrittenQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();
          foreach ($this->_matches as $matchedTerm) {
              $rewrittenQuery->addTerm($matchedTerm);
          }

          return $rewrittenQuery;
      }

      /**
       * Optimize query in the context of specified index
       *
       * Not supported for wildcard queries; always throws.
       *
       * @param Zend_Search_Lucene_Interface $index
       * @return Zend_Search_Lucene_Search_Query
       * @throws Zend_Search_Lucene_Exception
       */
      public function optimize(Zend_Search_Lucene_Interface $index)
      {
          throw new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
      }

      /**
       * Returns query pattern
       *
       * @return Zend_Search_Lucene_Index_Term
       */
      public function getPattern()
      {
          return $this->_pattern;
      }

      /**
       * Return query terms
       *
       * Returns the terms collected by the last rewrite() call.
       *
       * @return array
       * @throws Zend_Search_Lucene_Exception if rewrite() has not been run yet
       */
      public function getQueryTerms()
      {
          if ($this->_matches === null) {
              throw new Zend_Search_Lucene_Exception('Search has to be performed first to get matched terms');
          }

          return $this->_matches;
      }

      /**
       * Constructs an appropriate Weight implementation for this query.
       *
       * Not supported for wildcard queries; always throws.
       *
       * @param Zend_Search_Lucene_Interface $reader
       * @return Zend_Search_Lucene_Search_Weight
       * @throws Zend_Search_Lucene_Exception
       */
      public function createWeight(Zend_Search_Lucene_Interface $reader)
      {
          throw new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
      }

      /**
       * Execute query in context of index reader
       *
       * Not supported for wildcard queries; always throws.
       *
       * @param Zend_Search_Lucene_Interface $reader
       * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
       * @throws Zend_Search_Lucene_Exception
       */
      public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
      {
          throw new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
      }

      /**
       * Get document ids likely matching the query
       *
       * Not supported for wildcard queries; always throws.
       *
       * @return array
       * @throws Zend_Search_Lucene_Exception
       */
      public function matchedDocs()
      {
          throw new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
      }

      /**
       * Score specified document
       *
       * Not supported for wildcard queries; always throws.
       *
       * @param integer $docId
       * @param Zend_Search_Lucene_Interface $reader
       * @return float
       * @throws Zend_Search_Lucene_Exception
       */
      public function score($docId, Zend_Search_Lucene_Interface $reader)
      {
          throw new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
      }

      /**
       * Query specific matches highlighting
       *
       * Tokenizes the 'body' field of the highlighter's document with the
       * default analyzer and passes every token text matching the pattern to
       * the highlighter.
       *
       * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
       */
      protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
      {
          $words           = array();
          $matchExpression = $this->_getMatchExpression();

          $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
          $tokens  = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($docBody, 'UTF-8');

          foreach ($tokens as $token) {
              $termText = $token->getTermText();

              if (preg_match($matchExpression, $termText) === 1) {
                  $words[] = $termText;
              }
          }

          $highlighter->highlight($words);
      }

      /**
       * Print a query
       *
       * Produces "[field:]text[^boost]"; the boost suffix is added only when
       * the boost differs from 1.
       *
       * @return string
       */
      public function __toString()
      {
          // It's used only for query visualisation, so we don't care about characters escaping
          if ($this->_pattern->field !== null) {
              $query = $this->_pattern->field . ':';
          } else {
              $query = '';
          }

          $query .= $this->_pattern->text;

          if ($this->getBoost() != 1) {
              $query = $query . '^' . round($this->getBoost(), 4);
          }

          return $query;
      }
  }