Analyzer.php 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Analysis
  18. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id$
  21. */
  22. /** Zend_Search_Lucene_Analysis_Token */
  23. require_once 'Zend/Search/Lucene/Analysis/Token.php';
  24. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
  25. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
  26. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive */
  27. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php';
  28. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
  29. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
  30. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive */
  31. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php';
  32. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
  33. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
  34. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
  35. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
  36. /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
  37. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
  38. /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
  39. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
  40. /** Zend_Search_Lucene_Analysis_TokenFilter_StopWords */
  41. require_once 'Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php';
  42. /** Zend_Search_Lucene_Analysis_TokenFilter_ShortWords */
  43. require_once 'Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php';
  44. /**
  45. * An Analyzer is used to analyze text.
  46. * It thus represents a policy for extracting index terms from text.
  47. *
  48. * Note:
  49. * The Lucene Java implementation is stream-oriented, which lets it work efficiently
  50. * with huge documents (more than 20Mb).
  51. * But the engine itself is not oriented toward such documents.
  52. * Thus the Zend_Search_Lucene analysis API works with data strings and sets (arrays).
  53. *
  54. * @category Zend
  55. * @package Zend_Search_Lucene
  56. * @subpackage Analysis
  57. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  58. * @license http://framework.zend.com/license/new-bsd New BSD License
  59. */
  abstract class Zend_Search_Lucene_Analysis_Analyzer
  {
      /**
       * The Analyzer implementation used by default.
       *
       * Assigned by setDefault(), or created lazily by getDefault() when no
       * analyzer instance has been stored yet.
       *
       * @var Zend_Search_Lucene_Analysis_Analyzer
       */
      private static $_defaultImpl;

      /**
       * Input string
       *
       * Written by setInput(); null until the first call.
       *
       * @var string
       */
      protected $_input = null;

      /**
       * Input string encoding
       *
       * Written by setInput(); the empty string until an encoding is supplied.
       *
       * @var string
       */
      protected $_encoding = '';

      /**
       * Tokenize text to terms
       * Returns array of Zend_Search_Lucene_Analysis_Token objects
       *
       * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
       *
       * Convenience wrapper over the stream API: stores the input through
       * setInput() and then collects every value produced by nextToken().
       *
       * @param string $data     Text to tokenize
       * @param string $encoding Encoding of $data, passed through to setInput()
       * @return array Tokens in the order nextToken() produced them; empty if
       *               the very first nextToken() call returns null
       */
      public function tokenize($data, $encoding = '')
      {
          $this->setInput($data, $encoding);

          $tokenList = array();

          // Strict comparison: only an exact null marks the end of the stream,
          // so any other falsy value returned by nextToken() is still collected.
          while (($nextToken = $this->nextToken()) !== null) {
              $tokenList[] = $nextToken;
          }

          return $tokenList;
      }

      /**
       * Tokenization stream API
       * Set input
       *
       * Stores the text and its encoding on the instance, then calls reset()
       * so the concrete analyzer can restart its token stream.
       *
       * @param string $data     Text to be tokenized
       * @param string $encoding Encoding of $data
       */
      public function setInput($data, $encoding = '')
      {
          $this->_input    = $data;
          $this->_encoding = $encoding;

          // reset() runs last so implementations see the new input and encoding.
          $this->reset();
      }

      /**
       * Reset token stream
       *
       * Called by setInput() after the new input and encoding are stored.
       */
      abstract public function reset();

      /**
       * Tokenization stream API
       * Get next token
       * Returns null at the end of stream
       *
       * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
       *
       * @return Zend_Search_Lucene_Analysis_Token|null
       */
      abstract public function nextToken();

      /**
       * Set the default Analyzer implementation used by indexing code.
       *
       * @param Zend_Search_Lucene_Analysis_Analyzer $analyzer
       */
      public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
      {
          self::$_defaultImpl = $analyzer;
      }

      /**
       * Return the default Analyzer implementation used by indexing code.
       *
       * When no analyzer instance is stored yet, a
       * Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive is
       * created, stored and returned.
       *
       * @return Zend_Search_Lucene_Analysis_Analyzer
       */
      public static function getDefault()
      {
          // Parentheses only spell out the existing precedence
          // (instanceof binds tighter than !).
          if (!(self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer)) {
              self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
          }

          return self::$_defaultImpl;
      }
  }