Analyzer.php 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Analysis
  18. * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. */
  21. /** Zend_Search_Lucene_Analysis_Token */
  22. require_once 'Zend/Search/Lucene/Analysis/Token.php';
  23. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
  24. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
  25. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive */
  26. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php';
  27. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
  28. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
  29. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive */
  30. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php';
  31. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
  32. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
  33. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
  34. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
  35. /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
  36. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
  37. /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
  38. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
  39. /** Zend_Search_Lucene_Analysis_TokenFilter_StopWords */
  40. require_once 'Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php';
  41. /** Zend_Search_Lucene_Analysis_TokenFilter_ShortWords */
  42. require_once 'Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php';
  43. /**
  44. * An Analyzer is used to analyze text.
  45. * It thus represents a policy for extracting index terms from text.
  46. *
  47. * Note:
  48. * The Java implementation of Lucene is stream-oriented, which lets it work
  49. * efficiently with huge documents (more than 20 MB).
  50. * The engine itself, however, is not designed for such documents.
  51. * Thus the Zend_Search_Lucene analysis API works with data strings and sets (arrays).
  52. *
  53. * @category Zend
  54. * @package Zend_Search_Lucene
  55. * @subpackage Analysis
  56. * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
  57. * @license http://framework.zend.com/license/new-bsd New BSD License
  58. */
  abstract class Zend_Search_Lucene_Analysis_Analyzer
  {
      /**
       * Analyzer instance handed out by getDefault().
       *
       * Stays unset until setDefault() is called or getDefault() creates
       * the fallback instance on first use.
       *
       * @var Zend_Search_Lucene_Analysis_Analyzer
       */
      private static $_defaultImpl;

      /**
       * Text currently being tokenized (assigned by setInput()).
       *
       * @var string
       */
      protected $_input = null;

      /**
       * Encoding of the current input text (assigned by setInput()).
       *
       * @var string
       */
      protected $_encoding = '';

      /**
       * Tokenize a whole string in one call.
       *
       * Loads the text through setInput() and then collects everything
       * nextToken() produces until it reports the end of the stream (null).
       *
       * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
       *
       * @param string $data     Text to split into tokens
       * @param string $encoding Encoding of $data (empty string by default)
       * @return array Zend_Search_Lucene_Analysis_Token objects, in stream order
       */
      public function tokenize($data, $encoding = '')
      {
          $this->setInput($data, $encoding);

          $collected = array();
          for ($token = $this->nextToken(); $token !== null; $token = $this->nextToken()) {
              $collected[] = $token;
          }

          return $collected;
      }

      /**
       * Tokenization stream API: supply the text to be tokenized.
       *
       * Stores the text and its encoding, then calls reset() so the
       * stream restarts for the new input.
       *
       * @param string $data     Text to split into tokens
       * @param string $encoding Encoding of $data (empty string by default)
       */
      public function setInput($data, $encoding = '')
      {
          $this->_encoding = $encoding;
          $this->_input    = $data;

          // reset() runs last so implementations see the new input and encoding.
          $this->reset();
      }

      /**
       * Tokenization stream API: reset the token stream.
       *
       * Called by setInput() after the input has been stored.
       */
      abstract public function reset();

      /**
       * Tokenization stream API: fetch the next token.
       *
       * Returns null at the end of stream.
       *
       * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
       *
       * @return Zend_Search_Lucene_Analysis_Token|null
       */
      abstract public function nextToken();

      /**
       * Set the default Analyzer implementation used by indexing code.
       *
       * @param Zend_Search_Lucene_Analysis_Analyzer $analyzer Instance that getDefault() will return
       */
      public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
      {
          self::$_defaultImpl = $analyzer;
      }

      /**
       * Return the default Analyzer implementation used by indexing code.
       *
       * If no analyzer has been registered yet, a
       * Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive
       * instance is created, remembered and returned.
       *
       * @return Zend_Search_Lucene_Analysis_Analyzer
       */
      public static function getDefault()
      {
          if (self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) {
              return self::$_defaultImpl;
          }

          self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();

          return self::$_defaultImpl;
      }
  }