Utf8.php 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Analysis
  18. * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. */
  21. /** Zend_Search_Lucene_Analysis_Analyzer_Common */
  22. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
  23. /**
  24. * @category Zend
  25. * @package Zend_Search_Lucene
  26. * @subpackage Analysis
  27. * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
  28. * @license http://framework.zend.com/license/new-bsd New BSD License
  29. */
  /**
   * Analyzer that splits a UTF-8 input stream into tokens made of
   * consecutive Unicode letters (PCRE class \p{L}).
   *
   * Input in any other encoding is converted to UTF-8 by reset() before
   * tokenization starts.
   *
   * @category Zend
   * @package Zend_Search_Lucene
   * @subpackage Analysis
   * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
   * @license http://framework.zend.com/license/new-bsd New BSD License
   */
  class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 extends Zend_Search_Lucene_Analysis_Analyzer_Common
  {
      /**
       * Current character position in the UTF-8 stream
       *
       * @var integer
       */
      private $_position;

      /**
       * Current binary (byte) position in the UTF-8 stream
       *
       * @var integer
       */
      private $_bytePosition;

      /**
       * Object constructor
       *
       * Probes PCRE with a Unicode-property pattern and refuses to construct
       * the analyzer when the probe does not match.
       *
       * @throws Zend_Search_Lucene_Exception if the Unicode probe does not match
       */
      public function __construct()
      {
          // The probe's own diagnostics are suppressed with '@'; any result
          // other than a successful single match is reported via the exception.
          if (@preg_match('/\pL/u', 'a') !== 1) {
              // PCRE unicode support is turned off
              require_once 'Zend/Search/Lucene/Exception.php';
              throw new Zend_Search_Lucene_Exception('Utf8 analyzer needs PCRE unicode support to be enabled.');
          }
      }

      /**
       * Reset token stream
       *
       * Rewinds both position counters and, unless the declared encoding is
       * already 'utf8' / 'utf-8' (case-insensitive), converts the input to
       * UTF-8 and records 'UTF-8' as the current encoding.
       *
       * A null input is left untouched, and a failed conversion leaves the
       * input as null, so nextToken() reports an empty stream in both cases.
       */
      public function reset()
      {
          $this->_position     = 0;
          $this->_bytePosition = 0;

          // convert input into UTF-8
          if (strcasecmp($this->_encoding, 'utf8')  !== 0 &&
              strcasecmp($this->_encoding, 'utf-8') !== 0) {
              if ($this->_input !== null) {
                  $converted = iconv($this->_encoding, 'UTF-8', $this->_input);

                  // iconv() returns false on failure. Storing false would slip
                  // past the null check in nextToken(), so map it to null.
                  $this->_input = ($converted === false) ? null : $converted;
              }

              $this->_encoding = 'UTF-8';
          }
      }

      /**
       * Tokenization stream API
       * Get next token
       * Returns null at the end of stream
       *
       * Token offsets passed to the token constructor are character
       * positions; the byte position is tracked separately because
       * preg_match() offsets are expressed in bytes.
       *
       * @return Zend_Search_Lucene_Analysis_Token|null
       */
      public function nextToken()
      {
          if ($this->_input === null) {
              return null;
          }

          do {
              if (! preg_match('/[\p{L}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) {
                  // It covers both cases a) there are no matches (preg_match(...) === 0)
                  // b) error occurred (preg_match(...) === FALSE)
                  return null;
              }

              // matched string
              $matchedWord = $match[0][0];

              // binary position of the matched word in the input stream
              $binStartPos = $match[0][1];

              // character position of the matched word in the input stream:
              // previous character position plus the number of characters
              // skipped between the previous byte position and the match
              $skipped  = substr($this->_input, $this->_bytePosition, $binStartPos - $this->_bytePosition);
              $startPos = $this->_position + iconv_strlen($skipped, 'UTF-8');

              // character position of the end of matched word in the input stream
              $endPos = $startPos + iconv_strlen($matchedWord, 'UTF-8');

              // advance both cursors past the matched word
              $this->_bytePosition = $binStartPos + strlen($matchedWord);
              $this->_position     = $endPos;

              // normalize() is defined outside this class; a null result
              // means the token was skipped
              $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($matchedWord, $startPos, $endPos));
          } while ($token === null); // try again if token is skipped

          return $token;
      }
  }