Utf8.php 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Analysis
  18. * @copyright Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id$
  21. */
  22. /** Zend_Search_Lucene_Analysis_Analyzer_Common */
  23. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
  24. /**
  25. * @category Zend
  26. * @package Zend_Search_Lucene
  27. * @subpackage Analysis
  28. * @copyright Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
  29. * @license http://framework.zend.com/license/new-bsd New BSD License
  30. */
  class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 extends Zend_Search_Lucene_Analysis_Analyzer_Common
  {
      /**
       * Current char position in an UTF-8 stream
       *
       * @var integer
       */
      private $_position;

      /**
       * Current binary (byte) position in an UTF-8 stream
       *
       * @var integer
       */
      private $_bytePosition;

      /**
       * Object constructor
       *
       * Probes PCRE with a Unicode property pattern and refuses to construct
       * the analyzer when the probe does not match.
       *
       * @throws Zend_Search_Lucene_Exception
       */
      public function __construct()
      {
          // The probe is silenced with '@' so that a failing probe is reported
          // only through the exception below. preg_match() yields 1 on a match,
          // 0 on no match and false on error; anything but 1 means no support.
          if (@preg_match('/\pL/u', 'a') !== 1) {
              // PCRE unicode support is turned off
              require_once 'Zend/Search/Lucene/Exception.php';
              throw new Zend_Search_Lucene_Exception('Utf8 analyzer needs PCRE unicode support to be enabled.');
          }
      }

      /**
       * Reset token stream
       *
       * Rewinds both stream positions and, unless the input is already
       * declared as UTF-8, converts it to UTF-8 with iconv().
       *
       * $_input and $_encoding are not declared in this class; they are
       * presumably inherited from the parent analyzer (verify there).
       */
      public function reset()
      {
          $this->_position     = 0;
          $this->_bytePosition = 0;

          $isUtf8 = strcasecmp($this->_encoding, 'utf8') == 0
                 || strcasecmp($this->_encoding, 'utf-8') == 0;
          if ($isUtf8) {
              return;
          }

          // convert input into UTF-8
          if ($this->_input !== null) {
              // A null input is left alone so that nextToken() still sees the
              // "no input" marker instead of an empty string.
              $converted = iconv($this->_encoding, 'UTF-8', $this->_input);

              // iconv() returns false when the conversion fails. Keep $_input a
              // string in that case: an empty stream produces no tokens, which
              // is what the unconverted boolean used to yield through implicit
              // coercion.
              $this->_input = ($converted === false) ? '' : $converted;
          }
          $this->_encoding = 'UTF-8';
      }

      /**
       * Tokenization stream API
       * Get next token
       * Returns null at the end of stream
       *
       * Scans forward from the current byte position for the next run of
       * Unicode letters, wraps it in a token carrying character (not byte)
       * offsets and passes it through normalize(). When normalize() returns
       * null the token is skipped and the scan continues.
       *
       * @return Zend_Search_Lucene_Analysis_Token|null
       */
      public function nextToken()
      {
          if ($this->_input === null) {
              return null;
          }

          do {
              if (! preg_match('/[\p{L}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) {
                  // It covers both cases a) there are no matches (preg_match(...) === 0)
                  // b) error occurred (preg_match(...) === FALSE)
                  return null;
              }

              // matched string
              $matchedWord = $match[0][0];

              // binary position of the matched word in the input stream
              // (PREG_OFFSET_CAPTURE reports byte offsets)
              $binStartPos = $match[0][1];

              // character position of the matched word in the input stream:
              // previous character position plus the number of characters
              // skipped between the previous match and this one
              $startPos = $this->_position +
                          iconv_strlen(substr($this->_input,
                                              $this->_bytePosition,
                                              $binStartPos - $this->_bytePosition),
                                       'UTF-8');

              // character position of the end of matched word in the input stream
              $endPos = $startPos + iconv_strlen($matchedWord, 'UTF-8');

              // advance both cursors past the matched word
              $this->_bytePosition = $binStartPos + strlen($matchedWord);
              $this->_position     = $endPos;

              $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($matchedWord, $startPos, $endPos));
          } while ($token === null); // try again if token is skipped

          return $token;
      }
  }