Zend_Search_Lucene-Extending.xml 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362
  1. <sect1 id="zend.search.lucene.extending">
  2. <title>扩展性</title>
  3. <sect2 id="zend.search.lucene.extending.analysis">
  4. <title>文本分析</title>
  5. <para>
  6. <code>Zend_Search_Lucene_Analysis_Analyzer</code> 类被索引建立程序用于记号化文档的文本字段。
  7. </para>
  8. <para>
  9. <code>Zend_Search_Lucene_Analysis_Analyzer::getDefault()</code> 方法和 <code>Zend_Search_Lucene_Analysis_Analyzer::setDefault()</code> 方法用于获取和设置默认的分析程序。
  10. </para>
  11. <para>
  12. 因此你可以使用你自己的文本分析程序或者从预设的分析程序中选择一个:
  13. <code>Zend_Search_Lucene_Analysis_Analyzer_Common_Text</code> 和 <code>Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive</code>(缺省的)。两者都把记号解释为一个字母序列。
  14. <code>Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive</code>将记号转化为小写。
  15. </para>
  16. <para>
  17. 使用下面代码更换分析程序:
  18. </para>
  19. <programlisting role="php"><![CDATA[<?php
// Install Common_Text as the default analyzer. This is done before
// addDocument() is called on the index below.
Zend_Search_Lucene_Analysis_Analyzer::setDefault(
new Zend_Search_Lucene_Analysis_Analyzer_Common_Text());
...
$index->addDocument($doc);
  24. ?>]]></programlisting>
  25. <para>
  26. <code>Zend_Search_Lucene_Analysis_Analyzer_Common</code> 类被设计为所有用户自定义分析程序的父类。用户只需要定义
  27. <code>tokenize()</code> 方法,它接收字符串形式的输入数据并返回记号数组。
  28. </para>
  29. <para>
  30. 方法 <code>tokenize()</code> 应该针对所有记号应用方法 <code>normalize()</code>。这样可以在你的分析程序中允许使用记号过滤。
  31. </para>
  32. <para>
  33. 这里是一个自定义分析程序的例子,它把带有数字的单词也作为搜索项:
  34. <example>
  35. <title>自定义文本分析程序</title>
  36. <programlisting role="php"><![CDATA[<?php
  37. /** Here is a custome text analyser, which treats words with digits as one term */
  38. /** Zend_Search_Lucene_Analysis_Analyzer_Common */
  39. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
  40. class My_Analyzer extends Zend_Search_Lucene_Analysis_Analyzer_Common
  41. {
  42. /**
  43. * Tokenize text to a terms
  44. * Returns array of Zend_Search_Lucene_Analysis_Token objects
  45. *
  46. * @param string $data
  47. * @return array
  48. */
  49. public function tokenize($data)
  50. {
  51. $tokenStream = array();
  52. $position = 0;
  53. while ($position < strlen($data)) {
  54. // skip white space
  55. while ($position < strlen($data) && !ctype_alpha($data{$position}) && !ctype_digit($data{$position})) {
  56. $position++;
  57. }
  58. $termStartPosition = $position;
  59. // read token
  60. while ($position < strlen($data) && (ctype_alpha($data{$position}) || ctype_digit($data{$position}))) {
  61. $position++;
  62. }
  63. // Empty token, end of stream.
  64. if ($position == $termStartPosition) {
  65. break;
  66. }
  67. $token = new Zend_Search_Lucene_Analysis_Token(substr($data,
  68. $termStartPosition,
  69. $position-$termStartPosition),
  70. $termStartPosition,
  71. $position);
  72. $tokenStream[] = $this->normalize($token);
  73. }
  74. return $tokenStream;
  75. }
  76. }
  77. Zend_Search_Lucene_Analysis_Analyzer::setDefault(
  78. new My_Analyzer());
  79. ?>]]></programlisting>
  80. </example>
  81. </para>
  82. </sect2>
  83. <sect2 id="zend.search.lucene.extending.scoring">
  84. <title>评分算法</title>
  85. <para>
  86. 查询 <literal>q</literal> 在文档 <literal>d</literal> 中的分值 score 定义如下:
  87. </para>
  88. <para>
  89. <code>score(q,d) = sum( tf(t in d) * idf(t) * getBoost(t.field in d) * lengthNorm(t.field in d) ) *
  90. coord(q,d) * queryNorm(q)</code>
  91. </para>
  92. <para>
  93. tf(t in d) - <code>Zend_Search_Lucene_Search_Similarity::tf($freq)</code> - 基于搜索项或者短语在文档中出现次数的分值因子。
  94. </para>
  95. <para>
  96. idf(t) - <code>Zend_Search_Lucene_Search_Similarity::idf($term, $reader)</code> - 针对特定索引的简单搜索项的分值因子。
  97. </para>
  98. <para>
  99. getBoost(t.field in d) - 针对搜索项字段的增益因子。
  100. </para>
  101. <para>
  102. lengthNorm($term) - 对一个给定字段,其中包含的搜索项的总数的标准值。这个值保存在索引中。这些值和字段增益一起,保存在索引中,通过搜索代码和每一个搜索结果的每一个字段的分值相乘。
  103. </para>
  104. <para>
  105. 匹配较长的字段精度较低,所以这个实现方法通常在 numTokens 较大时返回较小的分值,而在 numTokens 较小时返回较大的分值。
  106. </para>
  107. <para>
  108. coord(q,d) - <code>Zend_Search_Lucene_Search_Similarity::coord($overlap, $maxOverlap)</code> - 基于文档所包含的查询搜索项占全部查询搜索项的比例的分值因子。
  109. </para>
  110. <para>
  111. 出现大部分的查询搜索项表示更好的匹配查询,所以这个实现方法通常当这些参数的比率较大时返回较大的分值,而这些比率较小时返回较小的分值。
  112. </para>
  113. <para>
  114. queryNorm(q) - 根据各查询搜索项权重的平方和计算出的、针对给定查询的标准化值。这个值用于和每一个查询搜索项的权重相乘。
  115. </para>
  116. <para>
  117. 这不会影响结果的排序,而仅仅是尝试使不同查询的评分可以相互比较。
  118. </para>
  119. <para>
  120. 你可以通过自定义 Similarity 类来定制评分算法。可以按照下面的定义来扩展 Zend_Search_Lucene_Search_Similarity 类,然后使用
  121. <code>Zend_Search_Lucene_Search_Similarity::setDefault($similarity);</code> 方法来将其设置为缺省的评分算法。
  122. </para>
  123. <programlisting role="php"><![CDATA[<?php
  124. class MySimilarity extends Zend_Search_Lucene_Search_Similarity {
  125. public function lengthNorm($fieldName, $numTerms) {
  126. return 1.0/sqrt($numTerms);
  127. }
  128. public function queryNorm($sumOfSquaredWeights) {
  129. return 1.0/sqrt($sumOfSquaredWeights);
  130. }
  131. public function tf($freq) {
  132. return sqrt($freq);
  133. }
  134. /**
  135. * It's not used now. Computes the amount of a sloppy phrase match,
  136. * based on an edit distance.
  137. */
  138. public function sloppyFreq($distance) {
  139. return 1.0;
  140. }
  141. public function idfFreq($docFreq, $numDocs) {
  142. return log($numDocs/(float)($docFreq+1)) + 1.0;
  143. }
  144. public function coord($overlap, $maxOverlap) {
  145. return $overlap/(float)$maxOverlap;
  146. }
  147. }
  148. $mySimilarity = new MySimilarity();
  149. Zend_Search_Lucene_Search_Similarity::setDefault($mySimilarity);
  150. ?>]]></programlisting>
  151. </sect2>
  152. <sect2 id="zend.search.lucene.extending.storage">
  153. <title>存储容器</title>
  154. <para>
  155. 抽象类 Zend_Search_Lucene_Storage_Directory 定义了目录功能。
  156. </para>
  157. <para>
  158. Zend_Search_Lucene 构造方法使用字符串或者 Zend_Search_Lucene_Storage_Directory 对象作为输入。
  159. </para>
  160. <para>
  161. Zend_Search_Lucene_Storage_Directory_Filesystem 类实现了针对文件系统的目录功能。
  162. </para>
  163. <para>
  164. 如果字符串被用于 Zend_Search_Lucene 构造方法的输入,那么索引阅读程序(Zend_Search_Lucene 对象)认为它是一个文件系统路径并自行实例化 Zend_Search_Lucene_Storage_Directory_Filesystem 对象。
  165. </para>
  166. <para>
  167. 你可以通过扩展 Zend_Search_Lucene_Storage_Directory 类定义自己的目录实现。
  168. </para>
  169. <para>
  170. Zend_Search_Lucene_Storage_Directory 的方法:
  171. <programlisting><![CDATA[<?php
  172. abstract class Zend_Search_Lucene_Storage_Directory {
  173. /**
  174. * Closes the store.
  175. *
  176. * @return void
  177. */
  178. abstract function close();
  179. /**
  180. * Creates a new, empty file in the directory with the given $filename.
  181. *
  182. * @param string $name
  183. * @return void
  184. */
  185. abstract function createFile($filename);
  186. /**
  187. * Removes an existing $filename in the directory.
  188. *
  189. * @param string $filename
  190. * @return void
  191. */
  192. abstract function deleteFile($filename);
  193. /**
  194. * Returns true if a file with the given $filename exists.
  195. *
  196. * @param string $filename
  197. * @return boolean
  198. */
  199. abstract function fileExists($filename);
  200. /**
  201. * Returns the length of a $filename in the directory.
  202. *
  203. * @param string $filename
  204. * @return integer
  205. */
  206. abstract function fileLength($filename);
  207. /**
  208. * Returns the UNIX timestamp $filename was last modified.
  209. *
  210. * @param string $filename
  211. * @return integer
  212. */
  213. abstract function fileModified($filename);
  214. /**
  215. * Renames an existing file in the directory.
  216. *
  217. * @param string $from
  218. * @param string $to
  219. * @return void
  220. */
  221. abstract function renameFile($from, $to);
  222. /**
  223. * Sets the modified time of $filename to now.
  224. *
  225. * @param string $filename
  226. * @return void
  227. */
  228. abstract function touchFile($filename);
  229. /**
  230. * Returns a Zend_Search_Lucene_Storage_File object for a given $filename in the directory.
  231. *
  232. * @param string $filename
  233. * @return Zend_Search_Lucene_Storage_File
  234. */
  235. abstract function getFileObject($filename);
  236. }
  237. ?>]]></programlisting>
  238. </para>
  239. <para>
  240. Zend_Search_Lucene_Storage_Directory 类的 <code>getFileObject($filename)</code> 方法返回 Zend_Search_Lucene_Storage_File 对象。
  241. </para>
  242. <para>
  243. Zend_Search_Lucene_Storage_File 抽象类实现了文件抽象和索引文件读取原语。
  244. </para>
  245. <para>
  246. 你还必须扩展 Zend_Search_Lucene_Storage_File 类以建立自己的目录实现。
  247. </para>
  248. <para>
  249. Zend_Search_Lucene_Storage_File 类中只有两个方法是你必须重载的:
  250. <programlisting><![CDATA[<?php
/**
 * Skeleton of a custom storage file implementation.
 *
 * Shows the two methods a Zend_Search_Lucene_Storage_File subclass has to
 * override; the method bodies are elided ("...") and must be supplied by
 * the implementer.
 */
class MyFile extends Zend_Search_Lucene_Storage_File {
/**
* Sets the file position indicator and advances the file pointer.
* The new position, measured in bytes from the beginning of the file,
* is obtained by adding offset to the position specified by whence,
* whose values are defined as follows:
* SEEK_SET - Set position equal to offset bytes.
* SEEK_CUR - Set position to current location plus offset.
* SEEK_END - Set position to end-of-file plus offset. (To move to
* a position before the end-of-file, you need to pass a negative value
* in offset.)
* Upon success, returns 0; otherwise, returns -1
*
* @param integer $offset offset in bytes, interpreted relative to $whence
* @param integer $whence one of SEEK_SET (default), SEEK_CUR or SEEK_END
* @return integer 0 on success, -1 on failure
*/
public function seek($offset, $whence=SEEK_SET) {
...
}
/**
* Reads $length bytes from the file and advances the file pointer.
*
* @param integer $length number of bytes to read (defaults to 1)
* @return string
*/
protected function _fread($length=1) {
...
}
}
  281. ?>]]></programlisting>
  282. </para>
  283. </sect2>
  284. </sect1>
  285. <!--
  286. vim:se ts=4 sw=4 et:
  287. -->