# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.compat import text_type
from whoosh.compat import xrange
from whoosh.analysis.acore import Token
from whoosh.analysis.filters import Filter, LowercaseFilter
from whoosh.analysis.tokenizers import Tokenizer, RegexTokenizer


# Tokenizer

class NgramTokenizer(Tokenizer):
    """Splits input text into N-grams instead of words.

    >>> ngt = NgramTokenizer(4)
    >>> [token.text for token in ngt("hi there")]
    ["hi t", "i th", " the", "ther", "here"]

    Note that this tokenizer does NOT use a regular expression to extract
    words, so the grams emitted by it will contain whitespace, punctuation,
    etc. You may want to massage the input or add a custom filter to this
    tokenizer's output.

    Alternatively, if you only want sub-word grams without whitespace, you
    could combine a RegexTokenizer with NgramFilter instead.
    """

    __inittypes__ = dict(minsize=int, maxsize=int)

    def __init__(self, minsize, maxsize=None):
        """
        :param minsize: The minimum size of the N-grams.
        :param maxsize: The maximum size of the N-grams. If you omit this
            parameter, maxsize == minsize.
        """

        self.min = minsize
        self.max = maxsize or minsize

    def __eq__(self, other):
        if self.__class__ is other.__class__:
            if self.min == other.min and self.max == other.max:
                return True
        return False

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, mode='',
                 **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value

        inlen = len(value)
        t = Token(positions, chars, removestops=removestops, mode=mode)
        pos = start_pos

        if mode == "query":
            # In query mode, emit grams of a single size (the largest that
            # fits) instead of every size between self.min and self.max
            size = min(self.max, inlen)
            for start in xrange(0, inlen - size + 1):
                end = start + size
                if end > inlen:
                    continue
                t.text = value[start:end]
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
                pos += 1
        else:
            # In indexing mode, emit every gram of every size from self.min
            # to self.max at each starting position
            for start in xrange(0, inlen - self.min + 1):
                for size in xrange(self.min, self.max + 1):
                    end = start + size
                    if end > inlen:
                        continue
                    t.text = value[start:end]
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end
                    yield t
                    pos += 1
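
# Illustrative sketch (added for exposition; not part of the original
# module): in "query" mode NgramTokenizer emits grams of only one size,
# min(maxsize, len(value)), so a query term maps to a small set of grams:
#
#   >>> ngt = NgramTokenizer(2, 4)
#   >>> [t.text for t in ngt(u"hello", mode="query")]
#   ["hell", "ello"]
#
# In the default (indexing) mode the same tokenizer would also emit the
# 2- and 3-grams at every position ("he", "hel", "el", "ell", and so on).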


# Filter

class NgramFilter(Filter):
    """Splits token text into N-grams.

    >>> rext = RegexTokenizer()
    >>> stream = rext("hello there")
    >>> ngf = NgramFilter(4)
    >>> [token.text for token in ngf(stream)]
    ["hell", "ello", "ther", "here"]
    """

    __inittypes__ = dict(minsize=int, maxsize=int)

    def __init__(self, minsize, maxsize=None, at=None):
        """
        :param minsize: The minimum size of the N-grams.
        :param maxsize: The maximum size of the N-grams. If you omit this
            parameter, maxsize == minsize.
        :param at: If 'start', only take N-grams from the start of each word.
            If 'end', only take N-grams from the end of each word. Otherwise,
            take all N-grams from the word (the default).
        """

        self.min = minsize
        self.max = maxsize or minsize
        self.at = 0
        if at == "start":
            self.at = -1
        elif at == "end":
            self.at = 1

    def __eq__(self, other):
        # Compare `at` as well, since it changes the filter's output
        return (other
                and self.__class__ is other.__class__
                and self.min == other.min
                and self.max == other.max
                and self.at == other.at)
    def __call__(self, tokens):
        assert hasattr(tokens, "__iter__")
        at = self.at
        for t in tokens:
            text = t.text
            if len(text) < self.min:
                continue

            chars = t.chars
            if chars:
                startchar = t.startchar
            # Token positions don't mean much for N-grams, so we'll leave
            # the token's original position untouched.

            if t.mode == "query":
                size = min(self.max, len(t.text))
                if at == -1:
                    # Take the single gram at the start of the word
                    t.text = text[:size]
                    if chars:
                        t.endchar = startchar + size
                    yield t
                elif at == 1:
                    # Take the single gram at the end of the word
                    t.text = text[0 - size:]
                    if chars:
                        t.startchar = t.endchar - size
                    yield t
                else:
                    # Take every gram of the query size
                    for start in xrange(0, len(text) - size + 1):
                        t.text = text[start:start + size]
                        if chars:
                            t.startchar = startchar + start
                            t.endchar = startchar + start + size
                        yield t
            else:
                if at == -1:
                    # Grams of every size, anchored at the start of the word
                    limit = min(self.max, len(text))
                    for size in xrange(self.min, limit + 1):
                        t.text = text[:size]
                        if chars:
                            t.endchar = startchar + size
                        yield t
                elif at == 1:
                    # Grams of every size, anchored at the end of the word
                    if chars:
                        original_startchar = t.startchar
                    start = max(0, len(text) - self.max)
                    for i in xrange(start, len(text) - self.min + 1):
                        t.text = text[i:]
                        if chars:
                            t.startchar = original_startchar + i
                        yield t
                else:
                    # All grams of all sizes at every position
                    for start in xrange(0, len(text) - self.min + 1):
                        for size in xrange(self.min, self.max + 1):
                            end = start + size
                            if end > len(text):
                                continue
                            t.text = text[start:end]
                            if chars:
                                t.startchar = startchar + start
                                t.endchar = startchar + end
                            yield t
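
# Illustrative sketch (added for exposition; not part of the original
# module): the `at` parameter restricts which grams are taken from each
# word. With a word tokenizer in front of the filter:
#
#   >>> ana = RegexTokenizer() | NgramFilter(3, at="start")
#   >>> [t.text for t in ana(u"hello there")]
#   ["hel", "the"]
#
#   >>> ana = RegexTokenizer() | NgramFilter(3, at="end")
#   >>> [t.text for t in ana(u"hello there")]
#   ["llo", "ere"]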


# Analyzers

def NgramAnalyzer(minsize, maxsize=None):
    """Composes an NgramTokenizer and a LowercaseFilter.

    >>> ana = NgramAnalyzer(4)
    >>> [token.text for token in ana("hi there")]
    ["hi t", "i th", " the", "ther", "here"]
    """

    return NgramTokenizer(minsize, maxsize=maxsize) | LowercaseFilter()


def NgramWordAnalyzer(minsize, maxsize=None, tokenizer=None, at=None):
    """Composes a word tokenizer (RegexTokenizer by default), a
    LowercaseFilter, and an NgramFilter, so grams are taken from individual
    words rather than from the raw input text (and so never contain
    whitespace).

    :param minsize: The minimum size of the N-grams.
    :param maxsize: The maximum size of the N-grams. If you omit this
        parameter, maxsize == minsize.
    :param tokenizer: The tokenizer to use to split the text into words.
        Defaults to RegexTokenizer().
    :param at: Passed through to NgramFilter; see that filter for the
        meaning of 'start' and 'end'.
    """

    if not tokenizer:
        tokenizer = RegexTokenizer()
    return tokenizer | LowercaseFilter() | NgramFilter(minsize, maxsize, at=at)
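
# Illustrative sketch (added for exposition; not part of the original
# module): each word is lowercased before grams are taken, and words
# shorter than minsize are dropped by NgramFilter:
#
#   >>> ana = NgramWordAnalyzer(2, 3, at="end")
#   >>> [t.text for t in ana(u"Hi there")]
#   ["hi", "ere", "re"]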