tokenizers.py

# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.compat import u, text_type
from whoosh.analysis.acore import Composable, Token
from whoosh.util.text import rcompile


default_pattern = rcompile(r"\w+(\.?\w+)*")


# Tokenizers
class Tokenizer(Composable):
    """Base class for Tokenizers.
    """

    def __eq__(self, other):
        return other and self.__class__ is other.__class__


class IDTokenizer(Tokenizer):
    """Yields the entire input string as a single token. For use in indexed
    but untokenized fields, such as a document's path.

    >>> idt = IDTokenizer()
    >>> [token.text for token in idt("/a/b 123 alpha")]
    ["/a/b 123 alpha"]
    """

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        t.text = value
        t.boost = 1.0
        if keeporiginal:
            t.original = value
        if positions:
            t.pos = start_pos + 1
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
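

# Illustrative sketch, not part of the original module: when positions and
# chars are requested, the single token produced by IDTokenizer also carries
# a position and the character span of the whole input. The _demo_* name is
# hypothetical.
def _demo_id_tokenizer():
    idt = IDTokenizer()
    t = next(idt(u("/a/b 123 alpha"), positions=True, chars=True))
    # Expected: ("/a/b 123 alpha", 1, 0, 14)
    return (t.text, t.pos, t.startchar, t.endchar)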


class RegexTokenizer(Tokenizer):
    """
    Uses a regular expression to extract tokens from text.

    >>> rex = RegexTokenizer()
    >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))]
    ["hi", "there", "3.141", "big", "time", "under_score"]
    """

    def __init__(self, expression=default_pattern, gaps=False):
        """
        :param expression: A regular expression object or string. Each match
            of the expression equals a token. Group 0 (the entire matched
            text) is used as the text of the token. If you require more
            complicated handling of the expression match, simply write your
            own tokenizer.
        :param gaps: If True, the tokenizer *splits* on the expression, rather
            than matching on the expression.
        """

        self.expression = rcompile(expression)
        self.gaps = gaps

    def __eq__(self, other):
        if self.__class__ is other.__class__:
            if self.expression.pattern == other.expression.pattern:
                return True
        return False

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        elif not self.gaps:
            # The default: expression matches are used as tokens
            for pos, match in enumerate(self.expression.finditer(value)):
                t.text = match.group(0)
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char + match.start()
                    t.endchar = start_char + match.end()
                yield t
        else:
            # When gaps=True, iterate through the matches and
            # yield the text between them.
            prevend = 0
            pos = start_pos
            for match in self.expression.finditer(value):
                start = prevend
                end = match.start()
                text = value[start:end]
                if text:
                    t.text = text
                    t.boost = 1.0
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                        pos += 1
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end
                    yield t
                prevend = match.end()

            # If the last "gap" was before the end of the text,
            # yield the last bit of text as a final token.
            if prevend < len(value):
                t.text = value[prevend:]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = prevend
                    t.endchar = len(value)
                yield t
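

# Illustrative sketch, not part of the original module: with gaps=True the
# expression describes the separators instead of the tokens, so the text
# between matches is yielded. The _demo_* name is hypothetical.
def _demo_regex_gaps():
    rex = RegexTokenizer(expression=r",\s*", gaps=True)
    # Expected: ["alfa", "bravo", "charlie"]
    return [t.text for t in rex(u("alfa, bravo,charlie"))]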


class CharsetTokenizer(Tokenizer):
    """Tokenizes and translates text according to a character mapping object.
    Characters that map to None are considered token break characters. For
    all other characters the map is used to translate the character. This is
    useful for case and accent folding.

    This tokenizer loops character-by-character and so will likely be much
    slower than :class:`RegexTokenizer`.

    One way to get a character mapping object is to convert a Sphinx charset
    table file using :func:`whoosh.support.charset.charset_table_to_dict`.

    >>> from whoosh.support.charset import charset_table_to_dict
    >>> from whoosh.support.charset import default_charset
    >>> charmap = charset_table_to_dict(default_charset)
    >>> chtokenizer = CharsetTokenizer(charmap)
    >>> [t.text for t in chtokenizer(u'Stra\\xdfe ABC')]
    [u'strase', u'abc']

    The Sphinx charset table format is described at
    http://www.sphinxsearch.com/docs/current.html#conf-charset-table.
    """

    __inittype__ = dict(charmap=str)

    def __init__(self, charmap):
        """
        :param charmap: a mapping from integer character numbers to unicode
            characters, as used by the unicode.translate() method.
        """

        self.charmap = charmap

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.charmap == other.charmap)

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            text = u("")
            charmap = self.charmap
            pos = start_pos
            startchar = currentchar = start_char
            for char in value:
                tchar = charmap[ord(char)]
                if tchar:
                    text += tchar
                else:
                    if currentchar > startchar:
                        t.text = text
                        t.boost = 1.0
                        if keeporiginal:
                            t.original = t.text
                        if positions:
                            t.pos = pos
                            pos += 1
                        if chars:
                            t.startchar = startchar
                            t.endchar = currentchar
                        yield t
                    startchar = currentchar + 1
                    text = u("")

                currentchar += 1

            if currentchar > startchar:
                # Yield the trailing token using the translated text, matching
                # the tokens emitted inside the loop and the docstring example.
                t.text = text
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = startchar
                    t.endchar = currentchar
                yield t
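

# Illustrative sketch, not part of the original module: a tiny hand-built
# charmap that folds ASCII letters to lowercase and maps everything else to
# None so it acts as a token break. Real Sphinx-style tables cover far more
# characters. The _demo_* name is hypothetical.
def _demo_charset_tokenizer():
    from collections import defaultdict

    charmap = defaultdict(lambda: None)  # unmapped ordinals break tokens
    for code in range(ord("a"), ord("z") + 1):
        charmap[code] = chr(code)       # lowercase letters map to themselves
        charmap[code - 32] = chr(code)  # uppercase letters fold to lowercase
    chtok = CharsetTokenizer(charmap)
    # Expected: ["hello", "wide", "world"]
    return [t.text for t in chtok(u("Hello, Wide World!"))]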


def SpaceSeparatedTokenizer():
    """Returns a RegexTokenizer that splits tokens by whitespace.

    >>> sst = SpaceSeparatedTokenizer()
    >>> [token.text for token in sst("hi there big-time, what's up")]
    ["hi", "there", "big-time,", "what's", "up"]
    """

    return RegexTokenizer(r"[^ \t\r\n]+")


def CommaSeparatedTokenizer():
    """Splits tokens by commas.

    Note that the tokenizer calls unicode.strip() on each match of the regular
    expression.

    >>> cst = CommaSeparatedTokenizer()
    >>> [token.text for token in cst("hi there, what's , up")]
    ["hi there", "what's", "up"]
    """

    from whoosh.analysis.filters import StripFilter

    return RegexTokenizer(r"[^,]+") | StripFilter()
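

# Illustrative sketch, not part of the original module: tokenizers are
# Composable, so they can be chained with filters using "|", just as
# CommaSeparatedTokenizer combines a RegexTokenizer with StripFilter above.
# LowercaseFilter is assumed to come from whoosh.analysis.filters; the
# _demo_* name is hypothetical.
def _demo_tokenizer_pipeline():
    from whoosh.analysis.filters import LowercaseFilter

    analyzer = SpaceSeparatedTokenizer() | LowercaseFilter()
    # Expected: ["new", "york"]
    return [t.text for t in analyzer(u("New York"))]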


class PathTokenizer(Tokenizer):
    """A simple tokenizer that given a string ``"/a/b/c"`` yields tokens
    ``["/a", "/a/b", "/a/b/c"]``.
    """

    def __init__(self, expression="[^/]+"):
        self.expr = rcompile(expression)

    def __call__(self, value, positions=False, start_pos=0, **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        token = Token(positions, **kwargs)
        pos = start_pos
        for match in self.expr.finditer(value):
            token.text = value[:match.end()]
            if positions:
                token.pos = pos
                pos += 1
            yield token
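

# Illustrative sketch, not part of the original module: PathTokenizer yields
# one token per path prefix, so a query can match a document by any of its
# ancestor paths. The _demo_* name is hypothetical.
def _demo_path_tokenizer():
    pt = PathTokenizer()
    # Expected: ["/a", "/a/b", "/a/b/c"]
    return [t.text for t in pt(u("/a/b/c"))]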