# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.analysis.acore import Composable, CompositionError
from whoosh.analysis.tokenizers import Tokenizer
from whoosh.analysis.filters import LowercaseFilter
from whoosh.analysis.filters import StopFilter, STOP_WORDS
from whoosh.analysis.morph import StemFilter
from whoosh.analysis.intraword import IntraWordFilter
from whoosh.analysis.tokenizers import default_pattern
from whoosh.analysis.tokenizers import CommaSeparatedTokenizer
from whoosh.analysis.tokenizers import IDTokenizer
from whoosh.analysis.tokenizers import RegexTokenizer
from whoosh.analysis.tokenizers import SpaceSeparatedTokenizer
from whoosh.lang.porter import stem


# Analyzers

class Analyzer(Composable):
    """Abstract base class for analyzers.
    """

    def __repr__(self):
        return "%s()" % self.__class__.__name__

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.__dict__ == other.__dict__)

    def __call__(self, value, **kwargs):
        raise NotImplementedError

    def clean(self):
        pass


class CompositeAnalyzer(Analyzer):
    def __init__(self, *composables):
        self.items = []
        for comp in composables:
            if isinstance(comp, CompositeAnalyzer):
                self.items.extend(comp.items)
            else:
                self.items.append(comp)

        # A tokenizer must start the chain, and only filters may follow it
        # (because tokenizers take a string and return a generator of tokens,
        # and filters take and return generators of tokens)
        for item in self.items[1:]:
            if isinstance(item, Tokenizer):
                raise CompositionError("Only one tokenizer allowed at the start"
                                       " of the analyzer: %r" % self.items)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__,
                           ", ".join(repr(item) for item in self.items))

    def __call__(self, value, no_morph=False, **kwargs):
        items = self.items
        # Start with the tokenizer
        gen = items[0](value, **kwargs)
        # Run the filters
        for item in items[1:]:
            if not (no_morph and hasattr(item, "is_morph") and item.is_morph):
                gen = item(gen)
        return gen

    def __getitem__(self, item):
        return self.items.__getitem__(item)

    def __len__(self):
        return len(self.items)

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.items == other.items)

    def clean(self):
        for item in self.items:
            if hasattr(item, "clean"):
                item.clean()

    def has_morph(self):
        return any(item.is_morph for item in self.items)
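

# Usage sketch (added for illustration; not part of the original module).
# Composing a tokenizer with filters via the ``|`` operator builds a
# CompositeAnalyzer, whose __call__ runs the tokenizer first and then each
# filter in order. Assumes the components imported above behave as described
# in their own docstrings.
#
#     ana = RegexTokenizer() | LowercaseFilter() | StopFilter()
#     assert isinstance(ana, CompositeAnalyzer)
#     print([t.text for t in ana("The QUICK brown fox")])
#     # -> ['quick', 'brown', 'fox']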


# Functions that return composed analyzers

def IDAnalyzer(lowercase=False):
    """Deprecated, just use an IDTokenizer directly, with a LowercaseFilter if
    desired.
    """

    tokenizer = IDTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer


def KeywordAnalyzer(lowercase=False, commas=False):
    """Parses whitespace- or comma-separated tokens.

    >>> ana = KeywordAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["Hello", "there,", "this", "is", "a", "TEST"]

    :param lowercase: whether to lowercase the tokens.
    :param commas: if True, items are separated by commas rather than
        whitespace.
    """

    if commas:
        tokenizer = CommaSeparatedTokenizer()
    else:
        tokenizer = SpaceSeparatedTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer
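

# Usage sketch (added for illustration; not part of the original module).
# With commas=True the analyzer splits on commas instead of whitespace, which
# suits tag-style fields. The expected output assumes the stock
# CommaSeparatedTokenizer, which strips surrounding whitespace from each item.
#
#     ana = KeywordAnalyzer(lowercase=True, commas=True)
#     print([t.text for t in ana("Python, Search Engines, Whoosh")])
#     # -> ['python', 'search engines', 'whoosh']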


def RegexAnalyzer(expression=r"\w+(\.?\w+)*", gaps=False):
    """Deprecated, just use a RegexTokenizer directly.
    """

    return RegexTokenizer(expression=expression, gaps=gaps)


def SimpleAnalyzer(expression=default_pattern, gaps=False):
    """Composes a RegexTokenizer with a LowercaseFilter.

    >>> ana = SimpleAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["hello", "there", "this", "is", "a", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()


def StandardAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                     minsize=2, maxsize=None, gaps=False):
    """Composes a RegexTokenizer with a LowercaseFilter and optional
    StopFilter.

    >>> ana = StandardAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["testing", "testing", "testing"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain
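

# Usage sketch (added for illustration; not part of the original module).
# Passing stoplist=None skips the StopFilter stage entirely, leaving only
# tokenizing and lowercasing.
#
#     ana = StandardAnalyzer(stoplist=None)
#     print([t.text for t in ana("Testing is testing and testing")])
#     # -> ['testing', 'is', 'testing', 'and', 'testing']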


def StemmingAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                     minsize=2, maxsize=None, gaps=False, stemfn=stem,
                     ignore=None, cachesize=50000):
    """Composes a RegexTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = StemmingAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param ignore: a set of words to not stem.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use. Use None for no cache, or -1 for an unbounded cache.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore,
                              cachesize=cachesize)
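

# Usage sketch (added for illustration; not part of the original module).
# Because StemFilter is a morphological filter, CompositeAnalyzer's no_morph
# flag (see __call__ above) lets a caller skip the stemming step for a single
# call. The expected outputs assume the default Porter stemmer.
#
#     ana = StemmingAnalyzer()
#     print([t.text for t in ana("rendering shaded surfaces")])
#     # -> ['render', 'shade', 'surfac']
#     print([t.text for t in ana("rendering shaded surfaces", no_morph=True)])
#     # -> ['rendering', 'shaded', 'surfaces']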


def FancyAnalyzer(expression=r"\s+", stoplist=STOP_WORDS, minsize=2,
                  maxsize=None, gaps=True, splitwords=True, splitnums=True,
                  mergewords=False, mergenums=False):
    """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
    StopFilter.

    >>> ana = FancyAnalyzer()
    >>> [token.text for token in ana("Should I call getInt or get_real?")]
    ["should", "call", "getInt", "get", "int", "get_real", "get", "real"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return (RegexTokenizer(expression=expression, gaps=gaps)
            | IntraWordFilter(splitwords=splitwords, splitnums=splitnums,
                              mergewords=mergewords, mergenums=mergenums)
            | LowercaseFilter()
            | StopFilter(stoplist=stoplist, minsize=minsize)
            )


def LanguageAnalyzer(lang, expression=default_pattern, gaps=False,
                     cachesize=50000):
    """Configures a simple analyzer for the given language, with a
    LowercaseFilter, StopFilter, and StemFilter.

    >>> ana = LanguageAnalyzer("es")
    >>> [token.text for token in ana("Por el mar corren las liebres")]
    ['mar', 'corr', 'liebr']

    The list of available languages is in `whoosh.lang.languages`.
    You can use :func:`whoosh.lang.has_stemmer` and
    :func:`whoosh.lang.has_stopwords` to check if a given language has a
    stemming function and/or stop word list available.

    :param lang: the language code, as listed in `whoosh.lang.languages`.
    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use.
    """

    from whoosh.lang import NoStemmer, NoStopWords

    # Make the start of the chain
    chain = (RegexTokenizer(expression=expression, gaps=gaps)
             | LowercaseFilter())

    # Add a stop word filter, if stop words are available for the language
    try:
        chain = chain | StopFilter(lang=lang)
    except NoStopWords:
        pass

    # Add a stemming filter, if a stemmer is available for the language
    try:
        chain = chain | StemFilter(lang=lang, cachesize=cachesize)
    except NoStemmer:
        pass

    return chain
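

# Usage sketch (added for illustration; not part of the original module).
# Analyzers built by these factory functions are typically attached to schema
# fields; this assumes the whoosh.fields API (Schema, TEXT) from the same
# distribution.
#
#     from whoosh.fields import Schema, TEXT
#     schema = Schema(title=TEXT(analyzer=StemmingAnalyzer()),
#                     body=TEXT(analyzer=LanguageAnalyzer("en")))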