# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from whoosh.analysis.filters import Filter
from whoosh.compat import integer_types
from whoosh.lang.dmetaphone import double_metaphone
from whoosh.lang.porter import stem
from whoosh.util.cache import lfu_cache, unbound_cache


class StemFilter(Filter):
    """Stems (removes suffixes from) the text of tokens using the Porter
    stemming algorithm. Stemming attempts to reduce multiple forms of the same
    root word (for example, "rendering", "renders", "rendered", etc.) to a
    single word in the index.

    >>> stemmer = RegexTokenizer() | StemFilter()
    >>> [token.text for token in stemmer("fundamentally willows")]
    ["fundament", "willow"]

    You can pass your own stemming function to the StemFilter. The default
    is the Porter stemming algorithm for English.

    >>> stemfilter = StemFilter(stem_function)

    You can also use one of the Snowball stemming functions by passing the
    ``lang`` keyword argument.

    >>> stemfilter = StemFilter(lang="ru")

    The list of available languages is in ``whoosh.lang.languages``.
    You can use :func:`whoosh.lang.has_stemmer` to check if a given language
    has a stemming function available.

    By default, this class wraps an LFU (least frequently used) cache around
    the stemming function. The ``cachesize`` keyword argument sets the size of
    the cache. To make the cache unbounded (the class caches every input), use
    ``cachesize=-1``. To disable caching, use ``cachesize=None``.

    If you compile and install the py-stemmer library, the
    :class:`PyStemmerFilter` provides slightly easier access to the language
    stemmers in that library.
    """

    __inittypes__ = dict(stemfn=object, ignore=list)

    is_morph = True

    def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000):
        """
        :param stemfn: the function to use for stemming.
        :param lang: if not None, overrides the stemfn with a language stemmer
            from the ``whoosh.lang.snowball`` package.
        :param ignore: a set/list of words that should not be stemmed. This is
            converted into a frozenset. If you omit this argument, all tokens
            are stemmed.
        :param cachesize: the maximum number of words to cache. Use ``-1`` for
            an unbounded cache, or ``None`` for no caching.
        """

        self.stemfn = stemfn
        self.lang = lang
        self.ignore = frozenset() if ignore is None else frozenset(ignore)
        self.cachesize = cachesize
        # clear() sets the _stem attr to a cached wrapper around self.stemfn
        self.clear()

    def __getstate__(self):
        # Can't pickle a dynamic function, so we have to remove the _stem
        # attribute from the state
        return dict([(k, self.__dict__[k]) for k in self.__dict__
                     if k != "_stem"])

    def __setstate__(self, state):
        # Check for old instances of StemFilter class, which didn't have a
        # cachesize attribute and pickled the cache attribute
        if "cachesize" not in state:
            self.cachesize = 50000
        if "ignores" in state:
            self.ignore = state["ignores"]
        elif "ignore" not in state:
            self.ignore = frozenset()
        if "lang" not in state:
            self.lang = None
        if "cache" in state:
            del state["cache"]

        self.__dict__.update(state)
        # Set the _stem attribute
        self.clear()

    def clear(self):
        if self.lang:
            from whoosh.lang import stemmer_for_language

            stemfn = stemmer_for_language(self.lang)
        else:
            stemfn = self.stemfn

        if isinstance(self.cachesize, integer_types) and self.cachesize != 0:
            if self.cachesize < 0:
                self._stem = unbound_cache(stemfn)
            elif self.cachesize > 1:
                self._stem = lfu_cache(self.cachesize)(stemfn)
            else:
                # A cache of size 1 isn't worth the wrapper overhead; use the
                # bare stemming function so _stem is always assigned
                self._stem = stemfn
        else:
            self._stem = stemfn

    def cache_info(self):
        # cachesize may be None (caching disabled), so guard the comparison
        if self.cachesize is None or self.cachesize <= 1:
            return None
        return self._stem.cache_info()

    def __eq__(self, other):
        return (other and self.__class__ is other.__class__
                and self.stemfn == other.stemfn)

    def __call__(self, tokens):
        stemfn = self._stem
        ignore = self.ignore

        for t in tokens:
            if not t.stopped:
                text = t.text
                if text not in ignore:
                    t.text = stemfn(text)
            yield t
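

# A minimal usage sketch for StemFilter (not part of the original module; the
# sample text and cache settings below are illustrative assumptions):
#
#   from whoosh.analysis import RegexTokenizer
#
#   analyzer = RegexTokenizer() | StemFilter()
#   print([t.text for t in analyzer(u"rendering renders rendered")])
#   # Porter stemming should reduce all three forms to "render"
#
#   # cachesize=-1 gives an unbounded cache (faster for large vocabularies,
#   # more memory); cachesize=None disables caching entirely
#   analyzer = RegexTokenizer() | StemFilter(cachesize=-1)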


class PyStemmerFilter(StemFilter):
    """This is a simple subclass of StemFilter that works with the py-stemmer
    third-party library. You must have the py-stemmer library installed to use
    this filter.

    >>> PyStemmerFilter("spanish")
    """

    def __init__(self, lang="english", ignore=None, cachesize=10000):
        """
        :param lang: a string identifying the stemming algorithm to use. You
            can get a list of available algorithms with the
            :meth:`PyStemmerFilter.algorithms` method. The identification
            strings are directly from the py-stemmer library.
        :param ignore: a set/list of words that should not be stemmed. This is
            converted into a frozenset. If you omit this argument, all tokens
            are stemmed.
        :param cachesize: the maximum number of words to cache.
        """

        self.lang = lang
        self.ignore = frozenset() if ignore is None else frozenset(ignore)
        self.cachesize = cachesize
        self._stem = self._get_stemmer_fn()

    def algorithms(self):
        """Returns a list of stemming algorithms provided by the py-stemmer
        library.
        """

        import Stemmer  # @UnresolvedImport

        return Stemmer.algorithms()

    def cache_info(self):
        return None

    def _get_stemmer_fn(self):
        import Stemmer  # @UnresolvedImport

        stemmer = Stemmer.Stemmer(self.lang)
        stemmer.maxCacheSize = self.cachesize
        return stemmer.stemWord

    def __getstate__(self):
        # Can't pickle a dynamic function, so we have to remove the _stem
        # attribute from the state
        return dict([(k, self.__dict__[k]) for k in self.__dict__
                     if k != "_stem"])

    def __setstate__(self, state):
        # Check for old instances of StemFilter class, which didn't have a
        # cachesize attribute and pickled the cache attribute
        if "cachesize" not in state:
            self.cachesize = 10000
        if "ignores" in state:
            self.ignore = state["ignores"]
        elif "ignore" not in state:
            self.ignore = frozenset()
        if "cache" in state:
            del state["cache"]

        self.__dict__.update(state)
        # Set the _stem attribute
        self._stem = self._get_stemmer_fn()
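

# A minimal usage sketch for PyStemmerFilter (illustrative only; requires the
# third-party PyStemmer package, and "german" is just an example algorithm
# name):
#
#   from whoosh.analysis import RegexTokenizer
#
#   analyzer = RegexTokenizer() | PyStemmerFilter("german")
#   print(PyStemmerFilter().algorithms())  # names accepted for ``lang``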


class DoubleMetaphoneFilter(Filter):
    """Transforms the text of the tokens using Lawrence Philips's Double
    Metaphone algorithm. This algorithm attempts to encode words in such a way
    that similar-sounding words reduce to the same code. This may be useful for
    fields containing the names of people and places, and other uses where
    tolerance of spelling differences is desirable.
    """

    is_morph = True

    def __init__(self, primary_boost=1.0, secondary_boost=0.5, combine=False):
        """
        :param primary_boost: the boost to apply to the token containing the
            primary code.
        :param secondary_boost: the boost to apply to the token containing the
            secondary code, if any.
        :param combine: if True, the original unencoded tokens are kept in the
            stream, preceding the encoded tokens.
        """

        self.primary_boost = primary_boost
        self.secondary_boost = secondary_boost
        self.combine = combine

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.primary_boost == other.primary_boost)

    def __call__(self, tokens):
        primary_boost = self.primary_boost
        secondary_boost = self.secondary_boost
        combine = self.combine

        for t in tokens:
            if combine:
                yield t

            primary, secondary = double_metaphone(t.text)
            b = t.boost
            # Overwrite the token's text and boost and yield it
            if primary:
                t.text = primary
                t.boost = b * primary_boost
                yield t
            if secondary:
                t.text = secondary
                t.boost = b * secondary_boost
                yield t
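

# A minimal usage sketch for DoubleMetaphoneFilter (illustrative only; the
# "smith"/"smythe" pair is an assumed example of similar-sounding words, not
# part of the original module):
#
#   from whoosh.analysis import RegexTokenizer, LowercaseFilter
#
#   analyzer = RegexTokenizer() | LowercaseFilter() | DoubleMetaphoneFilter()
#   print([(t.text, t.boost) for t in analyzer(u"smith smythe")])
#   # Similar-sounding names should share a primary code, with any secondary
#   # code emitted as an extra token at the lower secondary_boost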