# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.analysis.filters import Filter
from whoosh.compat import integer_types
from whoosh.lang.dmetaphone import double_metaphone
from whoosh.lang.porter import stem
from whoosh.util.cache import lfu_cache, unbound_cache


class StemFilter(Filter):
    """Stems (removes suffixes from) the text of tokens using the Porter
    stemming algorithm. Stemming attempts to reduce multiple forms of the same
    root word (for example, "rendering", "renders", "rendered", etc.) to a
    single word in the index.

    >>> stemmer = RegexTokenizer() | StemFilter()
    >>> [token.text for token in stemmer("fundamentally willows")]
    ["fundament", "willow"]

    You can pass your own stemming function to the StemFilter. The default
    is the Porter stemming algorithm for English.

    >>> stemfilter = StemFilter(stem_function)

    You can also use one of the Snowball stemming functions by passing the
    ``lang`` keyword argument.

    >>> stemfilter = StemFilter(lang="ru")

    The list of available languages is in ``whoosh.lang.languages``. You can
    use :func:`whoosh.lang.has_stemmer` to check whether a given language has
    a stemming function available.

    By default, this class wraps an LFU (least frequently used) cache around
    the stemming function. The ``cachesize`` keyword argument sets the size of
    the cache. To make the cache unbounded (the class caches every input), use
    ``cachesize=-1``. To disable caching, use ``cachesize=None``.

    If you compile and install the py-stemmer library, the
    :class:`PyStemmerFilter` provides slightly easier access to the language
    stemmers in that library.
    """

    __inittypes__ = dict(stemfn=object, ignore=list)

    is_morph = True

    def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000):
        """
        :param stemfn: the function to use for stemming.
        :param lang: if not None, overrides the stemfn with a language stemmer
            from the ``whoosh.lang.snowball`` package.
        :param ignore: a set/list of words that should not be stemmed. This is
            converted into a frozenset. If you omit this argument, all tokens
            are stemmed.
        :param cachesize: the maximum number of words to cache. Use ``-1`` for
            an unbounded cache, or ``None`` for no caching.
        """

        self.stemfn = stemfn
        self.lang = lang
        self.ignore = frozenset() if ignore is None else frozenset(ignore)
        self.cachesize = cachesize
        # clear() sets the _stem attr to a cached wrapper around self.stemfn
        self.clear()

    def __getstate__(self):
        # Can't pickle a dynamic function, so we have to remove the _stem
        # attribute from the state; __setstate__() rebuilds it
        return dict([(k, self.__dict__[k]) for k in self.__dict__
                     if k != "_stem"])

    def __setstate__(self, state):
        # Check for old instances of the StemFilter class, which didn't have
        # a cachesize attribute and pickled the cache attribute
        if "cachesize" not in state:
            self.cachesize = 50000
        if "ignores" in state:
            self.ignore = state["ignores"]
        elif "ignore" not in state:
            self.ignore = frozenset()
        if "lang" not in state:
            self.lang = None
        if "cache" in state:
            del state["cache"]

        self.__dict__.update(state)
        # Set the _stem attribute
        self.clear()

    def clear(self):
        if self.lang:
            from whoosh.lang import stemmer_for_language
            stemfn = stemmer_for_language(self.lang)
        else:
            stemfn = self.stemfn

        if isinstance(self.cachesize, integer_types) and self.cachesize < 0:
            # A negative cache size means cache every input without bound
            self._stem = unbound_cache(stemfn)
        elif isinstance(self.cachesize, integer_types) and self.cachesize > 1:
            self._stem = lfu_cache(self.cachesize)(stemfn)
        else:
            # cachesize is None, 0, or 1, so caching would be pointless; call
            # the stemming function directly
            self._stem = stemfn

    def cache_info(self):
        # Guard against cachesize=None, which cannot be compared to an int;
        # in the uncached configurations there are no statistics to report
        if self.cachesize is None or self.cachesize <= 1:
            return None
        return self._stem.cache_info()

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.stemfn == other.stemfn)

    def __call__(self, tokens):
        stemfn = self._stem
        ignore = self.ignore

        for t in tokens:
            if not t.stopped:
                text = t.text
                if text not in ignore:
                    t.text = stemfn(text)
            yield t
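

# An illustrative usage sketch, not part of the original module: composing
# StemFilter with a tokenizer, as in the class docstring. The tokenizer is
# imported lazily because the whoosh.analysis package imports this module
# at load time.
def _stemfilter_example():
    from whoosh.analysis import RegexTokenizer

    # The default configuration is the Porter stemmer wrapped in a
    # 50000-entry LFU cache
    analyzer = RegexTokenizer() | StemFilter()
    # Produces ["fundament", "willow"]
    return [t.text for t in analyzer("fundamentally willows")]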


class PyStemmerFilter(StemFilter):
    """This is a simple subclass of StemFilter that works with the py-stemmer
    third-party library. You must have the py-stemmer library installed to use
    this filter.

    >>> PyStemmerFilter("spanish")
    """

    def __init__(self, lang="english", ignore=None, cachesize=10000):
        """
        :param lang: a string identifying the stemming algorithm to use. You
            can get a list of available algorithms with the
            :meth:`PyStemmerFilter.algorithms` method. The identification
            strings are directly from the py-stemmer library.
        :param ignore: a set/list of words that should not be stemmed. This is
            converted into a frozenset. If you omit this argument, all tokens
            are stemmed.
        :param cachesize: the maximum number of words to cache.
        """

        self.lang = lang
        self.ignore = frozenset() if ignore is None else frozenset(ignore)
        self.cachesize = cachesize
        self._stem = self._get_stemmer_fn()

    def algorithms(self):
        """Returns a list of stemming algorithms provided by the py-stemmer
        library.
        """

        import Stemmer  # @UnresolvedImport

        return Stemmer.algorithms()

    def cache_info(self):
        # The cache lives inside the py-stemmer object (see maxCacheSize
        # below), which does not expose hit/miss statistics
        return None

    def _get_stemmer_fn(self):
        import Stemmer  # @UnresolvedImport

        stemmer = Stemmer.Stemmer(self.lang)
        stemmer.maxCacheSize = self.cachesize
        return stemmer.stemWord

    def __getstate__(self):
        # Can't pickle a dynamic function, so we have to remove the _stem
        # attribute from the state; __setstate__() rebuilds it
        return dict([(k, self.__dict__[k]) for k in self.__dict__
                     if k != "_stem"])

    def __setstate__(self, state):
        # Check for old instances of the StemFilter class, which didn't have
        # a cachesize attribute and pickled the cache attribute
        if "cachesize" not in state:
            self.cachesize = 10000
        if "ignores" in state:
            self.ignore = state["ignores"]
        elif "ignore" not in state:
            self.ignore = frozenset()
        if "cache" in state:
            del state["cache"]

        self.__dict__.update(state)
        # Set the _stem attribute
        self._stem = self._get_stemmer_fn()
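

# An illustrative usage sketch, not part of the original module. Because
# PyStemmerFilter is just a StemFilter whose stemming function comes from
# the third-party py-stemmer library (imported as ``Stemmer``), it composes
# with tokenizers in exactly the same way; constructing it raises
# ImportError if the library is not installed.
def _pystemmer_example():
    from whoosh.analysis import RegexTokenizer

    # "spanish" is one of the identifiers reported by
    # PyStemmerFilter().algorithms()
    analyzer = RegexTokenizer() | PyStemmerFilter("spanish")
    return [t.text for t in analyzer("gatos saltando")]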


class DoubleMetaphoneFilter(Filter):
    """Transforms the text of the tokens using Lawrence Philips's Double
    Metaphone algorithm. This algorithm attempts to encode words in such a way
    that similar-sounding words reduce to the same code. This may be useful
    for fields containing the names of people and places, and other uses where
    tolerance of spelling differences is desirable.
    """

    is_morph = True

    def __init__(self, primary_boost=1.0, secondary_boost=0.5, combine=False):
        """
        :param primary_boost: the boost to apply to the token containing the
            primary code.
        :param secondary_boost: the boost to apply to the token containing the
            secondary code, if any.
        :param combine: if True, the original unencoded tokens are kept in the
            stream, preceding the encoded tokens.
        """

        self.primary_boost = primary_boost
        self.secondary_boost = secondary_boost
        self.combine = combine

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.primary_boost == other.primary_boost)

    def __call__(self, tokens):
        primary_boost = self.primary_boost
        secondary_boost = self.secondary_boost
        combine = self.combine

        for t in tokens:
            if combine:
                yield t

            primary, secondary = double_metaphone(t.text)
            b = t.boost
            # Overwrite the token's text and boost and yield it
            if primary:
                t.text = primary
                t.boost = b * primary_boost
                yield t
            if secondary:
                t.text = secondary
                t.boost = b * secondary_boost
                yield t
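

# An illustrative usage sketch, not part of the original module: phonetic
# encoding for a name field. With combine=True the original spelling is
# yielded ahead of its encoded forms, so exact matches still work. The
# lowercase sample input assumes the kind of text a typical analyzer chain
# (one that includes a LowercaseFilter) would feed this filter.
def _metaphone_example():
    from whoosh.analysis import RegexTokenizer

    analyzer = RegexTokenizer() | DoubleMetaphoneFilter(combine=True)
    # "smith" and "smyth" should reduce to the same primary code, which is
    # what makes similar-sounding spellings match each other; the encoded
    # tokens carry primary_boost and secondary_boost respectively
    return [(t.text, t.boost) for t in analyzer("smith smyth")]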