- # Copyright 2007 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- from whoosh.compat import text_type
- from whoosh.compat import xrange
- from whoosh.analysis.acore import Token
- from whoosh.analysis.filters import Filter, LowercaseFilter
- from whoosh.analysis.tokenizers import Tokenizer, RegexTokenizer
- # Tokenizer
- class NgramTokenizer(Tokenizer):
- """Splits input text into N-grams instead of words.
- >>> ngt = NgramTokenizer(4)
- >>> [token.text for token in ngt("hi there")]
- ["hi t", "i th", " the", "ther", "here"]
- Note that this tokenizer does NOT use a regular expression to extract
- words, so the grams emitted by it will contain whitespace, punctuation,
- etc. You may want to massage the input or add a custom filter to this
- tokenizer's output.
- Alternatively, if you only want sub-word grams without whitespace, you
- could combine a RegexTokenizer with NgramFilter instead.
- """
- __inittypes__ = dict(minsize=int, maxsize=int)
- def __init__(self, minsize, maxsize=None):
- """
- :param minsize: The minimum size of the N-grams.
- :param maxsize: The maximum size of the N-grams. If you omit
- this parameter, maxsize == minsize.
- """
- self.min = minsize
- self.max = maxsize or minsize
- def __eq__(self, other):
- if self.__class__ is other.__class__:
- if self.min == other.min and self.max == other.max:
- return True
- return False
- def __call__(self, value, positions=False, chars=False, keeporiginal=False,
- removestops=True, start_pos=0, start_char=0, mode='',
- **kwargs):
- assert isinstance(value, text_type), "%r is not unicode" % value
- inlen = len(value)
- t = Token(positions, chars, removestops=removestops, mode=mode)
- pos = start_pos
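- # In query mode, only grams of a single size (the largest size that fits
- # in the value) are emitted, so a query term produces far fewer grams
- # than the same text does at index time.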
- if mode == "query":
- size = min(self.max, inlen)
- for start in xrange(0, inlen - size + 1):
- end = start + size
- if end > inlen:
- continue
- t.text = value[start:end]
- if keeporiginal:
- t.original = t.text
- t.stopped = False
- if positions:
- t.pos = pos
- if chars:
- t.startchar = start_char + start
- t.endchar = start_char + end
- yield t
- pos += 1
- else:
- for start in xrange(0, inlen - self.min + 1):
- for size in xrange(self.min, self.max + 1):
- end = start + size
- if end > inlen:
- continue
- t.text = value[start:end]
- if keeporiginal:
- t.original = t.text
- t.stopped = False
- if positions:
- t.pos = pos
- if chars:
- t.startchar = start_char + start
- t.endchar = start_char + end
- yield t
- pos += 1
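- # Illustrative sketch (not part of the original module): with minsize=2 and
- # maxsize=4, index-time tokenization emits every 2-, 3-, and 4-gram of the
- # text, while mode="query" emits only grams of the maximum size:
- #
- #   >>> ngt = NgramTokenizer(2, 4)
- #   >>> [t.text for t in ngt("hello")]
- #   ["he", "hel", "hell", "el", "ell", "ello", "ll", "llo", "lo"]
- #   >>> [t.text for t in ngt("hello", mode="query")]
- #   ["hell", "ello"]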
- # Filter
- class NgramFilter(Filter):
- """Splits token text into N-grams.
- >>> rext = RegexTokenizer()
- >>> stream = rext("hello there")
- >>> ngf = NgramFilter(4)
- >>> [token.text for token in ngf(stream)]
- ["hell", "ello", "ther", "here"]
- """
- __inittypes__ = dict(minsize=int, maxsize=int)
- def __init__(self, minsize, maxsize=None, at=None):
- """
- :param minsize: The minimum size of the N-grams.
- :param maxsize: The maximum size of the N-grams. If you omit this
- parameter, maxsize == minsize.
- :param at: If 'start', only take N-grams from the start of each word.
- If 'end', only take N-grams from the end of each word. Otherwise,
- take all N-grams from the word (the default).
- """
- self.min = minsize
- self.max = maxsize or minsize
- self.at = 0
- if at == "start":
- self.at = -1
- elif at == "end":
- self.at = 1
- def __eq__(self, other):
- return other and self.__class__ is other.__class__\
- and self.min == other.min and self.max == other.max
- def __call__(self, tokens):
- assert hasattr(tokens, "__iter__")
- at = self.at
- for t in tokens:
- text = t.text
- if len(text) < self.min:
- continue
- chars = t.chars
- if chars:
- startchar = t.startchar
- # Token positions don't mean much for N-grams,
- # so we'll leave the token's original position
- # untouched.
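- # As in the tokenizer, query mode emits only grams of the largest
- # size that fits in the token text, rather than every size from
- # min to max.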
- if t.mode == "query":
- size = min(self.max, len(t.text))
- if at == -1:
- t.text = text[:size]
- if chars:
- t.endchar = startchar + size
- yield t
- elif at == 1:
- t.text = text[0 - size:]
- if chars:
- t.startchar = t.endchar - size
- yield t
- else:
- for start in xrange(0, len(text) - size + 1):
- t.text = text[start:start + size]
- if chars:
- t.startchar = startchar + start
- t.endchar = startchar + start + size
- yield t
- else:
- if at == -1:
- limit = min(self.max, len(text))
- for size in xrange(self.min, limit + 1):
- t.text = text[:size]
- if chars:
- t.endchar = startchar + size
- yield t
- elif at == 1:
- if chars:
- original_startchar = t.startchar
- start = max(0, len(text) - self.max)
- for i in xrange(start, len(text) - self.min + 1):
- t.text = text[i:]
- if chars:
- t.startchar = original_startchar + i
- yield t
- else:
- for start in xrange(0, len(text) - self.min + 1):
- for size in xrange(self.min, self.max + 1):
- end = start + size
- if end > len(text):
- continue
- t.text = text[start:end]
- if chars:
- t.startchar = startchar + start
- t.endchar = startchar + end
- yield t
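- # Illustrative sketch (not part of the original module): the ``at`` argument
- # restricts the filter to the leading or trailing grams of each token. With
- # the default RegexTokenizer splitting "hello there" into two words:
- #
- #   >>> rext = RegexTokenizer()
- #   >>> [t.text for t in NgramFilter(4, at="start")(rext("hello there"))]
- #   ["hell", "ther"]
- #   >>> [t.text for t in NgramFilter(4, at="end")(rext("hello there"))]
- #   ["ello", "here"]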
- # Analyzers
- def NgramAnalyzer(minsize, maxsize=None):
- """Composes an NgramTokenizer and a LowercaseFilter.
- >>> ana = NgramAnalyzer(4)
- >>> [token.text for token in ana("hi there")]
- ["hi t", "i th", " the", "ther", "here"]
- """
- return NgramTokenizer(minsize, maxsize=maxsize) | LowercaseFilter()
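- # Illustrative sketch (not part of the original module): because the analyzer
- # chains a LowercaseFilter after the tokenizer, mixed-case input produces the
- # same grams as lowercase input:
- #
- #   >>> ana = NgramAnalyzer(4)
- #   >>> [t.text for t in ana("Hi There")]
- #   ["hi t", "i th", " the", "ther", "here"]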
- def NgramWordAnalyzer(minsize, maxsize=None, tokenizer=None, at=None):
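- """Composes a word tokenizer (a RegexTokenizer by default), a
- LowercaseFilter, and an NgramFilter, so that grams are taken from within
- individual words and never span whitespace. The minsize, maxsize, and at
- arguments are passed through to NgramFilter.
- """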
- if not tokenizer:
- tokenizer = RegexTokenizer()
- return tokenizer | LowercaseFilter() | NgramFilter(minsize, maxsize, at=at)
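- # Illustrative sketch (not part of the original module): unlike NgramAnalyzer,
- # NgramWordAnalyzer tokenizes into words first, so no gram ever contains
- # whitespace:
- #
- #   >>> ana = NgramWordAnalyzer(4)
- #   >>> [t.text for t in ana("hi there")]
- #   ["ther", "here"]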