# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from whoosh.compat import u, text_type
from whoosh.analysis.acore import Composable, Token
from whoosh.util.text import rcompile


# The default token pattern: runs of word characters, optionally joined by
# single internal periods (so "3.141" and "a.b.c" each stay one token).
default_pattern = rcompile(r"\w+(\.?\w+)*")


# Tokenizers

class Tokenizer(Composable):
    """Base class for Tokenizers.
    """

    def __eq__(self, other):
        return other and self.__class__ is other.__class__


class IDTokenizer(Tokenizer):
    """Yields the entire input string as a single token. For use in indexed
    but untokenized fields, such as a document's path.

    >>> idt = IDTokenizer()
    >>> [token.text for token in idt("/a/b 123 alpha")]
    ["/a/b 123 alpha"]
    """

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        t.text = value
        t.boost = 1.0
        if keeporiginal:
            t.original = value
        if positions:
            t.pos = start_pos + 1
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t


class RegexTokenizer(Tokenizer):
    """
    Uses a regular expression to extract tokens from text.

    >>> rex = RegexTokenizer()
    >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))]
    ["hi", "there", "3.141", "big", "time", "under_score"]
    """

    def __init__(self, expression=default_pattern, gaps=False):
        """
        :param expression: A regular expression object or string. Each match
            of the expression equals a token. Group 0 (the entire matched text)
            is used as the text of the token. If you require more complicated
            handling of the expression match, simply write your own tokenizer.
        :param gaps: If True, the tokenizer *splits* on the expression, rather
            than matching on the expression.
        """

        self.expression = rcompile(expression)
        self.gaps = gaps

    def __eq__(self, other):
        if self.__class__ is other.__class__:
            if self.expression.pattern == other.expression.pattern:
                return True
        return False

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        elif not self.gaps:
            # The default: expression matches are used as tokens
            for pos, match in enumerate(self.expression.finditer(value)):
                t.text = match.group(0)
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char + match.start()
                    t.endchar = start_char + match.end()
                yield t
        else:
            # When gaps=True, iterate through the matches and yield the text
            # between them.
            prevend = 0
            pos = start_pos
            for match in self.expression.finditer(value):
                start = prevend
                end = match.start()
                text = value[start:end]
                if text:
                    t.text = text
                    t.boost = 1.0
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                        pos += 1
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end
                    yield t
                prevend = match.end()

            # If the last "gap" was before the end of the text, yield the
            # last bit of text as a final token.
            if prevend < len(value):
                t.text = value[prevend:]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = start_char + prevend
                    t.endchar = start_char + len(value)
                yield t
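
# Illustrative sketch (not part of the original module): with gaps=True the
# expression describes the separators rather than the tokens, so the text
# *between* matches is yielded. For example, splitting on runs of commas:
#
#   >>> splitter = RegexTokenizer(r",+", gaps=True)
#   >>> [t.text for t in splitter(u("a,b,,c"))]
#   ["a", "b", "c"]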


class CharsetTokenizer(Tokenizer):
    """Tokenizes and translates text according to a character mapping object.
    Characters that map to None are considered token break characters. For all
    other characters the map is used to translate the character. This is
    useful for case and accent folding.

    This tokenizer loops character-by-character and so will likely be much
    slower than :class:`RegexTokenizer`.

    One way to get a character mapping object is to convert a Sphinx charset
    table file using :func:`whoosh.support.charset.charset_table_to_dict`.

    >>> from whoosh.support.charset import charset_table_to_dict
    >>> from whoosh.support.charset import default_charset
    >>> charmap = charset_table_to_dict(default_charset)
    >>> chtokenizer = CharsetTokenizer(charmap)
    >>> [t.text for t in chtokenizer(u'Stra\\xdfe ABC')]
    [u'strase', u'abc']

    The Sphinx charset table format is described at
    http://www.sphinxsearch.com/docs/current.html#conf-charset-table.
    """

    __inittype__ = dict(charmap=str)

    def __init__(self, charmap):
        """
        :param charmap: a mapping from integer character numbers to unicode
            characters, as used by the unicode.translate() method.
        """

        self.charmap = charmap

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.charmap == other.charmap)

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%r is not unicode" % value

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            text = u("")
            charmap = self.charmap
            pos = start_pos
            startchar = currentchar = start_char
            for char in value:
                tchar = charmap[ord(char)]
                if tchar:
                    # Translatable character: add it to the current token
                    text += tchar
                else:
                    # Break character: emit the token collected so far
                    if currentchar > startchar:
                        t.text = text
                        t.boost = 1.0
                        if keeporiginal:
                            t.original = t.text
                        if positions:
                            t.pos = pos
                            pos += 1
                        if chars:
                            t.startchar = startchar
                            t.endchar = currentchar
                        yield t
                    startchar = currentchar + 1
                    text = u("")
                currentchar += 1

            # Emit any translated text left over at the end of the input
            if currentchar > startchar:
                t.text = text
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = startchar
                    t.endchar = currentchar
                yield t
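
# Illustrative sketch (not part of the original module): a hand-built charmap
# that lowercases ASCII letters and treats every other character as a token
# break. A real charmap, such as one produced by charset_table_to_dict, works
# the same way.
#
#   >>> charmap = dict((n, None) for n in range(128))
#   >>> charmap.update((n, chr(n).lower()) for n in range(128) if chr(n).isalpha())
#   >>> ct = CharsetTokenizer(charmap)
#   >>> [t.text for t in ct(u("Hello world"))]
#   ["hello", "world"]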


def SpaceSeparatedTokenizer():
    """Returns a RegexTokenizer that splits tokens by whitespace.

    >>> sst = SpaceSeparatedTokenizer()
    >>> [token.text for token in sst("hi there big-time, what's up")]
    ["hi", "there", "big-time,", "what's", "up"]
    """

    return RegexTokenizer(r"[^ \t\r\n]+")


def CommaSeparatedTokenizer():
    """Splits tokens by commas.

    Note that the tokenizer calls unicode.strip() on each match of the regular
    expression.

    >>> cst = CommaSeparatedTokenizer()
    >>> [token.text for token in cst("hi there, what's , up")]
    ["hi there", "what's", "up"]
    """

    from whoosh.analysis.filters import StripFilter

    return RegexTokenizer(r"[^,]+") | StripFilter()


class PathTokenizer(Tokenizer):
    """A simple tokenizer that given a string ``"/a/b/c"`` yields tokens
    ``["/a", "/a/b", "/a/b/c"]``.
    """

    def __init__(self, expression="[^/]+"):
        self.expr = rcompile(expression)

    def __call__(self, value, positions=False, start_pos=0, **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        token = Token(positions, **kwargs)
        pos = start_pos
        for match in self.expr.finditer(value):
            token.text = value[:match.end()]
            if positions:
                token.pos = pos
                pos += 1
            yield token
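
# Usage sketch (not part of the original module), mirroring the behavior
# described in the PathTokenizer docstring. Note that the tokenizer reuses a
# single Token object, so copy token.text inside the loop rather than keeping
# references to the yielded tokens.
#
#   >>> pt = PathTokenizer()
#   >>> [t.text for t in pt(u("/a/b/c"))]
#   ["/a", "/a/b", "/a/b/c"]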