# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from __future__ import division

import copy
import fnmatch
import re
from collections import defaultdict

from whoosh import matching
from whoosh.analysis import Token
from whoosh.compat import bytes_type, text_type, u
from whoosh.lang.morph_en import variations
from whoosh.query import qcore


class Term(qcore.Query):
    """Matches documents containing the given term (fieldname+text pair).

    >>> Term("content", u"render")
    """

    __inittypes__ = dict(fieldname=str, text=text_type, boost=float)

    def __init__(self, fieldname, text, boost=1.0, minquality=None):
        self.fieldname = fieldname
        self.text = text
        self.boost = boost
        self.minquality = minquality

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.fieldname == other.fieldname
                and self.text == other.text
                and self.boost == other.boost)

    def __repr__(self):
        r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text)
        if self.boost != 1.0:
            r += ", boost=%s" % self.boost
        r += ")"
        return r

    def __unicode__(self):
        text = self.text
        if isinstance(text, bytes_type):
            try:
                text = text.decode("ascii")
            except UnicodeDecodeError:
                text = repr(text)

        t = u("%s:%s") % (self.fieldname, text)
        if self.boost != 1:
            t += u("^") + text_type(self.boost)
        return t

    __str__ = __unicode__

    def __hash__(self):
        return hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost)

    def has_terms(self):
        return True

    def tokens(self, boost=1.0):
        yield Token(fieldname=self.fieldname, text=self.text,
                    boost=boost * self.boost, startchar=self.startchar,
                    endchar=self.endchar, chars=True)

    def terms(self, phrases=False):
        if self.field():
            yield (self.field(), self.text)

    def replace(self, fieldname, oldtext, newtext):
        q = copy.copy(self)
        if q.fieldname == fieldname and q.text == oldtext:
            q.text = newtext
        return q

    def estimate_size(self, ixreader):
        fieldname = self.fieldname
        if fieldname not in ixreader.schema:
            return 0

        field = ixreader.schema[fieldname]
        try:
            text = field.to_bytes(self.text)
        except ValueError:
            return 0

        return ixreader.doc_frequency(fieldname, text)

    def matcher(self, searcher, context=None):
        fieldname = self.fieldname
        text = self.text
        if fieldname not in searcher.schema:
            return matching.NullMatcher()

        field = searcher.schema[fieldname]
        try:
            text = field.to_bytes(text)
        except ValueError:
            return matching.NullMatcher()

        if (self.fieldname, text) in searcher.reader():
            if context is None:
                w = searcher.weighting
            else:
                w = context.weighting

            m = searcher.postings(self.fieldname, text, weighting=w)
            if self.minquality:
                m.set_min_quality(self.minquality)
            if self.boost != 1.0:
                m = matching.WrappingMatcher(m, boost=self.boost)
            return m
        else:
            return matching.NullMatcher()


class MultiTerm(qcore.Query):
    """Abstract base class for queries that operate on multiple terms in the
    same field.
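
    Subclasses must implement :meth:`_btexts`, which yields the encoded
    (bytes) terms the query expands to in a given reader. A minimal sketch
    of that contract (``SynonymsOf`` is a hypothetical example, not part of
    Whoosh)::

        class SynonymsOf(MultiTerm):
            def __init__(self, fieldname, words, boost=1.0):
                self.fieldname = fieldname
                self.words = words
                self.boost = boost

            def _btexts(self, ixreader):
                # Yield only the encoded words that exist in the field
                to_bytes = ixreader.schema[self.fieldname].to_bytes
                for word in self.words:
                    try:
                        btext = to_bytes(word)
                    except ValueError:
                        continue
                    if (self.fieldname, btext) in ixreader:
                        yield btext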
- """
- constantscore = False
- def _btexts(self, ixreader):
- raise NotImplementedError(self.__class__.__name__)
- def expanded_terms(self, ixreader, phrases=False):
- fieldname = self.field()
- if fieldname:
- for btext in self._btexts(ixreader):
- yield (fieldname, btext)
- def tokens(self, boost=1.0, exreader=None):
- fieldname = self.field()
- if exreader is None:
- btexts = [self.text]
- else:
- btexts = self._btexts(exreader)
- for btext in btexts:
- yield Token(fieldname=fieldname, text=btext,
- boost=boost * self.boost, startchar=self.startchar,
- endchar=self.endchar, chars=True)
- def simplify(self, ixreader):
- fieldname = self.field()
- if fieldname not in ixreader.schema:
- return qcore.NullQuery()
- field = ixreader.schema[fieldname]
- existing = []
- for btext in sorted(set(self._btexts(ixreader))):
- text = field.from_bytes(btext)
- existing.append(Term(fieldname, text, boost=self.boost))
- if len(existing) == 1:
- return existing[0]
- elif existing:
- from whoosh.query import Or
- return Or(existing)
- else:
- return qcore.NullQuery
- def estimate_size(self, ixreader):
- fieldname = self.field()
- return sum(ixreader.doc_frequency(fieldname, btext)
- for btext in self._btexts(ixreader))
- def estimate_min_size(self, ixreader):
- fieldname = self.field()
- return min(ixreader.doc_frequency(fieldname, text)
- for text in self._btexts(ixreader))
- def matcher(self, searcher, context=None):
- from whoosh.query import Or
- fieldname = self.field()
- constantscore = self.constantscore
- reader = searcher.reader()
- qs = [Term(fieldname, word) for word in self._btexts(reader)
- if word]
- if not qs:
- return matching.NullMatcher()
- if len(qs) == 1:
- # If there's only one term, just use it
- m = qs[0].matcher(searcher, context)
- else:
- if constantscore:
- # To tell the sub-query that score doesn't matter, set weighting
- # to None
- if context:
- context = context.set(weighting=None)
- else:
- from whoosh.searching import SearchContext
- context = SearchContext(weighting=None)
- # Or the terms together
- m = Or(qs, boost=self.boost).matcher(searcher, context)
- return m


class PatternQuery(MultiTerm):
    """An intermediate base class for common methods of Prefix, Wildcard,
    and Regex.
    """

    __inittypes__ = dict(fieldname=str, text=text_type, boost=float)

    def __init__(self, fieldname, text, boost=1.0, constantscore=True):
        self.fieldname = fieldname
        self.text = text
        self.boost = boost
        self.constantscore = constantscore

    def __eq__(self, other):
        return (other and self.__class__ is other.__class__
                and self.fieldname == other.fieldname
                and self.text == other.text and self.boost == other.boost
                and self.constantscore == other.constantscore)

    def __repr__(self):
        r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text)
        if self.boost != 1:
            r += ", boost=%s" % self.boost
        r += ")"
        return r

    def __hash__(self):
        return (hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost)
                ^ hash(self.constantscore))

    def _get_pattern(self):
        raise NotImplementedError

    def _find_prefix(self, text):
        # Subclasses/instances should set the SPECIAL_CHARS attribute to a set
        # of characters that mark the end of the literal prefix
        specialchars = self.SPECIAL_CHARS
        for i, char in enumerate(text):
            if char in specialchars:
                return text[:i]
        # No special characters: the whole text is a literal prefix
        return text

    def _btexts(self, ixreader):
        field = ixreader.schema[self.fieldname]

        exp = re.compile(self._get_pattern())
        prefix = self._find_prefix(self.text)
        if prefix:
            candidates = ixreader.expand_prefix(self.fieldname, prefix)
        else:
            candidates = ixreader.lexicon(self.fieldname)

        from_bytes = field.from_bytes
        for btext in candidates:
            text = from_bytes(btext)
            if exp.match(text):
                yield btext


class Prefix(PatternQuery):
    """Matches documents that contain any terms that start with the given
    text.

    >>> # Match documents containing words starting with 'comp'
    >>> Prefix("content", u"comp")
    """

    def __unicode__(self):
        return "%s:%s*" % (self.fieldname, self.text)

    __str__ = __unicode__

    def _btexts(self, ixreader):
        return ixreader.expand_prefix(self.fieldname, self.text)

    def matcher(self, searcher, context=None):
        if self.text == "":
            from whoosh.query import Every
            eq = Every(self.fieldname, boost=self.boost)
            return eq.matcher(searcher, context)
        else:
            return PatternQuery.matcher(self, searcher, context)


class Wildcard(PatternQuery):
    """Matches documents that contain any terms that match a "glob" pattern.
    See the Python ``fnmatch`` module for information about globs.

    >>> Wildcard("content", u"in*f?x")
    """

    SPECIAL_CHARS = frozenset("*?[")

    def __unicode__(self):
        return "%s:%s" % (self.fieldname, self.text)

    __str__ = __unicode__

    def _get_pattern(self):
        return fnmatch.translate(self.text)

    def normalize(self):
        # If there are no wildcard characters in this "wildcard", turn it into
        # a simple Term
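        #
        # Illustrative examples of the rewrites performed below:
        #   Wildcard("content", u"render")  -> Term("content", u"render")
        #   Wildcard("content", u"rend*")   -> Prefix("content", u"rend")
        #   Wildcard("content", u"*")       -> Every("content")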
        text = self.text
        if text == "*":
            from whoosh.query import Every
            return Every(self.fieldname, boost=self.boost)
        if "*" not in text and "?" not in text:
            # If no wildcard chars, convert to a normal term.
            return Term(self.fieldname, self.text, boost=self.boost)
        elif ("?" not in text and text.endswith("*")
              and text.find("*") == len(text) - 1):
            # If the only wildcard char is an asterisk at the end, convert to
            # a Prefix query.
            return Prefix(self.fieldname, self.text[:-1], boost=self.boost)
        else:
            return self

    def matcher(self, searcher, context=None):
        if self.text == "*":
            from whoosh.query import Every
            eq = Every(self.fieldname, boost=self.boost)
            return eq.matcher(searcher, context)
        else:
            return PatternQuery.matcher(self, searcher, context)

    # _btexts() implemented in PatternQuery


class Regex(PatternQuery):
    """Matches documents that contain any terms that match a regular
    expression. See the Python ``re`` module for information about regular
    expressions.
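
    >>> # Match documents containing terms that match the pattern
    >>> Regex("content", u"rend.*ing")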
- """
- SPECIAL_CHARS = frozenset("{}()[].?*+^$\\")
- def __unicode__(self):
- return '%s:r"%s"' % (self.fieldname, self.text)
- __str__ = __unicode__
- def _get_pattern(self):
- return self.text
- def _find_prefix(self, text):
- if "|" in text:
- return ""
- if text.startswith("^"):
- text = text[1:]
- elif text.startswith("\\A"):
- text = text[2:]
- prefix = PatternQuery._find_prefix(self, text)
- lp = len(prefix)
- if lp < len(text) and text[lp] in "*?":
- # we stripped something starting from * or ? - they both MAY mean
- # "0 times". As we had stripped starting from FIRST special char,
- # that implies there were only ordinary chars left of it. Thus,
- # the very last of them is not part of the real prefix:
- prefix = prefix[:-1]
- return prefix
- def matcher(self, searcher, context=None):
- if self.text == ".*":
- from whoosh.query import Every
- eq = Every(self.fieldname, boost=self.boost)
- return eq.matcher(searcher, context)
- else:
- return PatternQuery.matcher(self, searcher, context)
- # _btexts() implemented in PatternQuery


class ExpandingTerm(MultiTerm):
    """Intermediate base class for queries such as FuzzyTerm and Variations
    that expand into multiple queries, but come from a single term.
    """

    def has_terms(self):
        return True

    def terms(self, phrases=False):
        if self.field():
            yield (self.field(), self.text)


class FuzzyTerm(ExpandingTerm):
    """Matches documents containing words similar to the given term.
- """
- __inittypes__ = dict(fieldname=str, text=text_type, boost=float,
- maxdist=float, prefixlength=int)
- def __init__(self, fieldname, text, boost=1.0, maxdist=1,
- prefixlength=1, constantscore=True):
- """
- :param fieldname: The name of the field to search.
- :param text: The text to search for.
- :param boost: A boost factor to apply to scores of documents matching
- this query.
- :param maxdist: The maximum edit distance from the given text.
- :param prefixlength: The matched terms must share this many initial
- characters with 'text'. For example, if text is "light" and
- prefixlength is 2, then only terms starting with "li" are checked
- for similarity.
- """
- self.fieldname = fieldname
- self.text = text
- self.boost = boost
- self.maxdist = maxdist
- self.prefixlength = prefixlength
- self.constantscore = constantscore
- def __eq__(self, other):
- return (other and self.__class__ is other.__class__
- and self.fieldname == other.fieldname
- and self.text == other.text
- and self.maxdist == other.maxdist
- and self.prefixlength == other.prefixlength
- and self.boost == other.boost
- and self.constantscore == other.constantscore)
- def __repr__(self):
- r = "%s(%r, %r, boost=%f, maxdist=%d, prefixlength=%d)"
- return r % (self.__class__.__name__, self.fieldname, self.text,
- self.boost, self.maxdist, self.prefixlength)
- def __unicode__(self):
- r = u("%s:%s") % (self.fieldname, self.text) + u("~")
- if self.maxdist > 1:
- r += u("%d") % self.maxdist
- if self.boost != 1.0:
- r += u("^%f") % self.boost
- return r
- __str__ = __unicode__
- def __hash__(self):
- return (hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost)
- ^ hash(self.maxdist) ^ hash(self.prefixlength)
- ^ hash(self.constantscore))
- def _btexts(self, ixreader):
- return ixreader.terms_within(self.fieldname, self.text, self.maxdist,
- prefix=self.prefixlength)
- def replace(self, fieldname, oldtext, newtext):
- q = copy.copy(self)
- if q.fieldname == fieldname and q.text == oldtext:
- q.text = newtext
- return q


class Variations(ExpandingTerm):
    """Query that automatically searches for morphological variations of the
    given word in the same field.
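
    >>> # For example, may also match terms such as "renders", "rendered",
    >>> # or "rendering" if they occur in the field
    >>> Variations("content", u"render")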
- """
- def __init__(self, fieldname, text, boost=1.0):
- self.fieldname = fieldname
- self.text = text
- self.boost = boost
- def __repr__(self):
- r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text)
- if self.boost != 1:
- r += ", boost=%s" % self.boost
- r += ")"
- return r
- def __eq__(self, other):
- return (other and self.__class__ is other.__class__
- and self.fieldname == other.fieldname
- and self.text == other.text and self.boost == other.boost)
- def __hash__(self):
- return hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost)
- def _btexts(self, ixreader):
- fieldname = self.fieldname
- to_bytes = ixreader.schema[fieldname].to_bytes
- for word in variations(self.text):
- try:
- btext = to_bytes(word)
- except ValueError:
- continue
- if (fieldname, btext) in ixreader:
- yield btext
- def __unicode__(self):
- return u("%s:<%s>") % (self.fieldname, self.text)
- __str__ = __unicode__
- def replace(self, fieldname, oldtext, newtext):
- q = copy.copy(self)
- if q.fieldname == fieldname and q.text == oldtext:
- q.text = newtext
- return q