123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572 |
- # Copyright 2010 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- from __future__ import division
- from whoosh.compat import xrange
- from whoosh.matching import mcore
- class WrappingMatcher(mcore.Matcher):
- """Base class for matchers that wrap sub-matchers.
- """
- def __init__(self, child, boost=1.0):
- self.child = child
- self.boost = boost
- def __repr__(self):
- return "%s(%r, boost=%s)" % (self.__class__.__name__, self.child,
- self.boost)
- def copy(self):
- kwargs = {}
- if hasattr(self, "boost"):
- kwargs["boost"] = self.boost
- return self.__class__(self.child.copy(), **kwargs)
- def depth(self):
- return 1 + self.child.depth()
- def _replacement(self, newchild):
- return self.__class__(newchild, boost=self.boost)
- def replace(self, minquality=0):
- # Replace the child matcher
- r = self.child.replace(minquality)
- if r is not self.child:
- # If the child changed, return a new wrapper on the new child
- return self._replacement(r)
- else:
- return self
- def id(self):
- return self.child.id()
- def all_ids(self):
- return self.child.all_ids()
- def is_active(self):
- return self.child.is_active()
- def reset(self):
- self.child.reset()
- def children(self):
- return [self.child]
- def supports(self, astype):
- return self.child.supports(astype)
- def value(self):
- return self.child.value()
- def value_as(self, astype):
- return self.child.value_as(astype)
- def spans(self):
- return self.child.spans()
- def skip_to(self, id):
- return self.child.skip_to(id)
- def next(self):
- self.child.next()
- def supports_block_quality(self):
- return self.child.supports_block_quality()
- def skip_to_quality(self, minquality):
- return self.child.skip_to_quality(minquality / self.boost)
- def max_quality(self):
- return self.child.max_quality() * self.boost
- def block_quality(self):
- return self.child.block_quality() * self.boost
- def weight(self):
- return self.child.weight() * self.boost
- def score(self):
- return self.child.score() * self.boost
- class MultiMatcher(mcore.Matcher):
- """Serializes the results of a list of sub-matchers.
- """
- def __init__(self, matchers, idoffsets, scorer=None, current=0):
- """
- :param matchers: a list of Matcher objects.
- :param idoffsets: a list of offsets corresponding to items in the
- ``matchers`` list.
- """
- self.matchers = matchers
- self.offsets = idoffsets
- self.scorer = scorer
- self.current = current
- self._next_matcher()
- def __repr__(self):
- return "%s(%r, %r, current=%s)" % (self.__class__.__name__,
- self.matchers, self.offsets,
- self.current)
- def is_active(self):
- return self.current < len(self.matchers)
- def reset(self):
- for mr in self.matchers:
- mr.reset()
- self.current = 0
- def children(self):
- return [self.matchers[self.current]]
- def _next_matcher(self):
- matchers = self.matchers
- while (self.current < len(matchers)
- and not matchers[self.current].is_active()):
- self.current += 1
- def copy(self):
- return self.__class__([mr.copy() for mr in self.matchers],
- self.offsets, current=self.current)
- def depth(self):
- if self.is_active():
- return 1 + max(mr.depth() for mr in self.matchers[self.current:])
- else:
- return 0
- def replace(self, minquality=0):
- m = self
- if minquality:
- # Skip sub-matchers that don't have a high enough max quality to
- # contribute
- while (m.is_active()
- and m.matchers[m.current].max_quality() < minquality):
- m = self.__class__(self.matchers, self.offsets, self.scorer,
- m.current + 1)
- m._next_matcher()
- if not m.is_active():
- return mcore.NullMatcher()
- # TODO: Possible optimization: if the last matcher is current, replace
- # this with the last matcher, but wrap it with a matcher that adds the
- # offset. Have to check whether that's actually faster, though.
- return m
- def id(self):
- current = self.current
- return self.matchers[current].id() + self.offsets[current]
- def all_ids(self):
- offsets = self.offsets
- for i, mr in enumerate(self.matchers):
- for id in mr.all_ids():
- yield id + offsets[i]
- def spans(self):
- return self.matchers[self.current].spans()
- def supports(self, astype):
- return self.matchers[self.current].supports(astype)
- def value(self):
- return self.matchers[self.current].value()
- def value_as(self, astype):
- return self.matchers[self.current].value_as(astype)
- def next(self):
- if not self.is_active():
- raise mcore.ReadTooFar
- self.matchers[self.current].next()
- if not self.matchers[self.current].is_active():
- self._next_matcher()
- def skip_to(self, id):
- if not self.is_active():
- raise mcore.ReadTooFar
- if id <= self.id():
- return
- matchers = self.matchers
- offsets = self.offsets
- r = False
- while self.current < len(matchers) and id > self.id():
- mr = matchers[self.current]
- sr = mr.skip_to(id - offsets[self.current])
- r = sr or r
- if mr.is_active():
- break
- self._next_matcher()
- return r
- def supports_block_quality(self):
- return all(mr.supports_block_quality() for mr
- in self.matchers[self.current:])
- def max_quality(self):
- return max(m.max_quality() for m in self.matchers[self.current:])
- def block_quality(self):
- return self.matchers[self.current].block_quality()
- def weight(self):
- return self.matchers[self.current].weight()
- def score(self):
- return self.scorer.score(self)
- def ExcludeMatcher(child, excluded, boost=1.0):
- return FilterMatcher(child, excluded, exclude=True, boost=boost)
- class FilterMatcher(WrappingMatcher):
- """Filters the postings from the wrapped based on whether the IDs are
- present in or absent from a set.
- """
- def __init__(self, child, ids, exclude=False, boost=1.0):
- """
- :param child: the child matcher.
- :param ids: a set of IDs to filter by.
- :param exclude: by default, only IDs from the wrapped matcher that are
- **in** the set are used. If this argument is True, only IDs from
- the wrapped matcher that are **not in** the set are used.
- """
- super(FilterMatcher, self).__init__(child)
- self._ids = ids
- self._exclude = exclude
- self.boost = boost
- self._find_next()
- def __repr__(self):
- return "%s(%r, %r, %r, boost=%s)" % (self.__class__.__name__,
- self.child, self._ids,
- self._exclude, self.boost)
- def reset(self):
- self.child.reset()
- self._find_next()
- def copy(self):
- return self.__class__(self.child.copy(), self._ids, self._exclude,
- boost=self.boost)
- def _replacement(self, newchild):
- return self.__class__(newchild, self._ids, exclude=self._exclude,
- boost=self.boost)
- def _find_next(self):
- child = self.child
- ids = self._ids
- r = False
- if self._exclude:
- while child.is_active() and child.id() in ids:
- r = child.next() or r
- else:
- while child.is_active() and child.id() not in ids:
- r = child.next() or r
- return r
- def next(self):
- self.child.next()
- self._find_next()
- def skip_to(self, id):
- self.child.skip_to(id)
- self._find_next()
- def all_ids(self):
- ids = self._ids
- if self._exclude:
- return (id for id in self.child.all_ids() if id not in ids)
- else:
- return (id for id in self.child.all_ids() if id in ids)
- def all_items(self):
- ids = self._ids
- if self._exclude:
- return (item for item in self.child.all_items()
- if item[0] not in ids)
- else:
- return (item for item in self.child.all_items() if item[0] in ids)
- class InverseMatcher(WrappingMatcher):
- """Synthetic matcher, generates postings that are NOT present in the
- wrapped matcher.
- """
- def __init__(self, child, limit, missing=None, weight=1.0, id=0):
- super(InverseMatcher, self).__init__(child)
- self.limit = limit
- self._weight = weight
- self.missing = missing or (lambda id: False)
- self._id = id
- self._find_next()
- def copy(self):
- return self.__class__(self.child.copy(), self.limit,
- weight=self._weight, missing=self.missing,
- id=self._id)
- def _replacement(self, newchild):
- return self.__class__(newchild, self.limit, missing=self.missing,
- weight=self._weight, id=self._id)
- def is_active(self):
- return self._id < self.limit
- def reset(self):
- self.child.reset()
- self._id = 0
- self._find_next()
- def supports_block_quality(self):
- return False
- def _find_next(self):
- child = self.child
- missing = self.missing
- # If the current docnum isn't missing and the child matcher is
- # exhausted (so we don't have to worry about skipping its matches), we
- # don't have to do anything
- if not child.is_active() and not missing(self._id):
- return
- # Skip missing documents
- while self._id < self.limit and missing(self._id):
- self._id += 1
- # Catch the child matcher up to where this matcher is
- if child.is_active() and child.id() < self._id:
- child.skip_to(self._id)
- # While self._id is missing or is in the child matcher, increase it
- while child.is_active() and self._id < self.limit:
- if missing(self._id):
- self._id += 1
- continue
- if self._id == child.id():
- self._id += 1
- child.next()
- continue
- break
- def id(self):
- return self._id
- def all_ids(self):
- return mcore.Matcher.all_ids(self)
- def next(self):
- if self._id >= self.limit:
- raise mcore.ReadTooFar
- self._id += 1
- self._find_next()
- def skip_to(self, id):
- if self._id >= self.limit:
- raise mcore.ReadTooFar
- if id < self._id:
- return
- self._id = id
- self._find_next()
- def weight(self):
- return self._weight
- def score(self):
- return self._weight
- class RequireMatcher(WrappingMatcher):
- """Matches postings that are in both sub-matchers, but only uses scores
- from the first.
- """
- def __init__(self, a, b):
- from whoosh.matching.binary import IntersectionMatcher
- self.a = a
- self.b = b
- WrappingMatcher.__init__(self, IntersectionMatcher(a, b))
- def copy(self):
- return self.__class__(self.a.copy(), self.b.copy())
- def supports_block_quality(self):
- return self.a.supports_block_quality()
- def replace(self, minquality=0):
- if not self.child.is_active():
- # If one of the sub-matchers is inactive, go inactive
- return mcore.NullMatcher()
- elif minquality and self.a.max_quality() < minquality:
- # If the required matcher doesn't have a high enough max quality
- # to possibly contribute, return an inactive matcher
- return mcore.NullMatcher()
- new_a = self.a.replace(minquality)
- new_b = self.b.replace()
- if not new_a.is_active():
- return mcore.NullMatcher()
- elif new_a is not self.a or new_b is not self.b:
- # If one of the sub-matchers changed, return a new Require
- return self.__class__(new_a, self.b)
- else:
- return self
- def max_quality(self):
- return self.a.max_quality()
- def block_quality(self):
- return self.a.block_quality()
- def skip_to_quality(self, minquality):
- skipped = self.a.skip_to_quality(minquality)
- self.child._find_next()
- return skipped
- def weight(self):
- return self.a.weight()
- def score(self):
- return self.a.score()
- def supports(self, astype):
- return self.a.supports(astype)
- def value(self):
- return self.a.value()
- def value_as(self, astype):
- return self.a.value_as(astype)
- class ConstantScoreWrapperMatcher(WrappingMatcher):
- def __init__(self, child, score=1.0):
- WrappingMatcher.__init__(self, child)
- self._score = score
- def copy(self):
- return self.__class__(self.child.copy(), score=self._score)
- def _replacement(self, newchild):
- return self.__class__(newchild, score=self._score)
- def max_quality(self):
- return self._score
- def block_quality(self):
- return self._score
- def score(self):
- return self._score
- class SingleTermMatcher(WrappingMatcher):
- """Makes a tree of matchers act as if they were a matcher for a single
- term for the purposes of "what terms are matching?" questions.
- """
- def __init__(self, child, term):
- WrappingMatcher.__init__(self, child)
- self._term = term
- def term(self):
- return self._term
- def replace(self, minquality=0):
- return self
- class CoordMatcher(WrappingMatcher):
- """Modifies the computed score to penalize documents that don't match all
- terms in the matcher tree.
- Because this matcher modifies the score, it may give unexpected results
- when compared to another matcher returning the unmodified score.
- """
- def __init__(self, child, scale=1.0):
- WrappingMatcher.__init__(self, child)
- self._termcount = len(list(child.term_matchers()))
- self._maxqual = child.max_quality()
- self._scale = scale
- def _replacement(self, newchild):
- return self.__class__(newchild, scale=self._scale)
- def _sqr(self, score, matching):
- # This is the "SQR" (Short Query Ranking) function used by Apple's old
- # V-twin search library, described in the paper "V-Twin: A Lightweight
- # Engine for Interactive Use".
- #
- # http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.56.1916
- # score - document score using the current weighting function
- # matching - number of matching terms in the current document
- termcount = self._termcount # Number of terms in this tree
- scale = self._scale # Scaling factor
- sqr = ((score + ((matching - 1) / (termcount - scale) ** 2))
- * ((termcount - 1) / termcount))
- return sqr
- def max_quality(self):
- return self._sqr(self.child.max_quality(), self._termcount)
- def block_quality(self):
- return self._sqr(self.child.block_quality(), self._termcount)
- def score(self):
- child = self.child
- score = child.score()
- matching = 0
- for _ in child.matching_terms(child.id()):
- matching += 1
- return self._sqr(score, matching)
|