|
- # Copyright 2010 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- from whoosh.matching import mcore
class BiMatcher(mcore.Matcher):
    """Base class for matchers that combine the results of two sub-matchers in
    some way.

    The two sub-matchers are stored on the instance as ``a`` and ``b``.
    """

    def __init__(self, a, b):
        """
        :param a: the first sub-matcher.
        :param b: the second sub-matcher.
        """

        super(BiMatcher, self).__init__()
        self.a, self.b = a, b

    def reset(self):
        """Resets both sub-matchers, ``a`` first and then ``b``."""

        for sub in (self.a, self.b):
            sub.reset()

    def __repr__(self):
        name = self.__class__.__name__
        return "%s(%r, %r)" % (name, self.a, self.b)

    def children(self):
        """Returns a new list containing the two sub-matchers."""

        return [self.a, self.b]

    def copy(self):
        """Returns a new instance of this class wrapping copies of the two
        sub-matchers.
        """

        a_copy = self.a.copy()
        b_copy = self.b.copy()
        return self.__class__(a_copy, b_copy)

    def depth(self):
        """Returns one more than the depth of the deeper sub-matcher."""

        deepest = max(self.a.depth(), self.b.depth())
        return 1 + deepest

    def skip_to(self, id):
        """Calls ``skip_to(id)`` on both sub-matchers.

        :param id: the document id passed on to the sub-matchers.
        :returns: the result from ``a`` if it is truthy, otherwise the result
            from ``b``.
        :raises mcore.ReadTooFar: if this matcher is not active.
        """

        if not self.is_active():
            raise mcore.ReadTooFar

        # Both sub-matchers have to be moved, so gather both results before
        # combining them; ``or``-ing the calls directly would short-circuit
        # and could leave ``b`` where it was.
        moved_a, moved_b = [sub.skip_to(id) for sub in (self.a, self.b)]
        return moved_a or moved_b

    def supports_block_quality(self):
        """Reports block quality support only if both sub-matchers do.

        ``b`` is not asked when ``a`` already answers with a false value.
        """

        a_ok = self.a.supports_block_quality()
        return a_ok and self.b.supports_block_quality()

    def supports(self, astype):
        """Reports support for ``astype`` only if both sub-matchers support
        it.

        :param astype: the value passed on to each sub-matcher's
            ``supports()``.
        """

        a_ok = self.a.supports(astype)
        return a_ok and self.b.supports(astype)
class AdditiveBiMatcher(BiMatcher):
    """Base class for binary matchers where the scores of the sub-matchers are
    added together.
    """

    def max_quality(self):
        """Returns the sum of ``max_quality()`` over the sub-matchers that
        are currently active (``0.0`` if neither is).
        """

        return sum((sub.max_quality() for sub in (self.a, self.b)
                    if sub.is_active()), 0.0)

    def block_quality(self):
        """Returns the sum of ``block_quality()`` over the sub-matchers that
        are currently active (``0.0`` if neither is).
        """

        return sum((sub.block_quality() for sub in (self.a, self.b)
                    if sub.is_active()), 0.0)

    def weight(self):
        """Returns the sum of the two sub-matchers' weights."""

        a_weight = self.a.weight()
        b_weight = self.b.weight()
        return a_weight + b_weight

    def score(self):
        """Returns the sum of the two sub-matchers' scores."""

        a_score = self.a.score()
        b_score = self.b.score()
        return a_score + b_score

    # Comparison protocol: every test below ultimately depends only on
    # whether ``other`` is an instance of exactly this matcher's class.
    #
    # NOTE(review): ``__eq__`` is defined without ``__hash__``; on Python 3
    # that makes instances unhashable. Left unchanged here -- confirm whether
    # any caller needs to hash matchers.

    def __eq__(self, other):
        return type(other) is self.__class__

    def __lt__(self, other):
        # Same test as __eq__: true when ``other`` has exactly this class.
        return self.__class__ is type(other)

    def __ne__(self, other):
        equal = self.__eq__(other)
        return not equal

    def __gt__(self, other):
        if self.__lt__(other):
            return False
        return not self.__eq__(other)

    def __le__(self, other):
        equal = self.__eq__(other)
        return equal or self.__lt__(other)

    def __ge__(self, other):
        equal = self.__eq__(other)
        return equal or self.__gt__(other)
class UnionMatcher(AdditiveBiMatcher):
    """Matches the union (OR) of the postings in the two sub-matchers.

    The current document id is cached in ``_id``; every method that moves a
    sub-matcher clears the cache by setting it back to ``None``.
    """

    # Cached result of id(), or None when it has to be recomputed
    _id = None

    def replace(self, minquality=0):
        """Returns a possibly simpler matcher to use in place of this one.

        :param minquality: the quality a posting must be able to reach to be
            worth returning; a false value disables quality-based rewriting.
        :returns: ``self``, a new matcher of this class wrapping replaced
            sub-matchers, one of the sub-matchers' replacements, an
            ``IntersectionMatcher``/``AndMaybeMatcher`` rewrite, or
            ``mcore.NullMatcher()``.
        """

        a, b = self.a, self.b
        a_active = a.is_active()
        b_active = b.is_active()

        if minquality and a_active and b_active:
            a_weak = a.max_quality() < minquality
            b_weak = b.max_quality() < minquality
            if a_weak and b_weak:
                # Neither side can reach minquality alone, so only documents
                # matched by both sides can qualify
                return IntersectionMatcher(a, b).replace(minquality)
            if a_weak:
                # Only b can qualify alone; a can merely add to b
                return AndMaybeMatcher(b, a)
            if b_weak:
                # Only a can qualify alone; b can merely add to a
                return AndMaybeMatcher(a, b)

        # With an inactive sub-matcher the union collapses to whatever is left
        if not a_active and not b_active:
            return mcore.NullMatcher()
        if not a_active:
            return b.replace(minquality)
        if not b_active:
            return a.replace(minquality)

        # Each side only has to supply what the other side cannot. Note that
        # b's threshold is computed from the already replaced a.
        if minquality:
            a = a.replace(minquality - b.max_quality())
            b = b.replace(minquality - a.max_quality())
        else:
            a = a.replace(0)
            b = b.replace(0)

        if a is self.a and b is self.b:
            # Nothing was replaced; keep this matcher but drop the cached id
            self._id = None
            return self
        return self.__class__(a, b)

    def is_active(self):
        """Returns a true value while at least one sub-matcher is active."""

        a_active = self.a.is_active()
        return a_active or self.b.is_active()

    def skip_to(self, id):
        """Calls ``skip_to(id)`` on every sub-matcher that is still active.

        :param id: the document id passed on to the sub-matchers.
        :returns: the result from ``a`` if it is truthy, otherwise the result
            from ``b``; an inactive sub-matcher counts as ``False``.
        """

        self._id = None
        moved_a = self.a.skip_to(id) if self.a.is_active() else False
        moved_b = self.b.skip_to(id) if self.b.is_active() else False
        return moved_a or moved_b

    def id(self):
        """Returns the current document id: the smaller of the two
        sub-matchers' ids, or the id of the only active sub-matcher. The
        value is cached until the matcher is moved.
        """

        cached = self._id
        if cached is not None:
            return cached

        a, b = self.a, self.b
        if not a.is_active():
            current = b.id()
        elif not b.is_active():
            current = a.id()
        else:
            current = min(a.id(), b.id())

        self._id = current
        return current

    # Using sets is faster in most cases, but could potentially use a lot of
    # memory. The set-based override is kept here, disabled:
    #def all_ids(self):
    #    return iter(sorted(set(self.a.all_ids()) | set(self.b.all_ids())))

    def next(self):
        """Advances whichever sub-matcher(s) sit on the current document.

        :returns: the result from advancing ``a`` if it is truthy, otherwise
            the result from advancing ``b``.
        :raises mcore.ReadTooFar: if neither sub-matcher is active.
        """

        self._id = None
        a, b = self.a, self.b
        a_active = a.is_active()
        b_active = b.is_active()

        if a_active and b_active:
            a_id = a.id()
            b_id = b.id()
            # When the ids are equal both comparisons hold and both
            # sub-matchers are advanced
            moved_a = a.next() if a_id <= b_id else None
            moved_b = b.next() if b_id <= a_id else None
            return moved_a or moved_b

        # Only one side (or neither) is left
        if a_active:
            return a.next()
        if b_active:
            return b.next()
        raise mcore.ReadTooFar

    def spans(self):
        """Returns the spans of the sub-matcher positioned on the current
        document, or the sorted union of both sets of spans when both
        sub-matchers are on the same document.
        """

        a, b = self.a, self.b
        if not a.is_active():
            return b.spans()
        if not b.is_active():
            return a.spans()

        a_id = a.id()
        b_id = b.id()
        if a_id < b_id:
            return a.spans()
        if b_id < a_id:
            return b.spans()
        return sorted(set(a.spans()) | set(b.spans()))

    def weight(self):
        """Returns the weight of the sub-matcher positioned on the current
        document, or the sum of both weights when both sub-matchers are on
        the same document.
        """

        a, b = self.a, self.b
        if not a.is_active():
            return b.weight()
        if not b.is_active():
            return a.weight()

        a_id = a.id()
        b_id = b.id()
        if a_id < b_id:
            return a.weight()
        if b_id < a_id:
            return b.weight()
        return a.weight() + b.weight()

    def score(self):
        """Returns the score of the sub-matcher positioned on the current
        document, or the sum of both scores when both sub-matchers are on the
        same document.
        """

        a, b = self.a, self.b
        if not a.is_active():
            return b.score()
        if not b.is_active():
            return a.score()

        a_id = a.id()
        b_id = b.id()
        if a_id < b_id:
            return a.score()
        if b_id < a_id:
            return b.score()
        return a.score() + b.score()

    def skip_to_quality(self, minquality):
        """Skips the sub-matchers ahead while their combined block quality is
        not greater than ``minquality``.

        :param minquality: the combined block quality to get above.
        :returns: the sum of the values returned by the sub-matchers'
            ``skip_to_quality()`` calls.
        :raises mcore.ReadTooFar: if neither sub-matcher is active.
        """

        self._id = None
        a, b = self.a, self.b
        a_active = a.is_active()
        b_active = b.is_active()

        if not a_active and not b_active:
            raise mcore.ReadTooFar
        # With a single active sub-matcher, simply delegate to it
        if not a_active:
            return b.skip_to_quality(minquality)
        if not b_active:
            return a.skip_to_quality(minquality)

        total = 0
        a_quality = a.block_quality()
        b_quality = b.block_quality()
        while (a.is_active() and b.is_active()
               and a_quality + b_quality <= minquality):
            # Move the side with the lower block quality; it only has to
            # make up what the other side's current block cannot supply
            if a_quality < b_quality:
                total += a.skip_to_quality(minquality - b_quality)
                a_quality = a.block_quality()
            else:
                total += b.skip_to_quality(minquality - a_quality)
                b_quality = b.block_quality()
        return total
class DisjunctionMaxMatcher(UnionMatcher):
    """Matches the union (OR) of two sub-matchers. Where both sub-matchers
    match the same posting, returns the weight/score of the higher-scoring
    posting.
    """

    # TODO: this class inherits from AdditiveBiMatcher (through UnionMatcher)
    # but it does not add the scores of the sub-matchers together (it
    # overrides all methods that perform addition). Need to clean up the
    # inheritance.

    def __init__(self, a, b, tiebreak=0.0):
        """
        :param a: the first sub-matcher.
        :param b: the second sub-matcher.
        :param tiebreak: stored on the instance and carried over by
            ``copy()`` and ``replace()``; no method of this class reads it
            when computing scores.
        """

        super(DisjunctionMaxMatcher, self).__init__(a, b)
        self.tiebreak = tiebreak

    def copy(self):
        """Returns a new instance of this class wrapping copies of the
        sub-matchers, with the same ``tiebreak``.
        """

        return self.__class__(self.a.copy(), self.b.copy(),
                              tiebreak=self.tiebreak)

    def replace(self, minquality=0):
        """Returns a possibly simpler matcher to use in place of this one.

        :param minquality: the quality a posting must be able to reach to be
            worth returning; a false value disables quality-based rewriting.
        :returns: ``self``, a new matcher of this class (keeping
            ``tiebreak``), one of the sub-matchers' replacements, or
            ``mcore.NullMatcher()``.
        """

        a = self.a
        b = self.b
        a_active = a.is_active()
        b_active = b.is_active()

        # DisMax takes the max of the sub-matcher qualities instead of adding
        # them, so we need special logic here
        if minquality and a_active and b_active:
            a_max = a.max_quality()
            b_max = b.max_quality()

            if a_max < minquality and b_max < minquality:
                # If neither sub-matcher has a high enough max quality to
                # contribute, return an inactive matcher
                return mcore.NullMatcher()
            elif b_max < minquality:
                # If the b matcher can't contribute, return a
                return a.replace(minquality)
            elif a_max < minquality:
                # If the a matcher can't contribute, return b
                return b.replace(minquality)

        if not (a_active or b_active):
            return mcore.NullMatcher()
        elif not a_active:
            return b.replace(minquality)
        elif not b_active:
            return a.replace(minquality)

        # We CAN pass the minquality down here, since we don't add the two
        # scores together
        a = a.replace(minquality)
        b = b.replace(minquality)
        a_active = a.is_active()
        b_active = b.is_active()

        # It's kind of tedious to check for inactive sub-matchers all over
        # again here after we replace them, but it's probably better than
        # returning a replacement with an inactive sub-matcher.
        # This is a union, so it is only empty when BOTH replacements are
        # inactive; with just one inactive the other one is the answer.
        if not (a_active or b_active):
            return mcore.NullMatcher()
        elif not a_active:
            return b
        elif not b_active:
            return a
        elif a is not self.a or b is not self.b:
            # If one of the sub-matchers changed, return a new DisMax that
            # keeps this matcher's tiebreak (as copy() does)
            return self.__class__(a, b, tiebreak=self.tiebreak)
        else:
            # Drop the cached id, as UnionMatcher.replace() does
            self._id = None
            return self

    def score(self):
        """Returns the score for the current document: the score of the
        sub-matcher positioned on it, or the higher of the two scores when
        both sub-matchers are on the same document.
        """

        a = self.a
        b = self.b
        if not a.is_active():
            return b.score()
        if not b.is_active():
            return a.score()

        # Only take the max when both sub-matchers are on the same document;
        # otherwise the one that is further ahead would be reporting the
        # score of a different document.
        id_a = a.id()
        id_b = b.id()
        if id_a < id_b:
            return a.score()
        elif id_b < id_a:
            return b.score()
        else:
            return max(a.score(), b.score())

    def max_quality(self):
        """Returns the highest ``max_quality()`` among the active
        sub-matchers, or ``0.0`` if neither is active.
        """

        # Like AdditiveBiMatcher, only ask sub-matchers that are active
        qualities = [sub.max_quality() for sub in (self.a, self.b)
                     if sub.is_active()]
        return max(qualities) if qualities else 0.0

    def block_quality(self):
        """Returns the highest ``block_quality()`` among the active
        sub-matchers, or ``0.0`` if neither is active.
        """

        # Like AdditiveBiMatcher, only ask sub-matchers that are active
        qualities = [sub.block_quality() for sub in (self.a, self.b)
                     if sub.is_active()]
        return max(qualities) if qualities else 0.0

    def skip_to_quality(self, minquality):
        """Skips the sub-matchers ahead while neither of their current block
        qualities is greater than ``minquality``.

        :param minquality: the block quality at least one sub-matcher has to
            get above.
        :returns: the sum of the values returned by the sub-matchers'
            ``skip_to_quality()`` calls.
        """

        # The sub-matchers are about to move, so the document id cached by
        # the inherited id() is no longer valid
        self._id = None

        a = self.a
        b = self.b

        # Short circuit if one matcher is inactive
        if not a.is_active():
            return b.skip_to_quality(minquality)
        elif not b.is_active():
            return a.skip_to_quality(minquality)

        skipped = 0
        aq = a.block_quality()
        bq = b.block_quality()
        while a.is_active() and b.is_active() and max(aq, bq) <= minquality:
            # Scores are not added, so each side has to reach minquality on
            # its own
            if aq <= minquality:
                skipped += a.skip_to_quality(minquality)
                aq = a.block_quality()
            if bq <= minquality:
                skipped += b.skip_to_quality(minquality)
                bq = b.block_quality()
        return skipped
class IntersectionMatcher(AdditiveBiMatcher):
    """Matches the intersection (AND) of the postings in the two sub-matchers.

    While the matcher is active both sub-matchers are kept positioned on the
    same document.
    """

    def __init__(self, a, b):
        """
        :param a: the first sub-matcher.
        :param b: the second sub-matcher.
        """

        super(IntersectionMatcher, self).__init__(a, b)
        self._find_first()

    def reset(self):
        """Resets both sub-matchers and moves to the first common document."""

        self.a.reset()
        self.b.reset()
        self._find_first()

    def _find_first(self):
        # Synchronize the sub-matchers if they don't start on the same
        # document
        if (self.a.is_active()
            and self.b.is_active()
            and self.a.id() != self.b.id()):
            self._find_next()

    def replace(self, minquality=0):
        """Returns a possibly simpler matcher to use in place of this one.

        :param minquality: the combined quality a posting must be able to
            reach to be worth returning; a false value disables quality-based
            rewriting.
        :returns: ``self``, a new matcher of this class wrapping replaced
            sub-matchers, or ``mcore.NullMatcher()``.
        """

        a = self.a
        b = self.b
        a_active = a.is_active()
        b_active = b.is_active()

        if not (a_active and b_active):
            # Intersection matcher requires that both sub-matchers be active
            return mcore.NullMatcher()

        if minquality:
            a_max = a.max_quality()
            b_max = b.max_quality()
            if a_max + b_max < minquality:
                # If the combined quality of the sub-matchers can't contribute,
                # return an inactive matcher
                return mcore.NullMatcher()

            # Require that the replacements be able to contribute results
            # higher than the minquality
            a_min = minquality - b_max
            b_min = minquality - a_max
        else:
            a_min = b_min = 0

        a = a.replace(a_min)
        b = b.replace(b_min)

        if not (a.is_active() and b.is_active()):
            # An intersection needs both sides. Returning the surviving
            # sub-matcher alone would produce documents the other side never
            # matched, so the result is empty instead.
            return mcore.NullMatcher()
        elif a is not self.a or b is not self.b:
            return self.__class__(a, b)
        else:
            return self

    def is_active(self):
        """Returns a true value only while both sub-matchers are active."""

        return self.a.is_active() and self.b.is_active()

    def _find_next(self):
        """Leapfrogs the sub-matchers until they are on the same document or
        one of them is exhausted.

        Must only be called while both sub-matchers are active and on
        different documents.

        :returns: a true value if one of the ``skip_to()`` calls returned
            one; ``None`` if a sub-matcher ran out.
        """

        a = self.a
        b = self.b
        a_id = a.id()
        b_id = b.id()
        assert a_id != b_id
        r = False

        while a.is_active() and b.is_active() and a_id != b_id:
            # Always move the matcher that is behind up to the other one
            if a_id < b_id:
                ra = a.skip_to(b_id)
                if not a.is_active():
                    return
                r = r or ra
                a_id = a.id()
            else:
                rb = b.skip_to(a_id)
                if not b.is_active():
                    return
                r = r or rb
                b_id = b.id()
        return r

    def id(self):
        """Returns the current document id, read from ``a`` (both
        sub-matchers are on the same document while the matcher is active).
        """

        return self.a.id()

    # Using sets is faster in some cases, but could potentially use a lot of
    # memory
    def all_ids(self):
        """Returns an iterator over the sorted ids found in both
        sub-matchers' ``all_ids()``.
        """

        return iter(sorted(set(self.a.all_ids()) & set(self.b.all_ids())))

    def skip_to(self, id):
        """Skips both sub-matchers to ``id`` and then on to the next common
        document.

        :param id: the document id passed on to the sub-matchers.
        :returns: the first truthy value among the results of the two
            ``skip_to()`` calls and the re-synchronization, otherwise a false
            value.
        :raises mcore.ReadTooFar: if this matcher is not active.
        """

        if not self.is_active():
            raise mcore.ReadTooFar
        ra = self.a.skip_to(id)
        rb = self.b.skip_to(id)

        # Bind rn up front: if a sub-matcher ran out there is nothing to
        # synchronize, but rn is still read by the return expression
        rn = False
        if self.is_active() and self.a.id() != self.b.id():
            rn = self._find_next()

        return ra or rb or rn

    def skip_to_quality(self, minquality):
        """Skips ahead while the combined block quality of the sub-matchers
        is not greater than ``minquality``.

        :param minquality: the combined block quality to get above.
        :returns: the sum of the values returned by the sub-matchers'
            ``skip_to_quality()`` calls.
        """

        a = self.a
        b = self.b

        skipped = 0
        aq = a.block_quality()
        bq = b.block_quality()
        while a.is_active() and b.is_active() and aq + bq <= minquality:
            if aq < bq:
                # If the block quality of A is less than B, skip A ahead until
                # it can contribute at least the balance of the required min
                # quality when added to B
                sk = a.skip_to_quality(minquality - bq)
                skipped += sk
                if not sk and a.is_active():
                    # The matcher couldn't skip ahead for some reason, so just
                    # advance and try again
                    a.next()
            else:
                # And vice-versa
                sk = b.skip_to_quality(minquality - aq)
                skipped += sk
                if not sk and b.is_active():
                    b.next()

            if not a.is_active() or not b.is_active():
                # One of the matchers is exhausted
                break
            if a.id() != b.id():
                # We want to always leave in a state where the matchers are at
                # the same document, so call _find_next() to sync them
                self._find_next()

            # Get the block qualities at the new matcher positions
            aq = a.block_quality()
            bq = b.block_quality()
        return skipped

    def next(self):
        """Moves to the next document matched by both sub-matchers.

        :returns: the result of advancing ``a`` if it is truthy, otherwise
            the result of the re-synchronization.
        :raises mcore.ReadTooFar: if this matcher is not active.
        """

        if not self.is_active():
            raise mcore.ReadTooFar

        # We must assume that the ids are equal whenever next() is called (they
        # should have been made equal by _find_next), so advancing a alone is
        # enough to put it on a different document than b
        ar = self.a.next()

        # Bind nr up front: if a just ran out there is nothing to
        # synchronize, but nr is still read by the return expression
        nr = False
        if self.is_active():
            nr = self._find_next()
        return ar or nr

    def spans(self):
        """Returns the sorted union of both sub-matchers' spans."""

        return sorted(set(self.a.spans()) | set(self.b.spans()))
class AndNotMatcher(BiMatcher):
    """Matches the postings in the first sub-matcher that are NOT present in
    the second sub-matcher.

    Ids, qualities, weights, scores and values all come from the first
    (required) sub-matcher ``a``; the second (prohibited) sub-matcher ``b`` is
    only used to filter documents out.
    """

    def __init__(self, a, b):
        """
        :param a: the required sub-matcher.
        :param b: the prohibited sub-matcher.
        """

        super(AndNotMatcher, self).__init__(a, b)
        self._find_first()

    def reset(self):
        """Resets both sub-matchers and moves to the first allowed document."""

        self.a.reset()
        self.b.reset()
        self._find_first()

    def _find_first(self):
        # If the sub-matchers start on the same document, that document is
        # prohibited, so move on to the first allowed one
        if (self.a.is_active()
            and self.b.is_active()
            and self.a.id() == self.b.id()):
            self._find_next()

    def is_active(self):
        """Returns whether the required sub-matcher is active."""

        return self.a.is_active()

    def _find_next(self):
        """Advances the required sub-matcher past every document the
        prohibited sub-matcher is also positioned on.

        :returns: a true value if advancing the required sub-matcher
            returned one, otherwise a false value.
        """

        pos = self.a
        neg = self.b

        # Nothing to do when nothing can be excluded any more, or when the
        # required matcher has run out. skip_to() and skip_to_quality() call
        # this right after moving a, which may have exhausted it, so a's
        # activity has to be checked before its id is read.
        if not (pos.is_active() and neg.is_active()):
            return False

        pos_id = pos.id()
        r = False

        # Bring the prohibited matcher up to the required one
        if neg.id() < pos_id:
            neg.skip_to(pos_id)

        while pos.is_active() and neg.is_active() and pos_id == neg.id():
            nr = pos.next()
            if not pos.is_active():
                break

            r = r or nr
            pos_id = pos.id()
            neg.skip_to(pos_id)

        return r

    def supports_block_quality(self):
        """Returns whether the required sub-matcher supports block quality."""

        return self.a.supports_block_quality()

    def replace(self, minquality=0):
        """Returns a possibly simpler matcher to use in place of this one.

        :param minquality: the quality a posting must be able to reach to be
            worth returning; a false value disables quality-based rewriting.
        :returns: ``self``, a new matcher of this class wrapping replaced
            sub-matchers, the replacement of ``a``, or
            ``mcore.NullMatcher()``.
        """

        if not self.a.is_active():
            # The a matcher is required, so if it's inactive, return an
            # inactive matcher
            return mcore.NullMatcher()
        elif (minquality
              and self.a.max_quality() < minquality):
            # If the quality of the required matcher isn't high enough to
            # contribute, return an inactive matcher
            return mcore.NullMatcher()
        elif not self.b.is_active():
            # If the prohibited matcher is inactive, convert to just the
            # required matcher
            return self.a.replace(minquality)

        a = self.a.replace(minquality)
        # The prohibited matcher contributes no score, so no quality
        # threshold is passed to it
        b = self.b.replace()
        if a is not self.a or b is not self.b:
            # If one of the sub-matchers was replaced, return a new AndNot
            return self.__class__(a, b)
        else:
            return self

    def max_quality(self):
        """Returns the max quality of the required sub-matcher."""

        return self.a.max_quality()

    def block_quality(self):
        """Returns the block quality of the required sub-matcher."""

        return self.a.block_quality()

    def skip_to_quality(self, minquality):
        """Skips the required sub-matcher by quality and then moves on to
        the next allowed document.

        :param minquality: passed on to ``a.skip_to_quality()``.
        :returns: the value returned by ``a.skip_to_quality()``.
        """

        skipped = self.a.skip_to_quality(minquality)
        self._find_next()
        return skipped

    def id(self):
        """Returns the current document id of the required sub-matcher."""

        return self.a.id()

    def next(self):
        """Moves to the next document in ``a`` that ``b`` is not positioned
        on.

        :returns: the result of advancing ``a`` if it is truthy, otherwise
            the result of skipping over prohibited documents.
        :raises mcore.ReadTooFar: if the required sub-matcher is not active.
        """

        if not self.a.is_active():
            raise mcore.ReadTooFar
        ar = self.a.next()
        nr = False
        if self.a.is_active() and self.b.is_active():
            nr = self._find_next()
        return ar or nr

    def skip_to(self, id):
        """Skips to ``id`` and then on to the next allowed document.

        :param id: the document id to skip to.
        :returns: the result of skipping ``a`` if it is truthy, otherwise the
            result of skipping over prohibited documents; ``False`` if ``a``
            is already past ``id``.
        :raises mcore.ReadTooFar: if the required sub-matcher is not active.
        """

        if not self.a.is_active():
            raise mcore.ReadTooFar
        if id < self.a.id():
            # Already past the target, nothing to do
            return False

        ra = self.a.skip_to(id)
        if self.b.is_active():
            self.b.skip_to(id)
        # _find_next() copes with a having run out during the skip
        rn = self._find_next()
        return ra or rn

    def weight(self):
        """Returns the weight of the required sub-matcher."""

        return self.a.weight()

    def score(self):
        """Returns the score of the required sub-matcher."""

        return self.a.score()

    def supports(self, astype):
        """Returns whether the required sub-matcher supports ``astype``."""

        return self.a.supports(astype)

    def value(self):
        """Returns the value of the required sub-matcher."""

        return self.a.value()

    def value_as(self, astype):
        """Returns ``a.value_as(astype)`` from the required sub-matcher."""

        return self.a.value_as(astype)
class AndMaybeMatcher(AdditiveBiMatcher):
    """Matches postings in the first sub-matcher, and if the same posting is
    in the second sub-matcher, adds their scores.

    The first sub-matcher ``a`` is required and drives the matcher; the second
    sub-matcher ``b`` is optional and is kept positioned at or after ``a``'s
    current document while both are active.
    """

    def __init__(self, a, b):
        """
        :param a: the required sub-matcher.
        :param b: the optional sub-matcher.
        """

        super(AndMaybeMatcher, self).__init__(a, b)
        self._first_b()

    def reset(self):
        """Resets both sub-matchers and re-aligns ``b`` with ``a``."""

        self.a.reset()
        self.b.reset()
        self._first_b()

    def _first_b(self):
        # Bring the optional matcher up to the required matcher's document
        a = self.a
        b = self.b
        if a.is_active() and b.is_active() and a.id() != b.id():
            b.skip_to(a.id())

    def is_active(self):
        """Returns whether the required sub-matcher is active."""

        return self.a.is_active()

    def id(self):
        """Returns the current document id of the required sub-matcher."""

        return self.a.id()

    def next(self):
        """Advances ``a`` and brings ``b`` up to ``a``'s new document.

        :returns: the result of advancing ``a`` if it is truthy, otherwise
            the result of skipping ``b``.
        :raises mcore.ReadTooFar: if the required sub-matcher is not active.
        """

        if not self.a.is_active():
            raise mcore.ReadTooFar

        ar = self.a.next()
        br = False
        if self.a.is_active() and self.b.is_active():
            br = self.b.skip_to(self.a.id())
        return ar or br

    def skip_to(self, id):
        """Skips ``a`` to ``id`` and brings ``b`` up to the document ``a``
        ends up on.

        :param id: the document id to skip to.
        :returns: the result of skipping ``a`` if it is truthy, otherwise the
            result of skipping ``b``.
        :raises mcore.ReadTooFar: if the required sub-matcher is not active.
        """

        if not self.a.is_active():
            raise mcore.ReadTooFar

        ra = self.a.skip_to(id)
        rb = False
        if self.a.is_active() and self.b.is_active():
            # a may land beyond the requested id. Skip b to a's actual
            # position (as next() does) rather than to the requested id, so
            # that b is not left behind a and its contribution to a shared
            # document is not missed by weight()/score().
            rb = self.b.skip_to(self.a.id())
        return ra or rb

    def replace(self, minquality=0):
        """Returns a possibly simpler matcher to use in place of this one.

        :param minquality: the quality a posting must be able to reach to be
            worth returning; a false value disables quality-based rewriting.
        :returns: ``self``, a new matcher of this class wrapping replaced
            sub-matchers, the replacement of ``a``, an
            ``IntersectionMatcher``, or ``mcore.NullMatcher()``.
        """

        a = self.a
        b = self.b
        a_active = a.is_active()
        b_active = b.is_active()

        if not a_active:
            return mcore.NullMatcher()
        elif minquality and b_active:
            if a.max_quality() + b.max_quality() < minquality:
                # If the combined max quality of the sub-matchers isn't high
                # enough to possibly contribute, return an inactive matcher
                return mcore.NullMatcher()
            elif a.max_quality() < minquality:
                # If the max quality of the main sub-matcher isn't high enough
                # to ever contribute without the optional sub- matcher, change
                # into an IntersectionMatcher
                return IntersectionMatcher(self.a, self.b)
        elif not b_active:
            return a.replace(minquality)

        # Each side only has to supply what the other side cannot. Without a
        # minquality no threshold is passed down at all (rather than a
        # negative one), the same way UnionMatcher.replace() does it.
        new_a = a.replace(minquality - b.max_quality() if minquality else 0)
        new_b = b.replace(minquality - a.max_quality() if minquality else 0)
        if new_a is not a or new_b is not b:
            # If one of the sub-matchers changed, return a new AndMaybe
            return self.__class__(new_a, new_b)
        else:
            return self

    def skip_to_quality(self, minquality):
        """Skips the sub-matchers ahead while their combined block quality is
        not greater than ``minquality``.

        :param minquality: the combined block quality to get above.
        :returns: the sum of the values returned by the sub-matchers'
            ``skip_to_quality()`` calls.
        :raises mcore.ReadTooFar: if the required sub-matcher is not active.
        """

        a = self.a
        b = self.b

        if not a.is_active():
            raise mcore.ReadTooFar
        if not b.is_active():
            return a.skip_to_quality(minquality)

        skipped = 0
        aq = a.block_quality()
        bq = b.block_quality()
        while a.is_active() and b.is_active() and aq + bq <= minquality:
            # Move the side with the lower block quality; it only has to
            # make up what the other side's current block cannot supply
            if aq < bq:
                skipped += a.skip_to_quality(minquality - bq)
                aq = a.block_quality()
            else:
                skipped += b.skip_to_quality(minquality - aq)
                bq = b.block_quality()

        # Skipping a can leave b behind it; bring b back up to a's document
        # (as next() does) so weight()/score() see b's posting for it
        if a.is_active() and b.is_active():
            b.skip_to(a.id())

        return skipped

    def weight(self):
        """Returns ``a``'s weight, plus ``b``'s weight when ``b`` is active
        and on the same document.
        """

        # b's id may only be read while b is active (same guard as score())
        if self.b.is_active() and self.a.id() == self.b.id():
            return self.a.weight() + self.b.weight()
        else:
            return self.a.weight()

    def score(self):
        """Returns ``a``'s score, plus ``b``'s score when ``b`` is active and
        on the same document.
        """

        if self.b.is_active() and self.a.id() == self.b.id():
            return self.a.score() + self.b.score()
        else:
            return self.a.score()

    def supports(self, astype):
        """Returns whether the required sub-matcher supports ``astype``."""

        return self.a.supports(astype)

    def value(self):
        """Returns the value of the required sub-matcher."""

        return self.a.value()

    def value_as(self, astype):
        """Returns ``a.value_as(astype)`` from the required sub-matcher."""

        return self.a.value_as(astype)
|