| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312 |
- # Copyright 2010 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- from __future__ import division
- from array import array
- from whoosh.compat import xrange
- from whoosh.matching import mcore
- class CombinationMatcher(mcore.Matcher):
- def __init__(self, submatchers, boost=1.0):
- self._submatchers = submatchers
- self._boost = boost
- def supports_block_quality(self):
- return all(m.supports_block_quality() for m in self._submatchers)
- def max_quality(self):
- return max(m.max_quality() for m in self._submatchers
- if m.is_active()) * self._boost
- def supports(self, astype):
- return all(m.supports(astype) for m in self._submatchers)
- def children(self):
- return iter(self._submatchers)
- def score(self):
- return sum(m.score() for m in self._submatchers) * self._boost
- class PreloadedUnionMatcher(CombinationMatcher):
- """Instead of marching the sub-matchers along in parallel, this
- matcher pre-reads the scores for EVERY MATCHING DOCUMENT, trading memory
- for speed.
- This is faster than the implementation using a binary tree of
- :class:`~whoosh.matching.binary.UnionMatcher` objects (possibly just
- because of less overhead), but it doesn't allow getting information about
- the "current" document other than the score, because there isn't really a
- current document, just an array of scores.
- """
- def __init__(self, submatchers, doccount, boost=1.0, scored=True):
- CombinationMatcher.__init__(self, submatchers, boost=boost)
- self._doccount = doccount
- a = array("d")
- active = [subm for subm in self._submatchers if subm.is_active()]
- if active:
- offset = self._docnum = min(m.id() for m in active)
- for m in active:
- while m.is_active():
- if scored:
- score = m.score() * boost
- else:
- score = boost
- docnum = m.id()
- place = docnum - offset
- if len(a) <= place:
- a.extend(0 for _ in xrange(place - len(a) + 1))
- a[place] += score
- m.next()
- self._a = a
- self._offset = offset
- else:
- self._docnum = 0
- self._offset = 0
- self._a = a
- def is_active(self):
- return self._docnum - self._offset < len(self._a)
- def id(self):
- return self._docnum
- def score(self):
- return self._a[self._docnum - self._offset]
- def next(self):
- a = self._a
- offset = self._offset
- place = self._docnum - offset
- place += 1
- while place < len(a) and a[place] == 0:
- place += 1
- self._docnum = place + offset
- def max_quality(self):
- return max(self._a[self._docnum - self._offset:])
- def block_quality(self):
- return self.max_quality()
- def skip_to(self, docnum):
- if docnum < self._docnum:
- return
- self._docnum = docnum
- i = docnum - self._offset
- if i < len(self._a) and self._a[i] == 0:
- self.next()
- def skip_to_quality(self, minquality):
- a = self._a
- offset = self._offset
- place = self._docnum - offset
- skipped = 0
- while place < len(a) and a[place] <= minquality:
- place += 1
- skipped = 1
- self._docnum = place + offset
- return skipped
- def supports(self, astype):
- # This matcher doesn't support any posting values
- return False
- def all_ids(self):
- a = self._a
- offset = self._offset
- place = self._docnum - offset
- while place < len(a):
- if a[place] > 0:
- yield place + offset
- place += 1
- class ArrayUnionMatcher(CombinationMatcher):
- """Instead of marching the sub-matchers along in parallel, this matcher
- pre-reads the scores for a large block of documents at a time from each
- matcher, accumulating the scores in an array.
- This is faster than the implementation using a binary tree of
- :class:`~whoosh.matching.binary.UnionMatcher` objects (possibly just
- because of less overhead), but it doesn't allow getting information about
- the "current" document other than the score, because there isn't really a
- current document, just an array of scores.
- """
- def __init__(self, submatchers, doccount, boost=1.0, scored=True,
- partsize=2048):
- CombinationMatcher.__init__(self, submatchers, boost=boost)
- self._scored = scored
- self._doccount = doccount
- if not partsize:
- partsize = doccount
- self._partsize = partsize
- self._a = array("d", (0 for _ in xrange(self._partsize)))
- self._docnum = self._min_id()
- self._read_part()
- def __repr__(self):
- return ("%s(%r, boost=%f, scored=%r, partsize=%d)"
- % (self.__class__.__name__, self._submatchers, self._boost,
- self._scored, self._partsize))
- def _min_id(self):
- active = [subm for subm in self._submatchers if subm.is_active()]
- if active:
- return min(subm.id() for subm in active)
- else:
- return self._doccount
- def _read_part(self):
- scored = self._scored
- boost = self._boost
- limit = min(self._docnum + self._partsize, self._doccount)
- offset = self._docnum
- a = self._a
- # Clear the array
- for i in xrange(self._partsize):
- a[i] = 0
- # Add the scores from the submatchers into the array
- for m in self._submatchers:
- while m.is_active() and m.id() < limit:
- i = m.id() - offset
- if scored:
- a[i] += m.score() * boost
- else:
- a[i] = 1
- m.next()
- self._offset = offset
- self._limit = limit
- def _find_next(self):
- a = self._a
- docnum = self._docnum
- offset = self._offset
- limit = self._limit
- while docnum < limit:
- if a[docnum - offset] > 0:
- break
- docnum += 1
- if docnum == limit:
- self._docnum = self._min_id()
- self._read_part()
- else:
- self._docnum = docnum
- def supports(self, astype):
- # This matcher doesn't support any posting values
- return False
- def is_active(self):
- return self._docnum < self._doccount
- def max_quality(self):
- return max(m.max_quality() for m in self._submatchers)
- def block_quality(self):
- return max(self._a)
- def skip_to(self, docnum):
- if docnum < self._offset:
- # We've already passed it
- return
- elif docnum < self._limit:
- # It's in the current part
- self._docnum = docnum
- self._find_next()
- return
- # Advance all active submatchers
- submatchers = self._submatchers
- active = False
- for subm in submatchers:
- if subm.is_active():
- subm.skip_to(docnum)
- if any(subm.is_active() for subm in submatchers):
- # Rebuffer
- self._docnum = self._min_id()
- self._read_part()
- else:
- self._docnum = self._doccount
- def skip_to_quality(self, minquality):
- skipped = 0
- while self.is_active() and self.block_quality() <= minquality:
- skipped += 1
- self._docnum = self._limit
- self._read_part()
- if self.is_active():
- self._find_next()
- return skipped
- def id(self):
- return self._docnum
- def all_ids(self):
- doccount = self._doccount
- docnum = self._docnum
- offset = self._offset
- limit = self._limit
- a = self._a
- while docnum < doccount:
- if a[docnum - offset] > 0:
- yield docnum
- docnum += 1
- if docnum == limit:
- self._docnum = docnum
- self._read_part()
- offset = self._offset
- limit = self._limit
- def next(self):
- self._docnum += 1
- return self._find_next()
- def score(self):
- return self._a[self._docnum - self._offset]
|