combo.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312
  1. # Copyright 2010 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. from __future__ import division
  28. from array import array
  29. from whoosh.compat import xrange
  30. from whoosh.matching import mcore
  31. class CombinationMatcher(mcore.Matcher):
  32. def __init__(self, submatchers, boost=1.0):
  33. self._submatchers = submatchers
  34. self._boost = boost
  35. def supports_block_quality(self):
  36. return all(m.supports_block_quality() for m in self._submatchers)
  37. def max_quality(self):
  38. return max(m.max_quality() for m in self._submatchers
  39. if m.is_active()) * self._boost
  40. def supports(self, astype):
  41. return all(m.supports(astype) for m in self._submatchers)
  42. def children(self):
  43. return iter(self._submatchers)
  44. def score(self):
  45. return sum(m.score() for m in self._submatchers) * self._boost
  46. class PreloadedUnionMatcher(CombinationMatcher):
  47. """Instead of marching the sub-matchers along in parallel, this
  48. matcher pre-reads the scores for EVERY MATCHING DOCUMENT, trading memory
  49. for speed.
  50. This is faster than the implementation using a binary tree of
  51. :class:`~whoosh.matching.binary.UnionMatcher` objects (possibly just
  52. because of less overhead), but it doesn't allow getting information about
  53. the "current" document other than the score, because there isn't really a
  54. current document, just an array of scores.
  55. """
  56. def __init__(self, submatchers, doccount, boost=1.0, scored=True):
  57. CombinationMatcher.__init__(self, submatchers, boost=boost)
  58. self._doccount = doccount
  59. a = array("d")
  60. active = [subm for subm in self._submatchers if subm.is_active()]
  61. if active:
  62. offset = self._docnum = min(m.id() for m in active)
  63. for m in active:
  64. while m.is_active():
  65. if scored:
  66. score = m.score() * boost
  67. else:
  68. score = boost
  69. docnum = m.id()
  70. place = docnum - offset
  71. if len(a) <= place:
  72. a.extend(0 for _ in xrange(place - len(a) + 1))
  73. a[place] += score
  74. m.next()
  75. self._a = a
  76. self._offset = offset
  77. else:
  78. self._docnum = 0
  79. self._offset = 0
  80. self._a = a
  81. def is_active(self):
  82. return self._docnum - self._offset < len(self._a)
  83. def id(self):
  84. return self._docnum
  85. def score(self):
  86. return self._a[self._docnum - self._offset]
  87. def next(self):
  88. a = self._a
  89. offset = self._offset
  90. place = self._docnum - offset
  91. place += 1
  92. while place < len(a) and a[place] == 0:
  93. place += 1
  94. self._docnum = place + offset
  95. def max_quality(self):
  96. return max(self._a[self._docnum - self._offset:])
  97. def block_quality(self):
  98. return self.max_quality()
  99. def skip_to(self, docnum):
  100. if docnum < self._docnum:
  101. return
  102. self._docnum = docnum
  103. i = docnum - self._offset
  104. if i < len(self._a) and self._a[i] == 0:
  105. self.next()
  106. def skip_to_quality(self, minquality):
  107. a = self._a
  108. offset = self._offset
  109. place = self._docnum - offset
  110. skipped = 0
  111. while place < len(a) and a[place] <= minquality:
  112. place += 1
  113. skipped = 1
  114. self._docnum = place + offset
  115. return skipped
  116. def supports(self, astype):
  117. # This matcher doesn't support any posting values
  118. return False
  119. def all_ids(self):
  120. a = self._a
  121. offset = self._offset
  122. place = self._docnum - offset
  123. while place < len(a):
  124. if a[place] > 0:
  125. yield place + offset
  126. place += 1
  127. class ArrayUnionMatcher(CombinationMatcher):
  128. """Instead of marching the sub-matchers along in parallel, this matcher
  129. pre-reads the scores for a large block of documents at a time from each
  130. matcher, accumulating the scores in an array.
  131. This is faster than the implementation using a binary tree of
  132. :class:`~whoosh.matching.binary.UnionMatcher` objects (possibly just
  133. because of less overhead), but it doesn't allow getting information about
  134. the "current" document other than the score, because there isn't really a
  135. current document, just an array of scores.
  136. """
  137. def __init__(self, submatchers, doccount, boost=1.0, scored=True,
  138. partsize=2048):
  139. CombinationMatcher.__init__(self, submatchers, boost=boost)
  140. self._scored = scored
  141. self._doccount = doccount
  142. if not partsize:
  143. partsize = doccount
  144. self._partsize = partsize
  145. self._a = array("d", (0 for _ in xrange(self._partsize)))
  146. self._docnum = self._min_id()
  147. self._read_part()
  148. def __repr__(self):
  149. return ("%s(%r, boost=%f, scored=%r, partsize=%d)"
  150. % (self.__class__.__name__, self._submatchers, self._boost,
  151. self._scored, self._partsize))
  152. def _min_id(self):
  153. active = [subm for subm in self._submatchers if subm.is_active()]
  154. if active:
  155. return min(subm.id() for subm in active)
  156. else:
  157. return self._doccount
  158. def _read_part(self):
  159. scored = self._scored
  160. boost = self._boost
  161. limit = min(self._docnum + self._partsize, self._doccount)
  162. offset = self._docnum
  163. a = self._a
  164. # Clear the array
  165. for i in xrange(self._partsize):
  166. a[i] = 0
  167. # Add the scores from the submatchers into the array
  168. for m in self._submatchers:
  169. while m.is_active() and m.id() < limit:
  170. i = m.id() - offset
  171. if scored:
  172. a[i] += m.score() * boost
  173. else:
  174. a[i] = 1
  175. m.next()
  176. self._offset = offset
  177. self._limit = limit
  178. def _find_next(self):
  179. a = self._a
  180. docnum = self._docnum
  181. offset = self._offset
  182. limit = self._limit
  183. while docnum < limit:
  184. if a[docnum - offset] > 0:
  185. break
  186. docnum += 1
  187. if docnum == limit:
  188. self._docnum = self._min_id()
  189. self._read_part()
  190. else:
  191. self._docnum = docnum
  192. def supports(self, astype):
  193. # This matcher doesn't support any posting values
  194. return False
  195. def is_active(self):
  196. return self._docnum < self._doccount
  197. def max_quality(self):
  198. return max(m.max_quality() for m in self._submatchers)
  199. def block_quality(self):
  200. return max(self._a)
  201. def skip_to(self, docnum):
  202. if docnum < self._offset:
  203. # We've already passed it
  204. return
  205. elif docnum < self._limit:
  206. # It's in the current part
  207. self._docnum = docnum
  208. self._find_next()
  209. return
  210. # Advance all active submatchers
  211. submatchers = self._submatchers
  212. active = False
  213. for subm in submatchers:
  214. if subm.is_active():
  215. subm.skip_to(docnum)
  216. if any(subm.is_active() for subm in submatchers):
  217. # Rebuffer
  218. self._docnum = self._min_id()
  219. self._read_part()
  220. else:
  221. self._docnum = self._doccount
  222. def skip_to_quality(self, minquality):
  223. skipped = 0
  224. while self.is_active() and self.block_quality() <= minquality:
  225. skipped += 1
  226. self._docnum = self._limit
  227. self._read_part()
  228. if self.is_active():
  229. self._find_next()
  230. return skipped
  231. def id(self):
  232. return self._docnum
  233. def all_ids(self):
  234. doccount = self._doccount
  235. docnum = self._docnum
  236. offset = self._offset
  237. limit = self._limit
  238. a = self._a
  239. while docnum < doccount:
  240. if a[docnum - offset] > 0:
  241. yield docnum
  242. docnum += 1
  243. if docnum == limit:
  244. self._docnum = docnum
  245. self._read_part()
  246. offset = self._offset
  247. limit = self._limit
  248. def next(self):
  249. self._docnum += 1
  250. return self._find_next()
  251. def score(self):
  252. return self._a[self._docnum - self._offset]