# collectors.py
# Copyright 2012 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
  27. """
  28. This module contains "collector" objects. Collectors provide a way to gather
  29. "raw" results from a :class:`whoosh.matching.Matcher` object, implement
  30. sorting, filtering, collation, etc., and produce a
  31. :class:`whoosh.searching.Results` object.
  32. The basic collectors are:
  33. TopCollector
  34. Returns the top N matching results sorted by score, using block-quality
  35. optimizations to skip blocks of documents that can't contribute to the top
  36. N. The :meth:`whoosh.searching.Searcher.search` method uses this type of
  37. collector by default or when you specify a ``limit``.
  38. UnlimitedCollector
  39. Returns all matching results sorted by score. The
  40. :meth:`whoosh.searching.Searcher.search` method uses this type of collector
  41. when you specify ``limit=None`` or you specify a limit equal to or greater
  42. than the number of documents in the searcher.
  43. SortingCollector
  44. Returns all matching results sorted by a :class:`whoosh.sorting.Facet`
  45. object. The :meth:`whoosh.searching.Searcher.search` method uses this type
  46. of collector when you use the ``sortedby`` parameter.
  47. Here's an example of a simple collector that instead of remembering the matched
  48. documents just counts up the number of matches::
  49. class CountingCollector(Collector):
  50. def prepare(self, top_searcher, q, context):
  51. # Always call super method in prepare
  52. Collector.prepare(self, top_searcher, q, context)
  53. self.count = 0
  54. def collect(self, sub_docnum):
  55. self.count += 1
  56. c = CountingCollector()
  57. mysearcher.search_with_collector(myquery, c)
  58. print(c.count)
  59. There are also several wrapping collectors that extend or modify the
  60. functionality of other collectors. The meth:`whoosh.searching.Searcher.search`
  61. method uses many of these when you specify various parameters.
  62. NOTE: collectors are not designed to be reentrant or thread-safe. It is
  63. generally a good idea to create a new collector for each search.
  64. """
import os
import threading
from array import array
from bisect import insort
from collections import defaultdict
from heapq import heapify, heappush, heapreplace

from whoosh import sorting
from whoosh.compat import abstractmethod, iteritems, itervalues, xrange
from whoosh.searching import Results, TimeLimit
from whoosh.util import now
  75. # Functions
  76. def ilen(iterator):
  77. total = 0
  78. for _ in iterator:
  79. total += 1
  80. return total
  81. # Base class
  82. class Collector(object):
  83. """Base class for collectors.
  84. """
  85. def prepare(self, top_searcher, q, context):
  86. """This method is called before a search.
  87. Subclasses can override this to perform set-up work, but
  88. they should still call the superclass's method because it sets several
  89. necessary attributes on the collector object:
  90. self.top_searcher
  91. The top-level searcher.
  92. self.q
  93. The query object
  94. self.context
  95. ``context.needs_current`` controls whether a wrapping collector
  96. requires that this collector's matcher be in a valid state at every
  97. call to ``collect()``. If this is ``False``, the collector is free
  98. to use faster methods that don't necessarily keep the matcher
  99. updated, such as ``matcher.all_ids()``.
  100. :param top_searcher: the top-level :class:`whoosh.searching.Searcher`
  101. object.
  102. :param q: the :class:`whoosh.query.Query` object being searched for.
  103. :param context: a :class:`whoosh.searching.SearchContext` object
  104. containing information about the search.
  105. """
  106. self.top_searcher = top_searcher
  107. self.q = q
  108. self.context = context
  109. self.starttime = now()
  110. self.runtime = None
  111. self.docset = set()
  112. def run(self):
  113. # Collect matches for each sub-searcher
  114. try:
  115. for subsearcher, offset in self.top_searcher.leaf_searchers():
  116. self.set_subsearcher(subsearcher, offset)
  117. self.collect_matches()
  118. finally:
  119. self.finish()
  120. def set_subsearcher(self, subsearcher, offset):
  121. """This method is called each time the collector starts on a new
  122. sub-searcher.
  123. Subclasses can override this to perform set-up work, but
  124. they should still call the superclass's method because it sets several
  125. necessary attributes on the collector object:
  126. self.subsearcher
  127. The current sub-searcher. If the top-level searcher is atomic, this
  128. is the same as the top-level searcher.
  129. self.offset
  130. The document number offset of the current searcher. You must add
  131. this number to the document number passed to
  132. :meth:`Collector.collect` to get the top-level document number
  133. for use in results.
  134. self.matcher
  135. A :class:`whoosh.matching.Matcher` object representing the matches
  136. for the query in the current sub-searcher.
  137. """
  138. self.subsearcher = subsearcher
  139. self.offset = offset
  140. self.matcher = self.q.matcher(subsearcher, self.context)
  141. def computes_count(self):
  142. """Returns True if the collector naturally computes the exact number of
  143. matching documents. Collectors that use block optimizations will return
  144. False since they might skip blocks containing matching documents.
  145. Note that if this method returns False you can still call :meth:`count`,
  146. but it means that method might have to do more work to calculate the
  147. number of matching documents.
  148. """
  149. return True
  150. def all_ids(self):
  151. """Returns a sequence of docnums matched in this collector. (Only valid
  152. after the collector is run.)
  153. The default implementation is based on the docset. If a collector does
  154. not maintain the docset, it will need to override this method.
  155. """
  156. return self.docset
  157. def count(self):
  158. """Returns the total number of documents matched in this collector.
  159. (Only valid after the collector is run.)
  160. The default implementation is based on the docset. If a collector does
  161. not maintain the docset, it will need to override this method.
  162. """
  163. return len(self.docset)
  164. def collect_matches(self):
  165. """This method calls :meth:`Collector.matches` and then for each
  166. matched document calls :meth:`Collector.collect`. Sub-classes that
  167. want to intervene between finding matches and adding them to the
  168. collection (for example, to filter out certain documents) can override
  169. this method.
  170. """
  171. collect = self.collect
  172. for sub_docnum in self.matches():
  173. collect(sub_docnum)
  174. @abstractmethod
  175. def collect(self, sub_docnum):
  176. """This method is called for every matched document. It should do the
  177. work of adding a matched document to the results, and it should return
  178. an object to use as a "sorting key" for the given document (such as the
  179. document's score, a key generated by a facet, or just None). Subclasses
  180. must implement this method.
  181. If you want the score for the current document, use
  182. ``self.matcher.score()``.
  183. Overriding methods should add the current document offset
  184. (``self.offset``) to the ``sub_docnum`` to get the top-level document
  185. number for the matching document to add to results.
  186. :param sub_docnum: the document number of the current match within the
  187. current sub-searcher. You must add ``self.offset`` to this number
  188. to get the document's top-level document number.
  189. """
  190. raise NotImplementedError
  191. @abstractmethod
  192. def sort_key(self, sub_docnum):
  193. """Returns a sorting key for the current match. This should return the
  194. same value returned by :meth:`Collector.collect`, but without the side
  195. effect of adding the current document to the results.
  196. If the collector has been prepared with ``context.needs_current=True``,
  197. this method can use ``self.matcher`` to get information, for example
  198. the score. Otherwise, it should only use the provided ``sub_docnum``,
  199. since the matcher may be in an inconsistent state.
  200. Subclasses must implement this method.
  201. """
  202. raise NotImplementedError
  203. def remove(self, global_docnum):
  204. """Removes a document from the collector. Not that this method uses the
  205. global document number as opposed to :meth:`Collector.collect` which
  206. takes a segment-relative docnum.
  207. """
  208. items = self.items
  209. for i in xrange(len(items)):
  210. if items[i][1] == global_docnum:
  211. items.pop(i)
  212. return
  213. raise KeyError(global_docnum)
  214. def _step_through_matches(self):
  215. matcher = self.matcher
  216. while matcher.is_active():
  217. yield matcher.id()
  218. matcher.next()
  219. def matches(self):
  220. """Yields a series of relative document numbers for matches
  221. in the current subsearcher.
  222. """
  223. # We jump through a lot of hoops to avoid stepping through the matcher
  224. # "manually" if we can because all_ids() is MUCH faster
  225. if self.context.needs_current:
  226. return self._step_through_matches()
  227. else:
  228. return self.matcher.all_ids()
  229. def finish(self):
  230. """This method is called after a search.
  231. Subclasses can override this to perform set-up work, but
  232. they should still call the superclass's method because it sets several
  233. necessary attributes on the collector object:
  234. self.runtime
  235. The time (in seconds) the search took.
  236. """
  237. self.runtime = now() - self.starttime
  238. def _results(self, items, **kwargs):
  239. # Fills in a Results object with the invariant information and the
  240. # given "items" (a list of (score, docnum) tuples)
  241. r = Results(self.top_searcher, self.q, items, **kwargs)
  242. r.runtime = self.runtime
  243. r.collector = self
  244. return r
  245. @abstractmethod
  246. def results(self):
  247. """Returns a :class:`~whoosh.searching.Results` object containing the
  248. results of the search. Subclasses must implement this method
  249. """
  250. raise NotImplementedError
# Scored collectors


class ScoredCollector(Collector):
    """Base class for collectors that sort the results based on document
    score.
    """

    def __init__(self, replace=10):
        """
        :param replace: Number of matches between attempts to replace the
            matcher with a more efficient version.
        """
        Collector.__init__(self)
        self.replace = replace

    def prepare(self, top_searcher, q, context):
        # This collector requires a valid matcher at each step
        Collector.prepare(self, top_searcher, q, context)

        # Cache the weighting model's "final" score-adjustment function, if
        # the model uses one (None otherwise)
        if top_searcher.weighting.use_final:
            self.final_fn = top_searcher.weighting.final
        else:
            self.final_fn = None

        # Heap containing top N (score, 0-docnum) pairs
        self.items = []
        # Minimum score a document must have to make it into the top N. This
        # is used by the block-quality optimizations
        self.minscore = 0
        # Number of times the matcher was replaced (for debugging)
        self.replaced_times = 0
        # Number of blocks skipped by quality optimizations (for debugging)
        self.skipped_times = 0

    def sort_key(self, sub_docnum):
        # Negate the score so that higher scores sort first
        return 0 - self.matcher.score()

    def _collect(self, global_docnum, score):
        # Concrete subclasses should override this method to collect matching
        # documents
        raise NotImplementedError

    def _use_block_quality(self):
        # Concrete subclasses should override this method to return True if
        # the collector should use block quality optimizations
        return False

    def collect(self, sub_docnum):
        # Do common work to calculate score and top-level document number
        global_docnum = self.offset + sub_docnum

        score = self.matcher.score()
        if self.final_fn:
            # Let the weighting model make a final adjustment to the score
            score = self.final_fn(self.top_searcher, global_docnum, score)

        # Call specialized method on subclass
        return self._collect(global_docnum, score)

    def matches(self):
        # Yields relative docnums, periodically replacing the matcher with a
        # more efficient version and (optionally) skipping low-quality blocks
        minscore = self.minscore
        matcher = self.matcher
        usequality = self._use_block_quality()
        replace = self.replace
        replacecounter = 0

        # A flag to indicate whether we should check block quality at the
        # start of the next loop
        checkquality = True

        while matcher.is_active():
            # If the replacement counter has reached 0, try replacing the
            # matcher with a more efficient version
            if replace:
                if replacecounter == 0 or self.minscore != minscore:
                    self.matcher = matcher = matcher.replace(minscore or 0)
                    self.replaced_times += 1
                    # Replacing may have exhausted the matcher entirely
                    if not matcher.is_active():
                        break
                    usequality = self._use_block_quality()
                    replacecounter = self.replace

                    if self.minscore != minscore:
                        checkquality = True
                        minscore = self.minscore

                replacecounter -= 1

            # If we're using block quality optimizations, and the
            # checkquality flag is true, try to skip ahead to the next block
            # with the minimum required quality
            if usequality and checkquality and minscore is not None:
                self.skipped_times += matcher.skip_to_quality(minscore)
                # Skipping ahead might have moved the matcher to the end of
                # the posting list
                if not matcher.is_active():
                    break

            yield matcher.id()

            # Move to the next document. This method returns True if the
            # matcher has entered a new block, so we should check block
            # quality again.
            checkquality = matcher.next()
  334. class TopCollector(ScoredCollector):
  335. """A collector that only returns the top "N" scored results.
  336. """
  337. def __init__(self, limit=10, usequality=True, **kwargs):
  338. """
  339. :param limit: the maximum number of results to return.
  340. :param usequality: whether to use block-quality optimizations. This may
  341. be useful for debugging.
  342. """
  343. ScoredCollector.__init__(self, **kwargs)
  344. self.limit = limit
  345. self.usequality = usequality
  346. self.total = 0
  347. def _use_block_quality(self):
  348. return (self.usequality
  349. and not self.top_searcher.weighting.use_final
  350. and self.matcher.supports_block_quality())
  351. def computes_count(self):
  352. return not self._use_block_quality()
  353. def all_ids(self):
  354. # Since this collector can skip blocks, it doesn't track the total
  355. # number of matching documents, so if the user asks for all matched
  356. # docs we need to re-run the search using docs_for_query
  357. return self.top_searcher.docs_for_query(self.q)
  358. def count(self):
  359. if self.computes_count():
  360. return self.total
  361. else:
  362. return ilen(self.all_ids())
  363. # ScoredCollector.collect calls this
  364. def _collect(self, global_docnum, score):
  365. items = self.items
  366. self.total += 1
  367. # Document numbers are negated before putting them in the heap so that
  368. # higher document numbers have lower "priority" in the queue. Lower
  369. # document numbers should always come before higher document numbers
  370. # with the same score to keep the order stable.
  371. if len(items) < self.limit:
  372. # The heap isn't full, so add this document
  373. heappush(items, (score, 0 - global_docnum))
  374. # Negate score to act as sort key so higher scores appear first
  375. return 0 - score
  376. elif score > items[0][0]:
  377. # The heap is full, but if this document has a high enough
  378. # score to make the top N, add it to the heap
  379. heapreplace(items, (score, 0 - global_docnum))
  380. self.minscore = items[0][0]
  381. # Negate score to act as sort key so higher scores appear first
  382. return 0 - score
  383. else:
  384. return 0
  385. def remove(self, global_docnum):
  386. negated = 0 - global_docnum
  387. items = self.items
  388. # Remove the document if it's on the list (it may not be since
  389. # TopCollector forgets documents that don't make the top N list)
  390. for i in xrange(len(items)):
  391. if items[i][1] == negated:
  392. items.pop(i)
  393. # Restore the heap invariant
  394. heapify(items)
  395. self.minscore = items[0][0] if items else 0
  396. return
  397. def results(self):
  398. # The items are stored (postive score, negative docnum) so the heap
  399. # keeps the highest scores and lowest docnums, in order from lowest to
  400. # highest. Since for the results we want the highest scores first,
  401. # sort the heap in reverse order
  402. items = self.items
  403. items.sort(reverse=True)
  404. # De-negate the docnums for presentation to the user
  405. items = [(score, 0 - docnum) for score, docnum in items]
  406. return self._results(items)
  407. class UnlimitedCollector(ScoredCollector):
  408. """A collector that returns **all** scored results.
  409. """
  410. def __init__(self, reverse=False):
  411. ScoredCollector.__init__(self)
  412. self.reverse = reverse
  413. # ScoredCollector.collect calls this
  414. def _collect(self, global_docnum, score):
  415. self.items.append((score, global_docnum))
  416. self.docset.add(global_docnum)
  417. # Negate score to act as sort key so higher scores appear first
  418. return 0 - score
  419. def results(self):
  420. # Sort by negated scores so that higher scores go first, then by
  421. # document number to keep the order stable when documents have the
  422. # same score
  423. self.items.sort(key=lambda x: (0 - x[0], x[1]), reverse=self.reverse)
  424. return self._results(self.items, docset=self.docset)
  425. # Sorting collector
  426. class SortingCollector(Collector):
  427. """A collector that returns results sorted by a given
  428. :class:`whoosh.sorting.Facet` object. See :doc:`/facets` for more
  429. information.
  430. """
  431. def __init__(self, sortedby, limit=10, reverse=False):
  432. """
  433. :param sortedby: see :doc:`/facets`.
  434. :param reverse: If True, reverse the overall results. Note that you
  435. can reverse individual facets in a multi-facet sort key as well.
  436. """
  437. Collector.__init__(self)
  438. self.sortfacet = sorting.MultiFacet.from_sortedby(sortedby)
  439. self.limit = limit
  440. self.reverse = reverse
  441. def prepare(self, top_searcher, q, context):
  442. self.categorizer = self.sortfacet.categorizer(top_searcher)
  443. # If the categorizer requires a valid matcher, then tell the child
  444. # collector that we need it
  445. rm = context.needs_current or self.categorizer.needs_current
  446. Collector.prepare(self, top_searcher, q, context.set(needs_current=rm))
  447. # List of (sortkey, docnum) pairs
  448. self.items = []
  449. def set_subsearcher(self, subsearcher, offset):
  450. Collector.set_subsearcher(self, subsearcher, offset)
  451. self.categorizer.set_searcher(subsearcher, offset)
  452. def sort_key(self, sub_docnum):
  453. return self.categorizer.key_for(self.matcher, sub_docnum)
  454. def collect(self, sub_docnum):
  455. global_docnum = self.offset + sub_docnum
  456. sortkey = self.sort_key(sub_docnum)
  457. self.items.append((sortkey, global_docnum))
  458. self.docset.add(global_docnum)
  459. return sortkey
  460. def results(self):
  461. items = self.items
  462. items.sort(reverse=self.reverse)
  463. if self.limit:
  464. items = items[:self.limit]
  465. return self._results(items, docset=self.docset)
  466. class UnsortedCollector(Collector):
  467. def prepare(self, top_searcher, q, context):
  468. Collector.prepare(self, top_searcher, q, context.set(weighting=None))
  469. self.items = []
  470. def collect(self, sub_docnum):
  471. global_docnum = self.offset + sub_docnum
  472. self.items.append((None, global_docnum))
  473. self.docset.add(global_docnum)
  474. def results(self):
  475. items = self.items
  476. return self._results(items, docset=self.docset)
  477. # Wrapping collectors
  478. class WrappingCollector(Collector):
  479. """Base class for collectors that wrap other collectors.
  480. """
  481. def __init__(self, child):
  482. self.child = child
  483. @property
  484. def top_searcher(self):
  485. return self.child.top_searcher
  486. @property
  487. def context(self):
  488. return self.child.context
  489. def prepare(self, top_searcher, q, context):
  490. self.child.prepare(top_searcher, q, context)
  491. def set_subsearcher(self, subsearcher, offset):
  492. self.child.set_subsearcher(subsearcher, offset)
  493. self.subsearcher = subsearcher
  494. self.matcher = self.child.matcher
  495. self.offset = self.child.offset
  496. def all_ids(self):
  497. return self.child.all_ids()
  498. def count(self):
  499. return self.child.count()
  500. def collect_matches(self):
  501. for sub_docnum in self.matches():
  502. self.collect(sub_docnum)
  503. def sort_key(self, sub_docnum):
  504. return self.child.sort_key(sub_docnum)
  505. def collect(self, sub_docnum):
  506. return self.child.collect(sub_docnum)
  507. def remove(self, global_docnum):
  508. return self.child.remove(global_docnum)
  509. def matches(self):
  510. return self.child.matches()
  511. def finish(self):
  512. self.child.finish()
  513. def results(self):
  514. return self.child.results()
  515. # Allow and disallow collector
  516. class FilterCollector(WrappingCollector):
  517. """A collector that lets you allow and/or restrict certain document numbers
  518. in the results::
  519. uc = collectors.UnlimitedCollector()
  520. ins = query.Term("chapter", "rendering")
  521. outs = query.Term("status", "restricted")
  522. fc = FilterCollector(uc, allow=ins, restrict=outs)
  523. mysearcher.search_with_collector(myquery, fc)
  524. print(fc.results())
  525. This collector discards a document if:
  526. * The allowed set is not None and a document number is not in the set, or
  527. * The restrict set is not None and a document number is in the set.
  528. (So, if the same document number is in both sets, that document will be
  529. discarded.)
  530. If you have a reference to the collector, you can use
  531. ``FilterCollector.filtered_count`` to get the number of matching documents
  532. filtered out of the results by the collector.
  533. """
  534. def __init__(self, child, allow=None, restrict=None):
  535. """
  536. :param child: the collector to wrap.
  537. :param allow: a query, Results object, or set-like object containing
  538. docnument numbers that are allowed in the results, or None (meaning
  539. everything is allowed).
  540. :param restrict: a query, Results object, or set-like object containing
  541. document numbers to disallow from the results, or None (meaning
  542. nothing is disallowed).
  543. """
  544. self.child = child
  545. self.allow = allow
  546. self.restrict = restrict
  547. def prepare(self, top_searcher, q, context):
  548. self.child.prepare(top_searcher, q, context)
  549. allow = self.allow
  550. restrict = self.restrict
  551. ftc = top_searcher._filter_to_comb
  552. self._allow = ftc(allow) if allow else None
  553. self._restrict = ftc(restrict) if restrict else None
  554. self.filtered_count = 0
  555. def all_ids(self):
  556. child = self.child
  557. _allow = self._allow
  558. _restrict = self._restrict
  559. for global_docnum in child.all_ids():
  560. if (
  561. (_allow and global_docnum not in _allow) or
  562. (_restrict and global_docnum in _restrict)
  563. ):
  564. continue
  565. yield global_docnum
  566. def count(self):
  567. child = self.child
  568. if child.computes_count():
  569. return child.count()
  570. else:
  571. return ilen(self.all_ids())
  572. def collect_matches(self):
  573. child = self.child
  574. _allow = self._allow
  575. _restrict = self._restrict
  576. if _allow is not None or _restrict is not None:
  577. filtered_count = self.filtered_count
  578. for sub_docnum in child.matches():
  579. global_docnum = self.offset + sub_docnum
  580. if ((_allow is not None and global_docnum not in _allow)
  581. or (_restrict is not None and global_docnum in _restrict)):
  582. filtered_count += 1
  583. continue
  584. child.collect(sub_docnum)
  585. self.filtered_count = filtered_count
  586. else:
  587. # If there was no allow or restrict set, don't do anything special,
  588. # just forward the call to the child collector
  589. child.collect_matches()
  590. def results(self):
  591. r = self.child.results()
  592. r.collector = self
  593. r.filtered_count = self.filtered_count
  594. r.allowed = self.allow
  595. r.restricted = self.restrict
  596. return r
  597. # Facet grouping collector
  598. class FacetCollector(WrappingCollector):
  599. """A collector that creates groups of documents based on
  600. :class:`whoosh.sorting.Facet` objects. See :doc:`/facets` for more
  601. information.
  602. This collector is used if you specify a ``groupedby`` parameter in the
  603. :meth:`whoosh.searching.Searcher.search` method. You can use the
  604. :meth:`whoosh.searching.Results.groups` method to access the facet groups.
  605. If you have a reference to the collector can also use
  606. ``FacetedCollector.facetmaps`` to access the groups directly::
  607. uc = collectors.UnlimitedCollector()
  608. fc = FacetedCollector(uc, sorting.FieldFacet("category"))
  609. mysearcher.search_with_collector(myquery, fc)
  610. print(fc.facetmaps)
  611. """
  612. def __init__(self, child, groupedby, maptype=None):
  613. """
  614. :param groupedby: see :doc:`/facets`.
  615. :param maptype: a :class:`whoosh.sorting.FacetMap` type to use for any
  616. facets that don't specify their own.
  617. """
  618. self.child = child
  619. self.facets = sorting.Facets.from_groupedby(groupedby)
  620. self.maptype = maptype
  621. def prepare(self, top_searcher, q, context):
  622. facets = self.facets
  623. # For each facet we're grouping by:
  624. # - Create a facetmap (to hold the groups)
  625. # - Create a categorizer (to generate document keys)
  626. self.facetmaps = {}
  627. self.categorizers = {}
  628. # Set needs_current to True if any of the categorizers require the
  629. # current document to work
  630. needs_current = context.needs_current
  631. for facetname, facet in facets.items():
  632. self.facetmaps[facetname] = facet.map(self.maptype)
  633. ctr = facet.categorizer(top_searcher)
  634. self.categorizers[facetname] = ctr
  635. needs_current = needs_current or ctr.needs_current
  636. context = context.set(needs_current=needs_current)
  637. self.child.prepare(top_searcher, q, context)
  638. def set_subsearcher(self, subsearcher, offset):
  639. WrappingCollector.set_subsearcher(self, subsearcher, offset)
  640. # Tell each categorizer about the new subsearcher and offset
  641. for categorizer in itervalues(self.categorizers):
  642. categorizer.set_searcher(self.child.subsearcher, self.child.offset)
  643. def collect(self, sub_docnum):
  644. matcher = self.child.matcher
  645. global_docnum = sub_docnum + self.child.offset
  646. # We want the sort key for the document so we can (by default) sort
  647. # the facet groups
  648. sortkey = self.child.collect(sub_docnum)
  649. # For each facet we're grouping by
  650. for name, categorizer in iteritems(self.categorizers):
  651. add = self.facetmaps[name].add
  652. # We have to do more work if the facet allows overlapping groups
  653. if categorizer.allow_overlap:
  654. for key in categorizer.keys_for(matcher, sub_docnum):
  655. add(categorizer.key_to_name(key), global_docnum, sortkey)
  656. else:
  657. key = categorizer.key_for(matcher, sub_docnum)
  658. key = categorizer.key_to_name(key)
  659. add(key, global_docnum, sortkey)
  660. return sortkey
  661. def results(self):
  662. r = self.child.results()
  663. r._facetmaps = self.facetmaps
  664. return r
  665. # Collapsing collector
  666. class CollapseCollector(WrappingCollector):
  667. """A collector that collapses results based on a facet. That is, it
  668. eliminates all but the top N results that share the same facet key.
  669. Documents with an empty key for the facet are never eliminated.
  670. The "top" results within each group is determined by the result ordering
  671. (e.g. highest score in a scored search) or an optional second "ordering"
  672. facet.
  673. If you have a reference to the collector you can use
  674. ``CollapseCollector.collapsed_counts`` to access the number of documents
  675. eliminated based on each key::
  676. tc = TopCollector(limit=20)
  677. cc = CollapseCollector(tc, "group", limit=3)
  678. mysearcher.search_with_collector(myquery, cc)
  679. print(cc.collapsed_counts)
  680. See :ref:`collapsing` for more information.
  681. """
  682. def __init__(self, child, keyfacet, limit=1, order=None):
  683. """
  684. :param child: the collector to wrap.
  685. :param keyfacet: a :class:`whoosh.sorting.Facet` to use for collapsing.
  686. All but the top N documents that share a key will be eliminated
  687. from the results.
  688. :param limit: the maximum number of documents to keep for each key.
  689. :param order: an optional :class:`whoosh.sorting.Facet` to use
  690. to determine the "top" document(s) to keep when collapsing. The
  691. default (``orderfaceet=None``) uses the results order (e.g. the
  692. highest score in a scored search).
  693. """
  694. self.child = child
  695. self.keyfacet = sorting.MultiFacet.from_sortedby(keyfacet)
  696. self.limit = limit
  697. if order:
  698. self.orderfacet = sorting.MultiFacet.from_sortedby(order)
  699. else:
  700. self.orderfacet = None
  701. def prepare(self, top_searcher, q, context):
  702. # Categorizer for getting the collapse key of a document
  703. self.keyer = self.keyfacet.categorizer(top_searcher)
  704. # Categorizer for getting the collapse order of a document
  705. self.orderer = None
  706. if self.orderfacet:
  707. self.orderer = self.orderfacet.categorizer(top_searcher)
  708. # Dictionary mapping keys to lists of (sortkey, global_docnum) pairs
  709. # representing the best docs for that key
  710. self.lists = defaultdict(list)
  711. # Dictionary mapping keys to the number of documents that have been
  712. # filtered out with that key
  713. self.collapsed_counts = defaultdict(int)
  714. # Total number of documents filtered out by collapsing
  715. self.collapsed_total = 0
  716. # If the keyer or orderer require a valid matcher, tell the child
  717. # collector we need it
  718. needs_current = (context.needs_current
  719. or self.keyer.needs_current
  720. or (self.orderer and self.orderer.needs_current))
  721. self.child.prepare(top_searcher, q,
  722. context.set(needs_current=needs_current))
  723. def set_subsearcher(self, subsearcher, offset):
  724. WrappingCollector.set_subsearcher(self, subsearcher, offset)
  725. # Tell the keyer and (optional) orderer about the new subsearcher
  726. self.keyer.set_searcher(subsearcher, offset)
  727. if self.orderer:
  728. self.orderer.set_searcher(subsearcher, offset)
  729. def all_ids(self):
  730. child = self.child
  731. limit = self.limit
  732. counters = defaultdict(int)
  733. for subsearcher, offset in child.subsearchers():
  734. self.set_subsearcher(subsearcher, offset)
  735. matcher = child.matcher
  736. keyer = self.keyer
  737. for sub_docnum in child.matches():
  738. ckey = keyer.key_for(matcher, sub_docnum)
  739. if ckey is not None:
  740. if ckey in counters and counters[ckey] >= limit:
  741. continue
  742. else:
  743. counters[ckey] += 1
  744. yield offset + sub_docnum
  745. def count(self):
  746. if self.child.computes_count():
  747. return self.child.count() - self.collapsed_total
  748. else:
  749. return ilen(self.all_ids())
  750. def collect_matches(self):
  751. lists = self.lists
  752. limit = self.limit
  753. keyer = self.keyer
  754. orderer = self.orderer
  755. collapsed_counts = self.collapsed_counts
  756. child = self.child
  757. matcher = child.matcher
  758. offset = child.offset
  759. for sub_docnum in child.matches():
  760. # Collapsing category key
  761. ckey = keyer.key_to_name(keyer.key_for(matcher, sub_docnum))
  762. if not ckey:
  763. # If the document isn't in a collapsing category, just add it
  764. child.collect(sub_docnum)
  765. else:
  766. global_docnum = offset + sub_docnum
  767. if orderer:
  768. # If user specified a collapse order, use it
  769. sortkey = orderer.key_for(child.matcher, sub_docnum)
  770. else:
  771. # Otherwise, use the results order
  772. sortkey = child.sort_key(sub_docnum)
  773. # Current list of best docs for this collapse key
  774. best = lists[ckey]
  775. add = False
  776. if len(best) < limit:
  777. # If the heap is not full yet, just add this document
  778. add = True
  779. elif sortkey < best[-1][0]:
  780. # If the heap is full but this document has a lower sort
  781. # key than the highest key currently on the heap, replace
  782. # the "least-best" document
  783. # Tell the child collector to remove the document
  784. child.remove(best.pop()[1])
  785. add = True
  786. if add:
  787. insort(best, (sortkey, global_docnum))
  788. child.collect(sub_docnum)
  789. else:
  790. # Remember that a document was filtered
  791. collapsed_counts[ckey] += 1
  792. self.collapsed_total += 1
  793. def results(self):
  794. r = self.child.results()
  795. r.collapsed_counts = self.collapsed_counts
  796. return r
  797. # Time limit collector
  798. class TimeLimitCollector(WrappingCollector):
  799. """A collector that raises a :class:`TimeLimit` exception if the search
  800. does not complete within a certain number of seconds::
  801. uc = collectors.UnlimitedCollector()
  802. tlc = TimeLimitedCollector(uc, timelimit=5.8)
  803. try:
  804. mysearcher.search_with_collector(myquery, tlc)
  805. except collectors.TimeLimit:
  806. print("The search ran out of time!")
  807. # We can still get partial results from the collector
  808. print(tlc.results())
  809. IMPORTANT: On Unix systems (systems where signal.SIGALRM is defined), the
  810. code uses signals to stop searching immediately when the time limit is
  811. reached. On Windows, the OS does not support this functionality, so the
  812. search only checks the time between each found document, so if a matcher
  813. is slow the search could exceed the time limit.
  814. """
  815. def __init__(self, child, timelimit, greedy=False, use_alarm=True):
  816. """
  817. :param child: the collector to wrap.
  818. :param timelimit: the maximum amount of time (in seconds) to
  819. allow for searching. If the search takes longer than this, it will
  820. raise a ``TimeLimit`` exception.
  821. :param greedy: if ``True``, the collector will finish adding the most
  822. recent hit before raising the ``TimeLimit`` exception.
  823. :param use_alarm: if ``True`` (the default), the collector will try to
  824. use signal.SIGALRM (on UNIX).
  825. """
  826. self.child = child
  827. self.timelimit = timelimit
  828. self.greedy = greedy
  829. if use_alarm:
  830. import signal
  831. self.use_alarm = use_alarm and hasattr(signal, "SIGALRM")
  832. else:
  833. self.use_alarm = False
  834. self.timer = None
  835. self.timedout = False
  836. def prepare(self, top_searcher, q, context):
  837. self.child.prepare(top_searcher, q, context)
  838. self.timedout = False
  839. if self.use_alarm:
  840. import signal
  841. signal.signal(signal.SIGALRM, self._was_signaled)
  842. # Start a timer thread. If the timer fires, it will call this object's
  843. # _timestop() method
  844. self.timer = threading.Timer(self.timelimit, self._timestop)
  845. self.timer.start()
  846. def _timestop(self):
  847. # Called when the timer expires
  848. self.timer = None
  849. # Set an attribute that will be noticed in the collect_matches() loop
  850. self.timedout = True
  851. if self.use_alarm:
  852. import signal
  853. os.kill(os.getpid(), signal.SIGALRM)
  854. def _was_signaled(self, signum, frame):
  855. raise TimeLimit
  856. def collect_matches(self):
  857. child = self.child
  858. greedy = self.greedy
  859. for sub_docnum in child.matches():
  860. # If the timer fired since the last loop and we're not greedy,
  861. # raise the exception
  862. if self.timedout and not greedy:
  863. raise TimeLimit
  864. child.collect(sub_docnum)
  865. # If the timer fired since we entered the loop or it fired earlier
  866. # but we were greedy, raise now
  867. if self.timedout:
  868. raise TimeLimit
  869. def finish(self):
  870. if self.timer:
  871. self.timer.cancel()
  872. self.timer = None
  873. self.child.finish()
  874. # Matched terms collector
  875. class TermsCollector(WrappingCollector):
  876. """A collector that remembers which terms appeared in which terms appeared
  877. in each matched document.
  878. This collector is used if you specify ``terms=True`` in the
  879. :meth:`whoosh.searching.Searcher.search` method.
  880. If you have a reference to the collector can also use
  881. ``TermsCollector.termslist`` to access the term lists directly::
  882. uc = collectors.UnlimitedCollector()
  883. tc = TermsCollector(uc)
  884. mysearcher.search_with_collector(myquery, tc)
  885. # tc.termdocs is a dictionary mapping (fieldname, text) tuples to
  886. # sets of document numbers
  887. print(tc.termdocs)
  888. # tc.docterms is a dictionary mapping docnums to lists of
  889. # (fieldname, text) tuples
  890. print(tc.docterms)
  891. """
  892. def __init__(self, child, settype=set):
  893. self.child = child
  894. self.settype = settype
  895. def prepare(self, top_searcher, q, context):
  896. # This collector requires a valid matcher at each step
  897. self.child.prepare(top_searcher, q, context.set(needs_current=True))
  898. # A dictionary mapping (fieldname, text) pairs to arrays of docnums
  899. self.termdocs = defaultdict(lambda: array("I"))
  900. # A dictionary mapping docnums to lists of (fieldname, text) pairs
  901. self.docterms = defaultdict(list)
  902. def set_subsearcher(self, subsearcher, offset):
  903. WrappingCollector.set_subsearcher(self, subsearcher, offset)
  904. # Store a list of all the term matchers in the matcher tree
  905. self.termmatchers = list(self.child.matcher.term_matchers())
  906. def collect(self, sub_docnum):
  907. child = self.child
  908. termdocs = self.termdocs
  909. docterms = self.docterms
  910. child.collect(sub_docnum)
  911. global_docnum = child.offset + sub_docnum
  912. # For each term matcher...
  913. for tm in self.termmatchers:
  914. # If the term matcher is matching the current document...
  915. if tm.is_active() and tm.id() == sub_docnum:
  916. # Add it to the list of matching documents for the term
  917. term = tm.term()
  918. termdocs[term].append(global_docnum)
  919. docterms[global_docnum].append(term)
  920. def results(self):
  921. r = self.child.results()
  922. r.termdocs = dict(self.termdocs)
  923. r.docterms = dict(self.docterms)
  924. return r