mcore.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622
  1. # Copyright 2010 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. """
  28. This module contains "matcher" classes. Matchers deal with posting lists. The
  29. most basic matcher, which reads the list of postings for a term, will be
  30. provided by the backend implementation (for example,
  31. :class:`whoosh.filedb.filepostings.FilePostingReader`). The classes in this
  32. module provide additional functionality, such as combining the results of two
  33. matchers, or modifying the results of a matcher.
  34. You do not need to deal with the classes in this module unless you need to
  35. write your own Matcher implementation to provide some new functionality. These
  36. classes are not instantiated by the user. They are usually created by a
  37. :class:`~whoosh.query.Query` object's :meth:`~whoosh.query.Query.matcher()`
  38. method, which returns the appropriate matcher to implement the query (for
  39. example, the :class:`~whoosh.query.Or` query's
  40. :meth:`~whoosh.query.Or.matcher()` method returns a
  41. :py:class:`~whoosh.matching.UnionMatcher` object).
  42. Certain backends support "quality" optimizations. These backends have the
  43. ability to skip ahead if it knows the current block of postings can't
  44. contribute to the top N documents. If the matcher tree and backend support
  45. these optimizations, the matcher's :meth:`Matcher.supports_block_quality()`
  46. method will return ``True``.
  47. """
  48. import sys
  49. from itertools import repeat
  50. from whoosh.compat import izip, xrange
  51. from whoosh.compat import abstractmethod
  52. # Exceptions
  53. class ReadTooFar(Exception):
  54. """Raised when :meth:`~whoosh.matching.Matcher.next()` or
  55. :meth:`~whoosh.matching.Matcher.skip_to()` are called on an inactive
  56. matcher.
  57. """
  58. class NoQualityAvailable(Exception):
  59. """Raised when quality methods are called on a matcher that does not
  60. support block quality optimizations.
  61. """
  62. # Classes
  63. class Matcher(object):
  64. """Base class for all matchers.
  65. """
  66. @abstractmethod
  67. def is_active(self):
  68. """Returns True if this matcher is still "active", that is, it has not
  69. yet reached the end of the posting list.
  70. """
  71. raise NotImplementedError
  72. @abstractmethod
  73. def reset(self):
  74. """Returns to the start of the posting list.
  75. Note that reset() may not do what you expect after you call
  76. :meth:`Matcher.replace()`, since this can mean calling reset() not on
  77. the original matcher, but on an optimized replacement.
  78. """
  79. raise NotImplementedError
  80. def term(self):
  81. """Returns a ``("fieldname", "termtext")`` tuple for the term this
  82. matcher matches, or None if this matcher is not a term matcher.
  83. """
  84. return None
  85. def term_matchers(self):
  86. """Returns an iterator of term matchers in this tree.
  87. """
  88. if self.term() is not None:
  89. yield self
  90. else:
  91. for cm in self.children():
  92. for m in cm.term_matchers():
  93. yield m
  94. def matching_terms(self, id=None):
  95. """Returns an iterator of ``("fieldname", "termtext")`` tuples for the
  96. **currently matching** term matchers in this tree.
  97. """
  98. if not self.is_active():
  99. return
  100. if id is None:
  101. id = self.id()
  102. elif id != self.id():
  103. return
  104. t = self.term()
  105. if t is None:
  106. for c in self.children():
  107. for t in c.matching_terms(id):
  108. yield t
  109. else:
  110. yield t
  111. def is_leaf(self):
  112. return not bool(self.children())
  113. def children(self):
  114. """Returns an (possibly empty) list of the submatchers of this
  115. matcher.
  116. """
  117. return []
  118. def replace(self, minquality=0):
  119. """Returns a possibly-simplified version of this matcher. For example,
  120. if one of the children of a UnionMatcher is no longer active, calling
  121. this method on the UnionMatcher will return the other child.
  122. """
  123. return self
  124. @abstractmethod
  125. def copy(self):
  126. """Returns a copy of this matcher.
  127. """
  128. raise NotImplementedError
  129. def depth(self):
  130. """Returns the depth of the tree under this matcher, or 0 if this
  131. matcher does not have any children.
  132. """
  133. return 0
  134. def supports_block_quality(self):
  135. """Returns True if this matcher supports the use of ``quality`` and
  136. ``block_quality``.
  137. """
  138. return False
  139. def max_quality(self):
  140. """Returns the maximum possible quality measurement for this matcher,
  141. according to the current weighting algorithm. Raises
  142. ``NoQualityAvailable`` if the matcher or weighting do not support
  143. quality measurements.
  144. """
  145. raise NoQualityAvailable(self.__class__)
  146. def block_quality(self):
  147. """Returns a quality measurement of the current block of postings,
  148. according to the current weighting algorithm. Raises
  149. ``NoQualityAvailable`` if the matcher or weighting do not support
  150. quality measurements.
  151. """
  152. raise NoQualityAvailable(self.__class__)
  153. @abstractmethod
  154. def id(self):
  155. """Returns the ID of the current posting.
  156. """
  157. raise NotImplementedError
  158. def all_ids(self):
  159. """Returns a generator of all IDs in the matcher.
  160. What this method returns for a matcher that has already read some
  161. postings (whether it only yields the remaining postings or all postings
  162. from the beginning) is undefined, so it's best to only use this method
  163. on fresh matchers.
  164. """
  165. i = 0
  166. m = self
  167. while m.is_active():
  168. yield m.id()
  169. m.next()
  170. i += 1
  171. if i == 10:
  172. m = m.replace()
  173. i = 0
  174. def all_items(self):
  175. """Returns a generator of all (ID, encoded value) pairs in the matcher.
  176. What this method returns for a matcher that has already read some
  177. postings (whether it only yields the remaining postings or all postings
  178. from the beginning) is undefined, so it's best to only use this method
  179. on fresh matchers.
  180. """
  181. i = 0
  182. m = self
  183. while self.is_active():
  184. yield (m.id(), m.value())
  185. m.next()
  186. i += 1
  187. if i == 10:
  188. m = m.replace()
  189. i = 0
  190. def items_as(self, astype):
  191. """Returns a generator of all (ID, decoded value) pairs in the matcher.
  192. What this method returns for a matcher that has already read some
  193. postings (whether it only yields the remaining postings or all postings
  194. from the beginning) is undefined, so it's best to only use this method
  195. on fresh matchers.
  196. """
  197. while self.is_active():
  198. yield (self.id(), self.value_as(astype))
  199. self.next()
  200. @abstractmethod
  201. def value(self):
  202. """Returns the encoded value of the current posting.
  203. """
  204. raise NotImplementedError
  205. @abstractmethod
  206. def supports(self, astype):
  207. """Returns True if the field's format supports the named data type,
  208. for example 'frequency' or 'characters'.
  209. """
  210. raise NotImplementedError("supports not implemented in %s"
  211. % self.__class__)
  212. @abstractmethod
  213. def value_as(self, astype):
  214. """Returns the value(s) of the current posting as the given type.
  215. """
  216. raise NotImplementedError("value_as not implemented in %s"
  217. % self.__class__)
  218. def spans(self):
  219. """Returns a list of :class:`~whoosh.query.spans.Span` objects for the
  220. matches in this document. Raises an exception if the field being
  221. searched does not store positions.
  222. """
  223. from whoosh.query.spans import Span
  224. if self.supports("characters"):
  225. return [Span(pos, startchar=startchar, endchar=endchar)
  226. for pos, startchar, endchar in self.value_as("characters")]
  227. elif self.supports("positions"):
  228. return [Span(pos) for pos in self.value_as("positions")]
  229. else:
  230. raise Exception("Field does not support spans")
  231. def skip_to(self, id):
  232. """Moves this matcher to the first posting with an ID equal to or
  233. greater than the given ID.
  234. """
  235. while self.is_active() and self.id() < id:
  236. self.next()
  237. def skip_to_quality(self, minquality):
  238. """Moves this matcher to the next block with greater than the given
  239. minimum quality value.
  240. """
  241. raise NotImplementedError(self.__class__.__name__)
  242. @abstractmethod
  243. def next(self):
  244. """Moves this matcher to the next posting.
  245. """
  246. raise NotImplementedError(self.__class__.__name__)
  247. def weight(self):
  248. """Returns the weight of the current posting.
  249. """
  250. return self.value_as("weight")
  251. @abstractmethod
  252. def score(self):
  253. """Returns the score of the current posting.
  254. """
  255. raise NotImplementedError(self.__class__.__name__)
  256. def __eq__(self, other):
  257. return self.__class__ is type(other)
  258. def __lt__(self, other):
  259. return type(other) is self.__class__
  260. def __ne__(self, other):
  261. return not self.__eq__(other)
  262. def __gt__(self, other):
  263. return not (self.__lt__(other) or self.__eq__(other))
  264. def __le__(self, other):
  265. return self.__eq__(other) or self.__lt__(other)
  266. def __ge__(self, other):
  267. return self.__eq__(other) or self.__gt__(other)
  268. # Simple intermediate classes
  269. class ConstantScoreMatcher(Matcher):
  270. def __init__(self, score=1.0):
  271. self._score = score
  272. def supports_block_quality(self):
  273. return True
  274. def max_quality(self):
  275. return self._score
  276. def block_quality(self):
  277. return self._score
  278. def skip_to_quality(self, minquality):
  279. if minquality >= self._score:
  280. self.go_inactive()
  281. def score(self):
  282. return self._score
  283. # Null matcher
  284. class NullMatcherClass(Matcher):
  285. """Matcher with no postings which is never active.
  286. """
  287. def __call__(self):
  288. return self
  289. def __repr__(self):
  290. return "<NullMatcher>"
  291. def supports_block_quality(self):
  292. return True
  293. def max_quality(self):
  294. return 0
  295. def block_quality(self):
  296. return 0
  297. def skip_to_quality(self, minquality):
  298. return 0
  299. def is_active(self):
  300. return False
  301. def reset(self):
  302. pass
  303. def all_ids(self):
  304. return []
  305. def copy(self):
  306. return self
  307. # Singleton instance
  308. NullMatcher = NullMatcherClass()
  309. class ListMatcher(Matcher):
  310. """Synthetic matcher backed by a list of IDs.
  311. """
  312. def __init__(self, ids, weights=None, values=None, format=None,
  313. scorer=None, position=0, all_weights=None, term=None,
  314. terminfo=None):
  315. """
  316. :param ids: a list of doc IDs.
  317. :param weights: a list of weights corresponding to the list of IDs.
  318. If this argument is not supplied, a list of 1.0 values is used.
  319. :param values: a list of encoded values corresponding to the list of
  320. IDs.
  321. :param format: a :class:`whoosh.formats.Format` object representing the
  322. format of the field.
  323. :param scorer: a :class:`whoosh.scoring.BaseScorer` object for scoring
  324. the postings.
  325. :param term: a ``("fieldname", "text")`` tuple, or None if this is not
  326. a term matcher.
  327. """
  328. self._ids = ids
  329. self._weights = weights
  330. self._all_weights = all_weights
  331. self._values = values
  332. self._i = position
  333. self._format = format
  334. self._scorer = scorer
  335. self._term = term
  336. self._terminfo = terminfo
  337. def __repr__(self):
  338. return "<%s>" % self.__class__.__name__
  339. def is_active(self):
  340. return self._i < len(self._ids)
  341. def reset(self):
  342. self._i = 0
  343. def skip_to(self, id):
  344. if not self.is_active():
  345. raise ReadTooFar
  346. if id < self.id():
  347. return
  348. while self._i < len(self._ids) and self._ids[self._i] < id:
  349. self._i += 1
  350. def term(self):
  351. return self._term
  352. def copy(self):
  353. return self.__class__(self._ids, self._weights, self._values,
  354. self._format, self._scorer, self._i,
  355. self._all_weights)
  356. def replace(self, minquality=0):
  357. if not self.is_active():
  358. return NullMatcher()
  359. elif minquality and self.max_quality() < minquality:
  360. return NullMatcher()
  361. else:
  362. return self
  363. def supports_block_quality(self):
  364. return (self._scorer is not None
  365. and self._scorer.supports_block_quality())
  366. def max_quality(self):
  367. # This matcher treats all postings in the list as one "block", so the
  368. # block quality is the same as the quality of the entire list
  369. if self._scorer:
  370. return self._scorer.block_quality(self)
  371. else:
  372. return self.block_max_weight()
  373. def block_quality(self):
  374. return self._scorer.block_quality(self)
  375. def skip_to_quality(self, minquality):
  376. while self._i < len(self._ids) and self.block_quality() <= minquality:
  377. self._i += 1
  378. return 0
  379. def id(self):
  380. return self._ids[self._i]
  381. def all_ids(self):
  382. return iter(self._ids)
  383. def all_items(self):
  384. values = self._values
  385. if values is None:
  386. values = repeat('')
  387. return izip(self._ids, values)
  388. def value(self):
  389. if self._values:
  390. v = self._values[self._i]
  391. if isinstance(v, list):
  392. # This object supports "values" that are actually lists of
  393. # value strings. This is to support combining the results of
  394. # several different matchers into a single ListMatcher (see the
  395. # TOO_MANY_CLAUSES functionality of MultiTerm). We combine the
  396. # values here instead of combining them first and then making
  397. # the ListMatcher to avoid wasting time combining values if the
  398. # consumer never asks for them.
  399. assert len(v) > 0
  400. if len(v) == 1:
  401. v = v[0]
  402. else:
  403. v = self._format.combine(v)
  404. # Replace the list with the computed value string
  405. self._values[self._i] = v
  406. return v
  407. else:
  408. return ''
  409. def value_as(self, astype):
  410. decoder = self._format.decoder(astype)
  411. return decoder(self.value())
  412. def supports(self, astype):
  413. return self._format.supports(astype)
  414. def next(self):
  415. self._i += 1
  416. def weight(self):
  417. if self._all_weights:
  418. return self._all_weights
  419. elif self._weights:
  420. return self._weights[self._i]
  421. else:
  422. return 1.0
  423. def block_min_length(self):
  424. return self._terminfo.min_length()
  425. def block_max_length(self):
  426. return self._terminfo.max_length()
  427. def block_max_weight(self):
  428. if self._all_weights:
  429. return self._all_weights
  430. elif self._weights:
  431. return max(self._weights)
  432. elif self._terminfo is not None:
  433. return self._terminfo.max_weight()
  434. else:
  435. return 1.0
  436. def score(self):
  437. if self._scorer:
  438. return self._scorer.score(self)
  439. else:
  440. return self.weight()
  441. # Term/vector leaf posting matcher middleware
  442. class LeafMatcher(Matcher):
  443. # Subclasses need to set
  444. # self.scorer -- a Scorer object or None
  445. # self.format -- Format object for the posting values
  446. def __repr__(self):
  447. return "%s(%r, %s)" % (self.__class__.__name__, self.term(),
  448. self.is_active())
  449. def term(self):
  450. return self._term
  451. def items_as(self, astype):
  452. decoder = self.format.decoder(astype)
  453. for id, value in self.all_items():
  454. yield (id, decoder(value))
  455. def supports(self, astype):
  456. return self.format.supports(astype)
  457. def value_as(self, astype):
  458. decoder = self.format.decoder(astype)
  459. return decoder(self.value())
  460. def spans(self):
  461. from whoosh.query.spans import Span
  462. if self.supports("characters"):
  463. return [Span(pos, startchar=startchar, endchar=endchar)
  464. for pos, startchar, endchar in self.value_as("characters")]
  465. elif self.supports("positions"):
  466. return [Span(pos) for pos in self.value_as("positions")]
  467. else:
  468. raise Exception("Field does not support positions (%r)"
  469. % self.term())
  470. def supports_block_quality(self):
  471. return self.scorer and self.scorer.supports_block_quality()
  472. def max_quality(self):
  473. return self.scorer.max_quality()
  474. def block_quality(self):
  475. return self.scorer.block_quality(self)
  476. def score(self):
  477. return self.scorer.score(self)