spans.py 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881
  1. # Copyright 2010 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. """
  28. This module contains Query objects that deal with "spans".
  29. Span queries allow for positional constraints on matching documents. For
  30. example, the :class:`whoosh.spans.SpanNear` query matches documents where one
  31. term occurs near another. Because you can nest span queries, and wrap them
  32. around almost any non-span query, you can create very complex constraints.
  33. For example, to find documents containing "whoosh" at most 5 positions before
  34. "library" in the "text" field::
  35. from whoosh import query, spans
  36. t1 = query.Term("text", "whoosh")
  37. t2 = query.Term("text", "library")
  38. q = spans.SpanNear(t1, t2, slop=5)
  39. """
  40. from whoosh.matching import mcore, wrappers, binary
  41. from whoosh.query import Query, And, AndMaybe, Or, Term
  42. from whoosh.util import make_binary_tree
  43. # Span class
  44. class Span(object):
  45. __slots__ = ("start", "end", "startchar", "endchar", "boost")
  46. def __init__(self, start, end=None, startchar=None, endchar=None,
  47. boost=1.0):
  48. if end is None:
  49. end = start
  50. assert start <= end
  51. self.start = start
  52. self.end = end
  53. self.startchar = startchar
  54. self.endchar = endchar
  55. self.boost = boost
  56. def __repr__(self):
  57. if self.startchar is not None or self.endchar is not None:
  58. return "<%d-%d %d:%d>" % (self.start, self.end, self.startchar,
  59. self.endchar)
  60. else:
  61. return "<%d-%d>" % (self.start, self.end)
  62. def __eq__(self, span):
  63. return (self.start == span.start
  64. and self.end == span.end
  65. and self.startchar == span.startchar
  66. and self.endchar == span.endchar)
  67. def __ne__(self, span):
  68. return self.start != span.start or self.end != span.end
  69. def __lt__(self, span):
  70. return self.start < span.start
  71. def __gt__(self, span):
  72. return self.start > span.start
  73. def __hash__(self):
  74. return hash((self.start, self.end))
  75. @classmethod
  76. def merge(cls, spans):
  77. """Merges overlapping and touches spans in the given list of spans.
  78. Note that this modifies the original list.
  79. >>> spans = [Span(1,2), Span(3)]
  80. >>> Span.merge(spans)
  81. >>> spans
  82. [<1-3>]
  83. """
  84. i = 0
  85. while i < len(spans) - 1:
  86. here = spans[i]
  87. j = i + 1
  88. while j < len(spans):
  89. there = spans[j]
  90. if there.start > here.end + 1:
  91. break
  92. if here.touches(there) or here.overlaps(there):
  93. here = here.to(there)
  94. spans[i] = here
  95. del spans[j]
  96. else:
  97. j += 1
  98. i += 1
  99. return spans
  100. def to(self, span):
  101. if self.startchar is None:
  102. minchar = span.startchar
  103. elif span.startchar is None:
  104. minchar = self.startchar
  105. else:
  106. minchar = min(self.startchar, span.startchar)
  107. if self.endchar is None:
  108. maxchar = span.endchar
  109. elif span.endchar is None:
  110. maxchar = self.endchar
  111. else:
  112. maxchar = max(self.endchar, span.endchar)
  113. minpos = min(self.start, span.start)
  114. maxpos = max(self.end, span.end)
  115. return self.__class__(minpos, maxpos, minchar, maxchar)
  116. def overlaps(self, span):
  117. return ((self.start >= span.start and self.start <= span.end)
  118. or (self.end >= span.start and self.end <= span.end)
  119. or (span.start >= self.start and span.start <= self.end)
  120. or (span.end >= self.start and span.end <= self.end))
  121. def surrounds(self, span):
  122. return self.start < span.start and self.end > span.end
  123. def is_within(self, span):
  124. return self.start >= span.start and self.end <= span.end
  125. def is_before(self, span):
  126. return self.end < span.start
  127. def is_after(self, span):
  128. return self.start > span.end
  129. def touches(self, span):
  130. return self.start == span.end + 1 or self.end == span.start - 1
  131. def distance_to(self, span):
  132. if self.overlaps(span):
  133. return 0
  134. elif self.is_before(span):
  135. return span.start - self.end
  136. else:
  137. return self.start - span.end
  138. def bisect_spans(spans, start):
  139. lo = 0
  140. hi = len(spans)
  141. while lo < hi:
  142. mid = (lo + hi) // 2
  143. if spans[mid].start < start:
  144. lo = mid + 1
  145. else:
  146. hi = mid
  147. return lo
  148. # Base matchers
  149. class SpanWrappingMatcher(wrappers.WrappingMatcher):
  150. """An abstract matcher class that wraps a "regular" matcher. This matcher
  151. uses the sub-matcher's matching logic, but only matches documents that have
  152. matching spans, i.e. where ``_get_spans()`` returns a non-empty list.
  153. Subclasses must implement the ``_get_spans()`` method, which returns a list
  154. of valid spans for the current document.
  155. """
  156. def __init__(self, child):
  157. super(SpanWrappingMatcher, self).__init__(child)
  158. self._spans = None
  159. if self.is_active():
  160. self._find_next()
  161. def copy(self):
  162. m = self.__class__(self.child.copy())
  163. m._spans = self._spans
  164. return m
  165. def _replacement(self, newchild):
  166. return self.__class__(newchild)
  167. def _find_next(self):
  168. if not self.is_active():
  169. return
  170. child = self.child
  171. r = False
  172. spans = self._get_spans()
  173. while child.is_active() and not spans:
  174. r = child.next() or r
  175. if not child.is_active():
  176. return True
  177. spans = self._get_spans()
  178. self._spans = spans
  179. return r
  180. def spans(self):
  181. return self._spans
  182. def next(self):
  183. self.child.next()
  184. self._find_next()
  185. def skip_to(self, id):
  186. self.child.skip_to(id)
  187. self._find_next()
  188. def all_ids(self):
  189. while self.is_active():
  190. if self.spans():
  191. yield self.id()
  192. self.next()
  193. class SpanBiMatcher(SpanWrappingMatcher):
  194. def copy(self):
  195. return self.__class__(self.a.copy(), self.b.copy())
  196. def depth(self):
  197. return 1 + max(self.a.depth(), self.b.depth())
  198. def replace(self, minquality=0):
  199. # TODO: fix this
  200. if not self.is_active():
  201. return mcore.NullMatcher()
  202. return self
  203. # Queries
  204. class SpanQuery(Query):
  205. """Abstract base class for span-based queries. Each span query type wraps
  206. a "regular" query that implements the basic document-matching functionality
  207. (for example, SpanNear wraps an And query, because SpanNear requires that
  208. the two sub-queries occur in the same documents. The wrapped query is
  209. stored in the ``q`` attribute.
  210. Subclasses usually only need to implement the initializer to set the
  211. wrapped query, and ``matcher()`` to return a span-aware matcher object.
  212. """
  213. def _subm(self, s, context=None):
  214. return self.q.matcher(s, context)
  215. def __repr__(self):
  216. return "%s(%r)" % (self.__class__.__name__, self.q)
  217. def __eq__(self, other):
  218. return (other and self.__class__ is other.__class__
  219. and self.q == other.q)
  220. def __hash__(self):
  221. return hash(self.__class__.__name__) ^ hash(self.q)
  222. def field(self):
  223. return None
  224. def needs_spans(self):
  225. return True
  226. class WrappingSpan(SpanQuery):
  227. def is_leaf(self):
  228. return False
  229. def apply(self, fn):
  230. return self.__class__(fn(self.q), limit=self.limit)
  231. def field(self):
  232. return self.q.field()
  233. class SpanFirst(WrappingSpan):
  234. """Matches spans that end within the first N positions. This lets you
  235. for example only match terms near the beginning of the document.
  236. """
  237. def __init__(self, q, limit=0):
  238. """
  239. :param q: the query to match.
  240. :param limit: the query must match within this position at the start
  241. of a document. The default is ``0``, which means the query must
  242. match at the first position.
  243. """
  244. self.q = q
  245. self.limit = limit
  246. def __eq__(self, other):
  247. return (other and self.__class__ is other.__class__
  248. and self.q == other.q and self.limit == other.limit)
  249. def __hash__(self):
  250. return hash(self.q) ^ hash(self.limit)
  251. def matcher(self, searcher, context=None):
  252. m = self._subm(searcher, context)
  253. return SpanFirst.SpanFirstMatcher(m, limit=self.limit)
  254. class SpanFirstMatcher(SpanWrappingMatcher):
  255. def __init__(self, child, limit=0):
  256. self.limit = limit
  257. super(SpanFirst.SpanFirstMatcher, self).__init__(child)
  258. def copy(self):
  259. return self.__class__(self.child.copy(), limit=self.limit)
  260. def _replacement(self, newchild):
  261. return self.__class__(newchild, limit=self.limit)
  262. def _get_spans(self):
  263. return [span for span in self.child.spans()
  264. if span.end <= self.limit]
  265. class SpanNear(SpanQuery):
  266. """
  267. Note: for new code, use :class:`SpanNear2` instead of this class. SpanNear2
  268. takes a list of sub-queries instead of requiring you to create a binary
  269. tree of query objects.
  270. Matches queries that occur near each other. By default, only matches
  271. queries that occur right next to each other (slop=1) and in order
  272. (ordered=True).
  273. For example, to find documents where "whoosh" occurs next to "library"
  274. in the "text" field::
  275. from whoosh import query, spans
  276. t1 = query.Term("text", "whoosh")
  277. t2 = query.Term("text", "library")
  278. q = spans.SpanNear(t1, t2)
  279. To find documents where "whoosh" occurs at most 5 positions before
  280. "library"::
  281. q = spans.SpanNear(t1, t2, slop=5)
  282. To find documents where "whoosh" occurs at most 5 positions before or after
  283. "library"::
  284. q = spans.SpanNear(t1, t2, slop=5, ordered=False)
  285. You can use the ``phrase()`` class method to create a tree of SpanNear
  286. queries to match a list of terms::
  287. q = spans.SpanNear.phrase("text", ["whoosh", "search", "library"],
  288. slop=2)
  289. """
  290. def __init__(self, a, b, slop=1, ordered=True, mindist=1):
  291. """
  292. :param a: the first query to match.
  293. :param b: the second query that must occur within "slop" positions of
  294. the first query.
  295. :param slop: the number of positions within which the queries must
  296. occur. Default is 1, meaning the queries must occur right next
  297. to each other.
  298. :param ordered: whether a must occur before b. Default is True.
  299. :pram mindist: the minimum distance allowed between the queries.
  300. """
  301. self.q = And([a, b])
  302. self.a = a
  303. self.b = b
  304. self.slop = slop
  305. self.ordered = ordered
  306. self.mindist = mindist
  307. def __repr__(self):
  308. return ("%s(%r, slop=%d, ordered=%s, mindist=%d)"
  309. % (self.__class__.__name__, self.q, self.slop, self.ordered,
  310. self.mindist))
  311. def __eq__(self, other):
  312. return (other and self.__class__ == other.__class__
  313. and self.q == other.q and self.slop == other.slop
  314. and self.ordered == other.ordered
  315. and self.mindist == other.mindist)
  316. def __hash__(self):
  317. return (hash(self.a) ^ hash(self.b) ^ hash(self.slop)
  318. ^ hash(self.ordered) ^ hash(self.mindist))
  319. def is_leaf(self):
  320. return False
  321. def apply(self, fn):
  322. return self.__class__(fn(self.a), fn(self.b), slop=self.slop,
  323. ordered=self.ordered, mindist=self.mindist)
  324. def matcher(self, searcher, context=None):
  325. ma = self.a.matcher(searcher, context)
  326. mb = self.b.matcher(searcher, context)
  327. return SpanNear.SpanNearMatcher(ma, mb, slop=self.slop,
  328. ordered=self.ordered,
  329. mindist=self.mindist)
  330. @classmethod
  331. def phrase(cls, fieldname, words, slop=1, ordered=True):
  332. """Returns a tree of SpanNear queries to match a list of terms.
  333. This class method is a convenience for constructing a phrase query
  334. using a binary tree of SpanNear queries::
  335. SpanNear.phrase("content", ["alfa", "bravo", "charlie", "delta"])
  336. :param fieldname: the name of the field to search in.
  337. :param words: a sequence of texts to search for.
  338. :param slop: the number of positions within which the terms must
  339. occur. Default is 1, meaning the terms must occur right next
  340. to each other.
  341. :param ordered: whether the terms must occur in order. Default is True.
  342. """
  343. terms = [Term(fieldname, word) for word in words]
  344. return make_binary_tree(cls, terms, slop=slop, ordered=ordered)
  345. class SpanNearMatcher(SpanWrappingMatcher):
  346. def __init__(self, a, b, slop=1, ordered=True, mindist=1):
  347. self.a = a
  348. self.b = b
  349. self.slop = slop
  350. self.ordered = ordered
  351. self.mindist = mindist
  352. isect = binary.IntersectionMatcher(a, b)
  353. super(SpanNear.SpanNearMatcher, self).__init__(isect)
  354. def copy(self):
  355. return self.__class__(self.a.copy(), self.b.copy(), slop=self.slop,
  356. ordered=self.ordered, mindist=self.mindist)
  357. def replace(self, minquality=0):
  358. # TODO: fix this
  359. if not self.is_active():
  360. return mcore.NullMatcher()
  361. return self
  362. def _get_spans(self):
  363. slop = self.slop
  364. mindist = self.mindist
  365. ordered = self.ordered
  366. spans = set()
  367. bspans = self.b.spans()
  368. for aspan in self.a.spans():
  369. for bspan in bspans:
  370. if (bspan.end < aspan.start - slop
  371. or (ordered and aspan.start > bspan.start)):
  372. # B is too far in front of A, or B is in front of A
  373. # *at all* when ordered is True
  374. continue
  375. if bspan.start > aspan.end + slop:
  376. # B is too far from A. Since spans are listed in
  377. # start position order, we know that all spans after
  378. # this one will also be too far.
  379. break
  380. # Check the distance between the spans
  381. dist = aspan.distance_to(bspan)
  382. if mindist <= dist <= slop:
  383. spans.add(aspan.to(bspan))
  384. return sorted(spans)
  385. class SpanNear2(SpanQuery):
  386. """
  387. Matches queries that occur near each other. By default, only matches
  388. queries that occur right next to each other (slop=1) and in order
  389. (ordered=True).
  390. New code should use this query type instead of :class:`SpanNear`.
  391. (Unlike :class:`SpanNear`, this query takes a list of subqueries instead of
  392. requiring you to build a binary tree of query objects. This query should
  393. also be slightly faster due to less overhead.)
  394. For example, to find documents where "whoosh" occurs next to "library"
  395. in the "text" field::
  396. from whoosh import query, spans
  397. t1 = query.Term("text", "whoosh")
  398. t2 = query.Term("text", "library")
  399. q = spans.SpanNear2([t1, t2])
  400. To find documents where "whoosh" occurs at most 5 positions before
  401. "library"::
  402. q = spans.SpanNear2([t1, t2], slop=5)
  403. To find documents where "whoosh" occurs at most 5 positions before or after
  404. "library"::
  405. q = spans.SpanNear2(t1, t2, slop=5, ordered=False)
  406. """
  407. def __init__(self, qs, slop=1, ordered=True, mindist=1):
  408. """
  409. :param qs: a sequence of sub-queries to match.
  410. :param slop: the number of positions within which the queries must
  411. occur. Default is 1, meaning the queries must occur right next
  412. to each other.
  413. :param ordered: whether a must occur before b. Default is True.
  414. :pram mindist: the minimum distance allowed between the queries.
  415. """
  416. self.qs = qs
  417. self.slop = slop
  418. self.ordered = ordered
  419. self.mindist = mindist
  420. def __repr__(self):
  421. return ("%s(%r, slop=%d, ordered=%s, mindist=%d)"
  422. % (self.__class__.__name__, self.qs, self.slop, self.ordered,
  423. self.mindist))
  424. def __eq__(self, other):
  425. return (other and self.__class__ == other.__class__
  426. and self.qs == other.qs and self.slop == other.slop
  427. and self.ordered == other.ordered
  428. and self.mindist == other.mindist)
  429. def __hash__(self):
  430. h = hash(self.slop) ^ hash(self.ordered) ^ hash(self.mindist)
  431. for q in self.qs:
  432. h ^= hash(q)
  433. return h
  434. def _and_query(self):
  435. return q.And(self.qs)
  436. def estimate_size(self, ixreader):
  437. return self._and_query().estimate_size(ixreader)
  438. def estimate_min_size(self, ixreader):
  439. return self._and_query().estimate_min_size(ixreader)
  440. def is_leaf(self):
  441. return False
  442. def children(self):
  443. return self.qs
  444. def apply(self, fn):
  445. return self.__class__([fn(q) for q in self.qs], slop=self.slop,
  446. ordered=self.ordered, mindist=self.mindist)
  447. def matcher(self, searcher, context=None):
  448. ms = [q.matcher(searcher, context) for q in self.qs]
  449. return self.SpanNear2Matcher(ms, slop=self.slop, ordered=self.ordered,
  450. mindist=self.mindist)
  451. class SpanNear2Matcher(SpanWrappingMatcher):
  452. def __init__(self, ms, slop=1, ordered=True, mindist=1):
  453. self.ms = ms
  454. self.slop = slop
  455. self.ordered = ordered
  456. self.mindist = mindist
  457. isect = make_binary_tree(binary.IntersectionMatcher, ms)
  458. super(SpanNear2.SpanNear2Matcher, self).__init__(isect)
  459. def copy(self):
  460. return self.__class__([m.copy() for m in self.ms], slop=self.slop,
  461. ordered=self.ordered, mindist=self.mindist)
  462. def replace(self, minquality=0):
  463. # TODO: fix this
  464. if not self.is_active():
  465. return mcore.NullMatcher()
  466. return self
  467. def _get_spans(self):
  468. slop = self.slop
  469. mindist = self.mindist
  470. ordered = self.ordered
  471. ms = self.ms
  472. aspans = ms[0].spans()
  473. i = 1
  474. while i < len(ms) and aspans:
  475. bspans = ms[i].spans()
  476. spans = set()
  477. for aspan in aspans:
  478. # Use a binary search to find the first position we should
  479. # start looking for possible matches
  480. if ordered:
  481. start = aspan.start
  482. else:
  483. start = max(0, aspan.start - slop)
  484. j = bisect_spans(bspans, start)
  485. while j < len(bspans):
  486. bspan = bspans[j]
  487. j += 1
  488. if (bspan.end < aspan.start - slop
  489. or (ordered and aspan.start > bspan.start)):
  490. # B is too far in front of A, or B is in front of A
  491. # *at all* when ordered is True
  492. continue
  493. if bspan.start > aspan.end + slop:
  494. # B is too far from A. Since spans are listed in
  495. # start position order, we know that all spans after
  496. # this one will also be too far.
  497. break
  498. # Check the distance between the spans
  499. dist = aspan.distance_to(bspan)
  500. if mindist <= dist <= slop:
  501. spans.add(aspan.to(bspan))
  502. aspans = sorted(spans)
  503. i += 1
  504. if i == len(ms):
  505. return aspans
  506. else:
  507. return []
  508. class SpanOr(SpanQuery):
  509. """Matches documents that match any of a list of sub-queries. Unlike
  510. query.Or, this class merges together matching spans from the different
  511. sub-queries when they overlap.
  512. """
  513. def __init__(self, subqs):
  514. """
  515. :param subqs: a list of queries to match.
  516. """
  517. self.q = Or(subqs)
  518. self.subqs = subqs
  519. def is_leaf(self):
  520. return False
  521. def apply(self, fn):
  522. return self.__class__([fn(sq) for sq in self.subqs])
  523. def matcher(self, searcher, context=None):
  524. matchers = [q.matcher(searcher, context) for q in self.subqs]
  525. return make_binary_tree(SpanOr.SpanOrMatcher, matchers)
  526. class SpanOrMatcher(SpanBiMatcher):
  527. def __init__(self, a, b):
  528. self.a = a
  529. self.b = b
  530. um = binary.UnionMatcher(a, b)
  531. super(SpanOr.SpanOrMatcher, self).__init__(um)
  532. def _get_spans(self):
  533. a_active = self.a.is_active()
  534. b_active = self.b.is_active()
  535. if a_active:
  536. a_id = self.a.id()
  537. if b_active:
  538. b_id = self.b.id()
  539. if a_id == b_id:
  540. spans = sorted(set(self.a.spans())
  541. | set(self.b.spans()))
  542. elif a_id < b_id:
  543. spans = self.a.spans()
  544. else:
  545. spans = self.b.spans()
  546. else:
  547. spans = self.a.spans()
  548. else:
  549. spans = self.b.spans()
  550. Span.merge(spans)
  551. return spans
  552. class SpanBiQuery(SpanQuery):
  553. # Intermediate base class for methods common to "a/b" span query types
  554. def is_leaf(self):
  555. return False
  556. def apply(self, fn):
  557. return self.__class__(fn(self.a), fn(self.b))
  558. def matcher(self, searcher, context=None):
  559. ma = self.a.matcher(searcher, context)
  560. mb = self.b.matcher(searcher, context)
  561. return self._Matcher(ma, mb)
  562. class SpanNot(SpanBiQuery):
  563. """Matches spans from the first query only if they don't overlap with
  564. spans from the second query. If there are no non-overlapping spans, the
  565. document does not match.
  566. For example, to match documents that contain "bear" at most 2 places after
  567. "apple" in the "text" field but don't have "cute" between them::
  568. from whoosh import query, spans
  569. t1 = query.Term("text", "apple")
  570. t2 = query.Term("text", "bear")
  571. near = spans.SpanNear(t1, t2, slop=2)
  572. q = spans.SpanNot(near, query.Term("text", "cute"))
  573. """
  574. def __init__(self, a, b):
  575. """
  576. :param a: the query to match.
  577. :param b: do not match any spans that overlap with spans from this
  578. query.
  579. """
  580. self.q = AndMaybe(a, b)
  581. self.a = a
  582. self.b = b
  583. class _Matcher(SpanBiMatcher):
  584. def __init__(self, a, b):
  585. self.a = a
  586. self.b = b
  587. amm = binary.AndMaybeMatcher(a, b)
  588. super(SpanNot._Matcher, self).__init__(amm)
  589. def _get_spans(self):
  590. if self.a.id() == self.b.id():
  591. spans = []
  592. bspans = self.b.spans()
  593. for aspan in self.a.spans():
  594. overlapped = False
  595. for bspan in bspans:
  596. if aspan.overlaps(bspan):
  597. overlapped = True
  598. break
  599. if not overlapped:
  600. spans.append(aspan)
  601. return spans
  602. else:
  603. return self.a.spans()
  604. class SpanContains(SpanBiQuery):
  605. """Matches documents where the spans of the first query contain any spans
  606. of the second query.
  607. For example, to match documents where "apple" occurs at most 10 places
  608. before "bear" in the "text" field and "cute" is between them::
  609. from whoosh import query, spans
  610. t1 = query.Term("text", "apple")
  611. t2 = query.Term("text", "bear")
  612. near = spans.SpanNear(t1, t2, slop=10)
  613. q = spans.SpanContains(near, query.Term("text", "cute"))
  614. """
  615. def __init__(self, a, b):
  616. """
  617. :param a: the query to match.
  618. :param b: the query whose spans must occur within the matching spans
  619. of the first query.
  620. """
  621. self.q = And([a, b])
  622. self.a = a
  623. self.b = b
  624. class _Matcher(SpanBiMatcher):
  625. def __init__(self, a, b):
  626. self.a = a
  627. self.b = b
  628. im = binary.IntersectionMatcher(a, b)
  629. super(SpanContains._Matcher, self).__init__(im)
  630. def _get_spans(self):
  631. spans = []
  632. bspans = self.b.spans()
  633. for aspan in self.a.spans():
  634. for bspan in bspans:
  635. if aspan.start > bspan.end:
  636. continue
  637. if aspan.end < bspan.start:
  638. break
  639. if bspan.is_within(aspan):
  640. spans.append(aspan)
  641. break
  642. return spans
  643. class SpanBefore(SpanBiQuery):
  644. """Matches documents where the spans of the first query occur before any
  645. spans of the second query.
  646. For example, to match documents where "apple" occurs anywhere before
  647. "bear"::
  648. from whoosh import query, spans
  649. t1 = query.Term("text", "apple")
  650. t2 = query.Term("text", "bear")
  651. q = spans.SpanBefore(t1, t2)
  652. """
  653. def __init__(self, a, b):
  654. """
  655. :param a: the query that must occur before the second.
  656. :param b: the query that must occur after the first.
  657. """
  658. self.a = a
  659. self.b = b
  660. self.q = And([a, b])
  661. class _Matcher(SpanBiMatcher):
  662. def __init__(self, a, b):
  663. self.a = a
  664. self.b = b
  665. im = binary.IntersectionMatcher(a, b)
  666. super(SpanBefore._Matcher, self).__init__(im)
  667. def _get_spans(self):
  668. bminstart = min(bspan.start for bspan in self.b.spans())
  669. return [aspan for aspan in self.a.spans() if aspan.end < bminstart]
  670. class SpanCondition(SpanBiQuery):
  671. """Matches documents that satisfy both subqueries, but only uses the spans
  672. from the first subquery.
  673. This is useful when you want to place conditions on matches but not have
  674. those conditions affect the spans returned.
  675. For example, to get spans for the term ``alfa`` in documents that also
  676. must contain the term ``bravo``::
  677. SpanCondition(Term("text", u"alfa"), Term("text", u"bravo"))
  678. """
  679. def __init__(self, a, b):
  680. self.a = a
  681. self.b = b
  682. self.q = And([a, b])
  683. class _Matcher(SpanBiMatcher):
  684. def __init__(self, a, b):
  685. self.a = a
  686. im = binary.IntersectionMatcher(a, b)
  687. super(SpanCondition._Matcher, self).__init__(im)
  688. def _get_spans(self):
  689. return self.a.spans()