12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881 |
- # Copyright 2010 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- """
- This module contains Query objects that deal with "spans".
- Span queries allow for positional constraints on matching documents. For
- example, the :class:`whoosh.spans.SpanNear` query matches documents where one
- term occurs near another. Because you can nest span queries, and wrap them
- around almost any non-span query, you can create very complex constraints.
- For example, to find documents containing "whoosh" at most 5 positions before
- "library" in the "text" field::
- from whoosh import query, spans
- t1 = query.Term("text", "whoosh")
- t2 = query.Term("text", "library")
- q = spans.SpanNear(t1, t2, slop=5)
- """
- from whoosh.matching import mcore, wrappers, binary
- from whoosh.query import Query, And, AndMaybe, Or, Term
- from whoosh.util import make_binary_tree
- # Span class
class Span(object):
    """A range of term positions matched inside a document.

    ``start`` and ``end`` are inclusive positions. ``startchar`` and
    ``endchar`` are optional character offsets, and ``boost`` is a weight
    stored alongside the span.
    """

    __slots__ = ("start", "end", "startchar", "endchar", "boost")

    def __init__(self, start, end=None, startchar=None, endchar=None,
                 boost=1.0):
        """
        :param start: the first position covered by the span.
        :param end: the last position covered by the span. Defaults to
            ``start``, which gives a single-position span.
        :param startchar: optional character offset of the start of the span.
        :param endchar: optional character offset of the end of the span.
        :param boost: a weight stored on the span. Default is ``1.0``.
        """

        if end is None:
            end = start
        assert start <= end
        self.start = start
        self.end = end
        self.startchar = startchar
        self.endchar = endchar
        self.boost = boost

    def __repr__(self):
        if self.startchar is not None or self.endchar is not None:
            # Use %s for the character offsets so a span that only has one
            # of them set can still be displayed (``%d`` fails on None).
            return "<%d-%d %s:%s>" % (self.start, self.end, self.startchar,
                                      self.endchar)
        else:
            return "<%d-%d>" % (self.start, self.end)

    def __eq__(self, span):
        if not isinstance(span, Span):
            return NotImplemented
        return (self.start == span.start
                and self.end == span.end
                and self.startchar == span.startchar
                and self.endchar == span.endchar)

    def __ne__(self, span):
        # Defined as the exact negation of __eq__ so the two can never
        # disagree about spans that differ only in their character offsets.
        result = self.__eq__(span)
        if result is NotImplemented:
            return result
        return not result

    def __lt__(self, span):
        # Ordering only considers the start position.
        return self.start < span.start

    def __gt__(self, span):
        return self.start > span.start

    def __hash__(self):
        # Equal spans always share (start, end), so this is consistent with
        # __eq__ even though the character offsets are not hashed.
        return hash((self.start, self.end))

    @classmethod
    def merge(cls, spans):
        """Merges overlapping and touching spans in the given list of spans.

        Note that this modifies the original list in place; the same list is
        also returned for convenience. The early exit in the scan relies on
        the list being ordered by start position.

        >>> spans = [Span(1,2), Span(3)]
        >>> Span.merge(spans)
        [<1-3>]
        >>> spans
        [<1-3>]

        :param spans: a list of :class:`Span` objects.
        """

        i = 0
        while i < len(spans) - 1:
            here = spans[i]
            j = i + 1
            while j < len(spans):
                there = spans[j]
                if there.start > here.end + 1:
                    # A gap of at least one position: nothing from here on
                    # can touch or overlap the current span.
                    break
                if here.touches(there) or here.overlaps(there):
                    here = here.to(there)
                    spans[i] = here
                    # Do not advance j: the next candidate slid into slot j.
                    del spans[j]
                else:
                    j += 1
            i += 1
        return spans

    def to(self, span):
        """Returns a new span covering both this span and the given span.

        Character offsets are combined when available on either span. The
        new span is created with the default boost.
        """

        if self.startchar is None:
            minchar = span.startchar
        elif span.startchar is None:
            minchar = self.startchar
        else:
            minchar = min(self.startchar, span.startchar)
        if self.endchar is None:
            maxchar = span.endchar
        elif span.endchar is None:
            maxchar = self.endchar
        else:
            maxchar = max(self.endchar, span.endchar)

        minpos = min(self.start, span.start)
        maxpos = max(self.end, span.end)
        return self.__class__(minpos, maxpos, minchar, maxchar)

    def overlaps(self, span):
        """Returns True if the two spans share at least one position."""

        # Two inclusive ranges intersect exactly when each one starts no
        # later than the other one ends.
        return self.start <= span.end and span.start <= self.end

    def surrounds(self, span):
        """Returns True if this span strictly contains the given span."""

        return self.start < span.start and self.end > span.end

    def is_within(self, span):
        """Returns True if this span lies inside the given span (the
        boundaries may coincide).
        """

        return self.start >= span.start and self.end <= span.end

    def is_before(self, span):
        """Returns True if this span ends before the given span starts."""

        return self.end < span.start

    def is_after(self, span):
        """Returns True if this span starts after the given span ends."""

        return self.start > span.end

    def touches(self, span):
        """Returns True if the spans are directly adjacent, with no gap and
        no overlap.
        """

        return self.start == span.end + 1 or self.end == span.start - 1

    def distance_to(self, span):
        """Returns the number of positions between the two spans, or 0 if
        they overlap. Adjacent spans are at distance 1.
        """

        if self.overlaps(span):
            return 0
        elif self.is_before(span):
            return span.start - self.end
        else:
            return self.start - span.end
def bisect_spans(spans, start):
    """Returns the index of the first span in ``spans`` whose start position
    is not less than ``start``, or ``len(spans)`` if there is none.

    This is a binary search, so ``spans`` must be ordered by start position.

    :param spans: a sequence of objects with a ``start`` attribute.
    :param start: the start position to search for.
    """

    low, high = 0, len(spans)
    while low < high:
        middle = (low + high) // 2
        if spans[middle].start >= start:
            # The answer is at ``middle`` or to its left.
            high = middle
        else:
            low = middle + 1
    return low
- # Base matchers
class SpanWrappingMatcher(wrappers.WrappingMatcher):
    """Abstract matcher that wraps a "regular" matcher and restricts it to
    documents that have matching spans.

    Document matching is delegated to the wrapped child matcher, but any
    document for which ``_get_spans()`` returns an empty list is skipped.

    Subclasses must implement ``_get_spans()``, returning the list of valid
    spans for the current document.
    """

    def __init__(self, child):
        super(SpanWrappingMatcher, self).__init__(child)
        # Spans found for the current document (None until the first search).
        self._spans = None
        if self.is_active():
            self._find_next()

    def copy(self):
        duplicate = self.__class__(self.child.copy())
        duplicate._spans = self._spans
        return duplicate

    def _replacement(self, newchild):
        return self.__class__(newchild)

    def _find_next(self):
        # Advance the child until the current document yields spans, then
        # cache those spans on the matcher.
        if not self.is_active():
            return None

        inner = self.child
        changed = False
        found = self._get_spans()
        while inner.is_active() and not found:
            changed = inner.next() or changed
            if not inner.is_active():
                # Ran off the end of the child; the cached spans are left
                # as they were.
                return True
            found = self._get_spans()

        self._spans = found
        return changed

    def spans(self):
        """Returns the spans cached for the current document."""

        return self._spans

    def next(self):
        self.child.next()
        self._find_next()

    def skip_to(self, id):
        self.child.skip_to(id)
        self._find_next()

    def all_ids(self):
        """Yields the ID of each remaining document that has spans,
        advancing this matcher as it goes.
        """

        while self.is_active():
            if self.spans():
                yield self.id()
            self.next()
class SpanBiMatcher(SpanWrappingMatcher):
    """Base class for span matchers built from two sub-matchers, which
    subclasses store in the ``a`` and ``b`` attributes.
    """

    def copy(self):
        a_copy = self.a.copy()
        b_copy = self.b.copy()
        return self.__class__(a_copy, b_copy)

    def depth(self):
        a_depth = self.a.depth()
        b_depth = self.b.depth()
        return max(a_depth, b_depth) + 1

    def replace(self, minquality=0):
        # TODO: fix this
        if self.is_active():
            return self
        return mcore.NullMatcher()
- # Queries
class SpanQuery(Query):
    """Abstract base class for span-based queries.

    Each span query type wraps a "regular" query that implements the basic
    document-matching functionality. For example, SpanNear wraps an And
    query, because SpanNear requires that the two sub-queries occur in the
    same documents. The wrapped query is stored in the ``q`` attribute.

    Subclasses usually only need to implement the initializer to set the
    wrapped query, and ``matcher()`` to return a span-aware matcher object.
    """

    def _subm(self, s, context=None):
        # Build the matcher for the wrapped query.
        wrapped = self.q
        return wrapped.matcher(s, context)

    def __repr__(self):
        classname = self.__class__.__name__
        return "%s(%r)" % (classname, self.q)

    def __eq__(self, other):
        if not other:
            # Mirrors ``other and ...``: a falsy operand is returned as-is.
            return other
        return self.__class__ is other.__class__ and self.q == other.q

    def __hash__(self):
        name_hash = hash(self.__class__.__name__)
        return name_hash ^ hash(self.q)

    def field(self):
        return None

    def needs_spans(self):
        return True
class WrappingSpan(SpanQuery):
    """Base class for span queries that wrap a single sub-query ``q`` and
    carry a ``limit`` attribute.
    """

    def is_leaf(self):
        return False

    def apply(self, fn):
        # Rebuild the same kind of query around the transformed sub-query.
        transformed = fn(self.q)
        return self.__class__(transformed, limit=self.limit)

    def field(self):
        wrapped = self.q
        return wrapped.field()
class SpanFirst(WrappingSpan):
    """Matches spans that end within the first N positions. This lets you,
    for example, only match terms near the beginning of the document.
    """

    def __init__(self, q, limit=0):
        """
        :param q: the query to match.
        :param limit: the query must match within this position at the start
            of a document. The default is ``0``, which means the query must
            match at the first position.
        """

        self.q = q
        self.limit = limit

    def __eq__(self, other):
        if not other:
            # Mirrors ``other and ...``: a falsy operand is returned as-is.
            return other
        if self.__class__ is not other.__class__:
            return False
        return self.q == other.q and self.limit == other.limit

    def __hash__(self):
        q_hash = hash(self.q)
        return q_hash ^ hash(self.limit)

    def matcher(self, searcher, context=None):
        inner = self._subm(searcher, context)
        return SpanFirst.SpanFirstMatcher(inner, limit=self.limit)

    class SpanFirstMatcher(SpanWrappingMatcher):
        def __init__(self, child, limit=0):
            # Set before calling the base initializer, which may immediately
            # call _get_spans().
            self.limit = limit
            super(SpanFirst.SpanFirstMatcher, self).__init__(child)

        def copy(self):
            return self.__class__(self.child.copy(), limit=self.limit)

        def _replacement(self, newchild):
            return self.__class__(newchild, limit=self.limit)

        def _get_spans(self):
            # Keep only the child's spans that end at or before the limit.
            limit = self.limit
            kept = []
            for span in self.child.spans():
                if span.end <= limit:
                    kept.append(span)
            return kept
class SpanNear(SpanQuery):
    """
    Note: for new code, use :class:`SpanNear2` instead of this class. SpanNear2
    takes a list of sub-queries instead of requiring you to create a binary
    tree of query objects.

    Matches queries that occur near each other. By default, only matches
    queries that occur right next to each other (slop=1) and in order
    (ordered=True).

    For example, to find documents where "whoosh" occurs next to "library"
    in the "text" field::

        from whoosh import query, spans
        t1 = query.Term("text", "whoosh")
        t2 = query.Term("text", "library")
        q = spans.SpanNear(t1, t2)

    To find documents where "whoosh" occurs at most 5 positions before
    "library"::

        q = spans.SpanNear(t1, t2, slop=5)

    To find documents where "whoosh" occurs at most 5 positions before or after
    "library"::

        q = spans.SpanNear(t1, t2, slop=5, ordered=False)

    You can use the ``phrase()`` class method to create a tree of SpanNear
    queries to match a list of terms::

        q = spans.SpanNear.phrase("text", ["whoosh", "search", "library"],
                                  slop=2)
    """

    def __init__(self, a, b, slop=1, ordered=True, mindist=1):
        """
        :param a: the first query to match.
        :param b: the second query that must occur within "slop" positions of
            the first query.
        :param slop: the number of positions within which the queries must
            occur. Default is 1, meaning the queries must occur right next
            to each other.
        :param ordered: whether a must occur before b. Default is True.
        :param mindist: the minimum distance allowed between the queries.
        """

        self.q = And([a, b])
        self.a = a
        self.b = b
        self.slop = slop
        self.ordered = ordered
        self.mindist = mindist

    def __repr__(self):
        template = "%s(%r, slop=%d, ordered=%s, mindist=%d)"
        values = (self.__class__.__name__, self.q, self.slop, self.ordered,
                  self.mindist)
        return template % values

    def __eq__(self, other):
        if not other:
            # Mirrors ``other and ...``: a falsy operand is returned as-is.
            return other
        return (self.__class__ == other.__class__
                and self.q == other.q and self.slop == other.slop
                and self.ordered == other.ordered
                and self.mindist == other.mindist)

    def __hash__(self):
        result = hash(self.a)
        for value in (self.b, self.slop, self.ordered, self.mindist):
            result ^= hash(value)
        return result

    def is_leaf(self):
        return False

    def apply(self, fn):
        new_a = fn(self.a)
        new_b = fn(self.b)
        return self.__class__(new_a, new_b, slop=self.slop,
                              ordered=self.ordered, mindist=self.mindist)

    def matcher(self, searcher, context=None):
        ma = self.a.matcher(searcher, context)
        mb = self.b.matcher(searcher, context)
        options = dict(slop=self.slop, ordered=self.ordered,
                       mindist=self.mindist)
        return SpanNear.SpanNearMatcher(ma, mb, **options)

    @classmethod
    def phrase(cls, fieldname, words, slop=1, ordered=True):
        """Returns a tree of SpanNear queries to match a list of terms.

        This class method is a convenience for constructing a phrase query
        using a binary tree of SpanNear queries::

            SpanNear.phrase("content", ["alfa", "bravo", "charlie", "delta"])

        :param fieldname: the name of the field to search in.
        :param words: a sequence of texts to search for.
        :param slop: the number of positions within which the terms must
            occur. Default is 1, meaning the terms must occur right next
            to each other.
        :param ordered: whether the terms must occur in order. Default is True.
        """

        terms = []
        for word in words:
            terms.append(Term(fieldname, word))
        return make_binary_tree(cls, terms, slop=slop, ordered=ordered)

    class SpanNearMatcher(SpanWrappingMatcher):
        def __init__(self, a, b, slop=1, ordered=True, mindist=1):
            # The attributes are set before the base initializer runs,
            # because it may immediately call _get_spans().
            self.a = a
            self.b = b
            self.slop = slop
            self.ordered = ordered
            self.mindist = mindist
            isect = binary.IntersectionMatcher(a, b)
            super(SpanNear.SpanNearMatcher, self).__init__(isect)

        def copy(self):
            return self.__class__(self.a.copy(), self.b.copy(),
                                  slop=self.slop, ordered=self.ordered,
                                  mindist=self.mindist)

        def replace(self, minquality=0):
            # TODO: fix this
            if self.is_active():
                return self
            return mcore.NullMatcher()

        def _get_spans(self):
            slop = self.slop
            mindist = self.mindist
            ordered = self.ordered
            combined = set()

            right_spans = self.b.spans()
            for left in self.a.spans():
                earliest = left.start - slop
                latest = left.end + slop
                for right in right_spans:
                    # Skip B spans that are too far in front of A, or in
                    # front of A *at all* when ordered is True
                    if right.end < earliest:
                        continue
                    if ordered and left.start > right.start:
                        continue

                    # B is too far from A. Since spans are listed in start
                    # position order, all later spans will be too far too.
                    if right.start > latest:
                        break

                    # Check the distance between the spans
                    gap = left.distance_to(right)
                    if mindist <= gap <= slop:
                        combined.add(left.to(right))

            return sorted(combined)
class SpanNear2(SpanQuery):
    """
    Matches queries that occur near each other. By default, only matches
    queries that occur right next to each other (slop=1) and in order
    (ordered=True).

    New code should use this query type instead of :class:`SpanNear`.

    (Unlike :class:`SpanNear`, this query takes a list of subqueries instead of
    requiring you to build a binary tree of query objects. This query should
    also be slightly faster due to less overhead.)

    For example, to find documents where "whoosh" occurs next to "library"
    in the "text" field::

        from whoosh import query, spans
        t1 = query.Term("text", "whoosh")
        t2 = query.Term("text", "library")
        q = spans.SpanNear2([t1, t2])

    To find documents where "whoosh" occurs at most 5 positions before
    "library"::

        q = spans.SpanNear2([t1, t2], slop=5)

    To find documents where "whoosh" occurs at most 5 positions before or after
    "library"::

        q = spans.SpanNear2([t1, t2], slop=5, ordered=False)
    """

    def __init__(self, qs, slop=1, ordered=True, mindist=1):
        """
        :param qs: a sequence of sub-queries to match.
        :param slop: the number of positions within which the queries must
            occur. Default is 1, meaning the queries must occur right next
            to each other.
        :param ordered: whether the sub-queries must occur in the order
            given. Default is True.
        :param mindist: the minimum distance allowed between the queries.
        """

        self.qs = qs
        self.slop = slop
        self.ordered = ordered
        self.mindist = mindist

    def __repr__(self):
        return ("%s(%r, slop=%d, ordered=%s, mindist=%d)"
                % (self.__class__.__name__, self.qs, self.slop, self.ordered,
                   self.mindist))

    def __eq__(self, other):
        return (other and self.__class__ == other.__class__
                and self.qs == other.qs and self.slop == other.slop
                and self.ordered == other.ordered
                and self.mindist == other.mindist)

    def __hash__(self):
        h = hash(self.slop) ^ hash(self.ordered) ^ hash(self.mindist)
        for q in self.qs:
            h ^= hash(q)
        return h

    def _and_query(self):
        # Plain conjunction of the sub-queries, used for size estimates.
        # ``And`` is the name imported at module level; there is no module
        # called ``q`` in scope here.
        return And(self.qs)

    def estimate_size(self, ixreader):
        return self._and_query().estimate_size(ixreader)

    def estimate_min_size(self, ixreader):
        return self._and_query().estimate_min_size(ixreader)

    def is_leaf(self):
        return False

    def children(self):
        return self.qs

    def apply(self, fn):
        return self.__class__([fn(q) for q in self.qs], slop=self.slop,
                              ordered=self.ordered, mindist=self.mindist)

    def matcher(self, searcher, context=None):
        ms = [q.matcher(searcher, context) for q in self.qs]
        return self.SpanNear2Matcher(ms, slop=self.slop, ordered=self.ordered,
                                     mindist=self.mindist)

    class SpanNear2Matcher(SpanWrappingMatcher):
        def __init__(self, ms, slop=1, ordered=True, mindist=1):
            # The attributes are set before the base initializer runs,
            # because it may immediately call _get_spans().
            self.ms = ms
            self.slop = slop
            self.ordered = ordered
            self.mindist = mindist
            isect = make_binary_tree(binary.IntersectionMatcher, ms)
            super(SpanNear2.SpanNear2Matcher, self).__init__(isect)

        def copy(self):
            return self.__class__([m.copy() for m in self.ms], slop=self.slop,
                                  ordered=self.ordered, mindist=self.mindist)

        def replace(self, minquality=0):
            # TODO: fix this
            if not self.is_active():
                return mcore.NullMatcher()
            return self

        def _get_spans(self):
            slop = self.slop
            mindist = self.mindist
            ordered = self.ordered
            ms = self.ms

            # Fold the sub-matchers left to right: ``aspans`` holds the spans
            # combined so far, and each pass joins them with the spans of
            # the next sub-matcher.
            aspans = ms[0].spans()
            i = 1
            while i < len(ms) and aspans:
                bspans = ms[i].spans()
                spans = set()
                for aspan in aspans:
                    # Use a binary search to find the first position we should
                    # start looking for possible matches
                    if ordered:
                        start = aspan.start
                    else:
                        start = max(0, aspan.start - slop)
                    j = bisect_spans(bspans, start)

                    while j < len(bspans):
                        bspan = bspans[j]
                        j += 1

                        if (bspan.end < aspan.start - slop
                            or (ordered and aspan.start > bspan.start)):
                            # B is too far in front of A, or B is in front of A
                            # *at all* when ordered is True
                            continue
                        if bspan.start > aspan.end + slop:
                            # B is too far from A. Since spans are listed in
                            # start position order, we know that all spans after
                            # this one will also be too far.
                            break

                        # Check the distance between the spans
                        dist = aspan.distance_to(bspan)
                        if mindist <= dist <= slop:
                            spans.add(aspan.to(bspan))
                aspans = sorted(spans)
                i += 1

            # The loop stops early when a pass produces no spans; in that
            # case not every sub-matcher was satisfied.
            if i == len(ms):
                return aspans
            else:
                return []
class SpanOr(SpanQuery):
    """Matches documents that match any of a list of sub-queries. Unlike
    query.Or, this class merges together matching spans from the different
    sub-queries when they overlap.
    """

    def __init__(self, subqs):
        """
        :param subqs: a list of queries to match.
        """

        self.q = Or(subqs)
        self.subqs = subqs

    def is_leaf(self):
        return False

    def apply(self, fn):
        return self.__class__([fn(sq) for sq in self.subqs])

    def matcher(self, searcher, context=None):
        matchers = [q.matcher(searcher, context) for q in self.subqs]
        return make_binary_tree(SpanOr.SpanOrMatcher, matchers)

    class SpanOrMatcher(SpanBiMatcher):
        def __init__(self, a, b):
            # The attributes are set before the base initializer runs,
            # because it may immediately call _get_spans().
            self.a = a
            self.b = b
            um = binary.UnionMatcher(a, b)
            super(SpanOr.SpanOrMatcher, self).__init__(um)

        def _get_spans(self):
            a_active = self.a.is_active()
            b_active = self.b.is_active()

            # Span.merge() edits its argument in place, so whenever the
            # spans come from a single sub-matcher they are copied first.
            # Otherwise the merge would rewrite the list that the
            # sub-matcher itself hands out from spans().
            if a_active and b_active:
                a_id = self.a.id()
                b_id = self.b.id()
                if a_id == b_id:
                    # Both sub-matchers are on this document: combine them.
                    spans = sorted(set(self.a.spans())
                                   | set(self.b.spans()))
                elif a_id < b_id:
                    spans = list(self.a.spans())
                else:
                    spans = list(self.b.spans())
            elif a_active:
                spans = list(self.a.spans())
            else:
                spans = list(self.b.spans())

            Span.merge(spans)
            return spans
class SpanBiQuery(SpanQuery):
    """Intermediate base class holding the methods common to the "a/b" span
    query types. Subclasses store their sub-queries in ``a`` and ``b`` and
    provide a nested ``_Matcher`` class.
    """

    def is_leaf(self):
        return False

    def apply(self, fn):
        new_a = fn(self.a)
        new_b = fn(self.b)
        return self.__class__(new_a, new_b)

    def matcher(self, searcher, context=None):
        first = self.a.matcher(searcher, context)
        second = self.b.matcher(searcher, context)
        return self._Matcher(first, second)
class SpanNot(SpanBiQuery):
    """Matches spans from the first query only if they don't overlap with
    spans from the second query. If there are no non-overlapping spans, the
    document does not match.

    For example, to match documents that contain "bear" at most 2 places after
    "apple" in the "text" field but don't have "cute" between them::

        from whoosh import query, spans
        t1 = query.Term("text", "apple")
        t2 = query.Term("text", "bear")
        near = spans.SpanNear(t1, t2, slop=2)
        q = spans.SpanNot(near, query.Term("text", "cute"))
    """

    def __init__(self, a, b):
        """
        :param a: the query to match.
        :param b: do not match any spans that overlap with spans from this
            query.
        """

        self.q = AndMaybe(a, b)
        self.a = a
        self.b = b

    class _Matcher(SpanBiMatcher):
        def __init__(self, a, b):
            # The attributes are set before the base initializer runs,
            # because it may immediately call _get_spans().
            self.a = a
            self.b = b
            amm = binary.AndMaybeMatcher(a, b)
            super(SpanNot._Matcher, self).__init__(amm)

        def _get_spans(self):
            aspans = self.a.spans()

            # ``b`` is the optional side of the AndMaybe matcher, so only ask
            # for its id while it is still active (SpanOr guards id() the
            # same way).
            # NOTE(review): presumably id() is not valid on an exhausted
            # matcher -- confirm against the matcher API.
            if not self.b.is_active() or self.a.id() != self.b.id():
                # ``b`` is not on this document, so nothing is excluded.
                return aspans

            bspans = self.b.spans()
            return [aspan for aspan in aspans
                    if not any(aspan.overlaps(bspan) for bspan in bspans)]
class SpanContains(SpanBiQuery):
    """Matches documents where the spans of the first query contain any spans
    of the second query.

    For example, to match documents where "apple" occurs at most 10 places
    before "bear" in the "text" field and "cute" is between them::

        from whoosh import query, spans
        t1 = query.Term("text", "apple")
        t2 = query.Term("text", "bear")
        near = spans.SpanNear(t1, t2, slop=10)
        q = spans.SpanContains(near, query.Term("text", "cute"))
    """

    def __init__(self, a, b):
        """
        :param a: the query to match.
        :param b: the query whose spans must occur within the matching spans
            of the first query.
        """

        self.q = And([a, b])
        self.a = a
        self.b = b

    class _Matcher(SpanBiMatcher):
        def __init__(self, a, b):
            # The attributes are set before the base initializer runs,
            # because it may immediately call _get_spans().
            self.a = a
            self.b = b
            isect = binary.IntersectionMatcher(a, b)
            super(SpanContains._Matcher, self).__init__(isect)

        def _get_spans(self):
            containing = []
            inner_spans = self.b.spans()
            for outer in self.a.spans():
                for inner in inner_spans:
                    if outer.start > inner.end:
                        # This inner span ends before the outer one begins.
                        continue
                    if outer.end < inner.start:
                        # This inner span begins after the outer one ends;
                        # stop scanning for this outer span.
                        break
                    if inner.is_within(outer):
                        # One contained span is enough to keep the outer one.
                        containing.append(outer)
                        break
            return containing
class SpanBefore(SpanBiQuery):
    """Matches documents where the spans of the first query occur before any
    spans of the second query.

    For example, to match documents where "apple" occurs anywhere before
    "bear"::

        from whoosh import query, spans
        t1 = query.Term("text", "apple")
        t2 = query.Term("text", "bear")
        q = spans.SpanBefore(t1, t2)
    """

    def __init__(self, a, b):
        """
        :param a: the query that must occur before the second.
        :param b: the query that must occur after the first.
        """

        self.a = a
        self.b = b
        self.q = And([a, b])

    class _Matcher(SpanBiMatcher):
        def __init__(self, a, b):
            # The attributes are set before the base initializer runs,
            # because it may immediately call _get_spans().
            self.a = a
            self.b = b
            im = binary.IntersectionMatcher(a, b)
            super(SpanBefore._Matcher, self).__init__(im)

        def _get_spans(self):
            bspans = self.b.spans()
            if not bspans:
                # With no B span there is nothing for A to be "before", and
                # min() would raise ValueError on an empty sequence.
                return []

            # Keep the A spans that end before the earliest B span starts.
            bminstart = min(bspan.start for bspan in bspans)
            return [aspan for aspan in self.a.spans()
                    if aspan.end < bminstart]
class SpanCondition(SpanBiQuery):
    """Matches documents that satisfy both subqueries, but only uses the spans
    from the first subquery.

    This is useful when you want to place conditions on matches but not have
    those conditions affect the spans returned.

    For example, to get spans for the term ``alfa`` in documents that also
    must contain the term ``bravo``::

        SpanCondition(Term("text", u"alfa"), Term("text", u"bravo"))
    """

    def __init__(self, a, b):
        """
        :param a: the query whose spans are returned.
        :param b: the query that must also match the document.
        """

        self.a = a
        self.b = b
        self.q = And([a, b])

    class _Matcher(SpanBiMatcher):
        def __init__(self, a, b):
            # Both attributes are needed: the inherited copy() and depth()
            # read self.a and self.b. They are set before the base
            # initializer runs, because it may immediately call _get_spans().
            self.a = a
            self.b = b
            im = binary.IntersectionMatcher(a, b)
            super(SpanCondition._Matcher, self).__init__(im)

        def _get_spans(self):
            # Only the first sub-matcher contributes spans.
            return self.a.spans()
|