- # Copyright 2012 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- from whoosh import matching
- from whoosh.compat import text_type, u, xrange
- from whoosh.query import qcore
- from whoosh.query.wrappers import WrappingQuery
class NestedParent(WrappingQuery):
    """A query that allows you to search for "nested" documents, where you can
    index (possibly multiple levels of) "parent" and "child" documents using
    the :meth:`~whoosh.writing.IndexWriter.group` and/or
    :meth:`~whoosh.writing.IndexWriter.start_group` methods of a
    :class:`whoosh.writing.IndexWriter` to indicate that hierarchically related
    documents should be kept together::

        schema = fields.Schema(type=fields.ID, text=fields.TEXT(stored=True))

        with ix.writer() as w:
            # Say we're indexing chapters (type=chap) and each chapter has a
            # number of paragraphs (type=p)
            with w.group():
                w.add_document(type="chap", text="Chapter 1")
                w.add_document(type="p", text="Able baker")
                w.add_document(type="p", text="Bright morning")
            with w.group():
                w.add_document(type="chap", text="Chapter 2")
                w.add_document(type="p", text="Car trip")
                w.add_document(type="p", text="Dog eared")
                w.add_document(type="p", text="Every day")
            with w.group():
                w.add_document(type="chap", text="Chapter 3")
                w.add_document(type="p", text="Fine day")

    The ``NestedParent`` query wraps two sub-queries: the "parent query"
    matches a class of "parent documents". The "sub query" matches nested
    documents you want to find. For each "sub document" the "sub query" finds,
    this query acts as if it found the corresponding "parent document".

    >>> with ix.searcher() as s:
    ...   r = s.search(query.Term("text", "day"))
    ...   for hit in r:
    ...     print(hit["text"])
    ...
    Chapter 2
    Chapter 3
    """

    def __init__(self, parents, subq, per_parent_limit=None, score_fn=sum):
        """
        :param parents: a query, DocIdSet object, or Results object
            representing the documents you want to use as the "parent"
            documents. Where the sub-query matches, the corresponding document
            in these results will be returned as the match.
        :param subq: a query matching the information you want to find.
        :param per_parent_limit: a maximum number of "sub documents" to search
            per parent. The default is None, meaning no limit.
        :param score_fn: a function to use to combine the scores of matching
            sub-documents to calculate the score returned for the parent
            document. The default is ``sum``, that is, add up the scores of the
            sub-documents.
        """
        self.parents = parents
        self.child = subq
        self.per_parent_limit = per_parent_limit
        self.score_fn = score_fn

    def normalize(self):
        p = self.parents
        if isinstance(p, qcore.Query):
            p = p.normalize()
        q = self.child.normalize()

        if p is qcore.NullQuery or q is qcore.NullQuery:
            return qcore.NullQuery

        # FIX: propagate per_parent_limit and score_fn to the normalized
        # copy -- previously they were silently dropped, resetting the
        # per-parent limit and the scoring function to their defaults
        # whenever the query was normalized.
        return self.__class__(p, q, per_parent_limit=self.per_parent_limit,
                              score_fn=self.score_fn)

    def requires(self):
        return self.child.requires()

    def matcher(self, searcher, context=None):
        bits = searcher._filter_to_comb(self.parents)
        if not bits:
            return matching.NullMatcher
        m = self.child.matcher(searcher, context)
        if not m.is_active():
            return matching.NullMatcher

        return self.NestedParentMatcher(bits, m, self.per_parent_limit,
                                        searcher.doc_count_all(),
                                        self.score_fn)

    def deletion_docs(self, searcher):
        # Yield the doc IDs of every document in each parent group that
        # contains at least one child matching the sub-query, so a
        # delete-by-query removes whole groups together.
        bits = searcher._filter_to_comb(self.parents)
        if not bits:
            return

        m = self.child.matcher(searcher, searcher.boolean_context())
        maxdoc = searcher.doc_count_all()
        while m.is_active():
            docnum = m.id()
            # The group spans from the parent at/before this match to the
            # next parent (or the end of the index)
            parentdoc = bits.before(docnum + 1)
            nextparent = bits.after(docnum) or maxdoc
            for i in xrange(parentdoc, nextparent):
                yield i
            m.skip_to(nextparent)

    class NestedParentMatcher(matching.Matcher):
        def __init__(self, comb, child, per_parent_limit, maxdoc, score_fn):
            self.comb = comb
            self.child = child
            self.per_parent_limit = per_parent_limit
            self.maxdoc = maxdoc
            self.score_fn = score_fn

            self._nextdoc = None
            # FIX: initialize the score so score() cannot raise
            # AttributeError if the child matcher starts out inactive.
            self._nextscore = None
            if self.child.is_active():
                self._gather()

        def is_active(self):
            return self._nextdoc is not None

        def supports_block_quality(self):
            return False

        def _gather(self):
            # This is where the magic happens ;)
            child = self.child
            pplimit = self.per_parent_limit

            # The next document returned by this matcher is the parent of the
            # child's current document. We don't have to worry about whether
            # the parent is deleted, because the query that gave us the parents
            # wouldn't return deleted documents.
            self._nextdoc = self.comb.before(child.id() + 1)
            # The next parent after the child matcher's current document
            nextparent = self.comb.after(child.id()) or self.maxdoc

            # Sum the scores of all matching documents under the parent
            count = 1
            scores = []
            while child.is_active() and child.id() < nextparent:
                if pplimit and count > pplimit:
                    child.skip_to(nextparent)
                    break

                scores.append(child.score())
                child.next()
                count += 1

            score = self.score_fn(scores) if scores else 0
            self._nextscore = score

        def id(self):
            return self._nextdoc

        def score(self):
            return self._nextscore

        def reset(self):
            self.child.reset()
            # FIX: _gather() reads child.id(), which is invalid on an
            # inactive matcher; only gather when the reset child has matches.
            if self.child.is_active():
                self._gather()
            else:
                self._nextdoc = None

        def next(self):
            if self.child.is_active():
                self._gather()
            else:
                if self._nextdoc is None:
                    raise matching.ReadTooFar
                else:
                    self._nextdoc = None

        def skip_to(self, id):
            self.child.skip_to(id)
            # FIX: skipping may exhaust the child matcher; calling _gather()
            # then would fail on child.id(). Mark this matcher inactive
            # instead of crashing.
            if self.child.is_active():
                self._gather()
            else:
                self._nextdoc = None

        def value(self):
            raise NotImplementedError(self.__class__)

        def spans(self):
            return []
class NestedChildren(WrappingQuery):
    """This is the reverse of a :class:`NestedParent` query: instead of taking
    a query that matches children but returns the parent, this query matches
    parents but returns the children.

    This is useful, for example, to search for an album title and return the
    songs in the album::

        schema = fields.Schema(type=fields.ID(stored=True),
                               album_name=fields.TEXT(stored=True),
                               track_num=fields.NUMERIC(stored=True),
                               track_name=fields.TEXT(stored=True),
                               lyrics=fields.TEXT)
        ix = RamStorage().create_index(schema)

        # Indexing
        with ix.writer() as w:
            # For each album, index a "group" of a parent "album" document and
            # multiple child "track" documents.
            with w.group():
                w.add_document(type="album",
                               artist="The Cure", album_name="Disintegration")
                w.add_document(type="track", track_num=1,
                               track_name="Plainsong")
                w.add_document(type="track", track_num=2,
                               track_name="Pictures of You")
                # ...
            # ...

        # Find songs where the song name has "heaven" in the title and the
        # album the song is on has "hell" in the title
        qp = QueryParser("lyrics", ix.schema)
        with ix.searcher() as s:
            # A query that matches all parents
            all_albums = qp.parse("type:album")
            # A query that matches the parents we want
            albums_with_hell = qp.parse("album_name:hell")
            # A query that matches the desired albums but returns the tracks
            songs_on_hell_albums = NestedChildren(all_albums, albums_with_hell)
            # A query that matches tracks with heaven in the title
            songs_with_heaven = qp.parse("track_name:heaven")
            # A query that finds tracks with heaven in the title on albums
            # with hell in the title
            q = query.And([songs_on_hell_albums, songs_with_heaven])
    """

    def __init__(self, parents, subq, boost=1.0):
        self.parents = parents
        self.child = subq
        self.boost = boost

    def matcher(self, searcher, context=None):
        parent_set = searcher._filter_to_comb(self.parents)
        if not parent_set:
            return matching.NullMatcher

        wanted = self.child.matcher(searcher, context)
        if not wanted.is_active():
            return matching.NullMatcher

        return self.NestedChildMatcher(parent_set, wanted,
                                       searcher.doc_count_all(),
                                       searcher.reader().is_deleted,
                                       boost=self.boost)

    class NestedChildMatcher(matching.WrappingMatcher):
        # Walks the children of each parent matched by the wrapped
        # "wanted parent" matcher, skipping deleted documents.
        def __init__(self, parent_comb, wanted_parent_matcher, limit,
                     is_deleted, boost=1.0):
            self.parent_comb = parent_comb
            self.child = wanted_parent_matcher
            self.limit = limit
            self.is_deleted = is_deleted
            self.boost = boost
            self._nextchild = -1
            self._nextparent = -1
            self._find_next_children()

        def __repr__(self):
            return "%s(%r, %r)" % (self.__class__.__name__,
                                   self.parent_comb,
                                   self.child)

        def reset(self):
            self.child.reset()
            self._reset()

        def _reset(self):
            self._nextchild = -1
            self._nextparent = -1
            self._find_next_children()

        def is_active(self):
            # Active while the current child ID is inside the current group
            return self._nextchild < self._nextparent

        def replace(self, minquality=0):
            return self

        def _find_next_children(self):
            # Position this matcher on the first undeleted child of the next
            # matching parent, updating self._nextchild/_nextparent.
            parent_ids = self.parent_comb   # doc IDs of ALL parent documents
            wanted = self.child             # matcher over the "wanted" parents
            end = self.limit                # last doc ID + 1
            deleted = self.is_deleted       # predicate: is this doc ID deleted?

            child_id = self._nextchild
            group_end = self._nextparent
            while wanted.is_active():
                # The first candidate child is the document right after the
                # currently matched parent
                child_id = wanted.id() + 1
                # Advance the parent matcher to its next match
                wanted.next()
                # The group runs up to the next parent document (matching or
                # not), or to the end of the index
                group_end = parent_ids.after(child_id)
                if group_end is None:
                    group_end = end
                # Step over deleted children at the front of the group
                while deleted(child_id):
                    child_id += 1
                # If skipping deletions consumed the whole group, try the
                # next matching parent; otherwise we're positioned
                if child_id < group_end:
                    break
            self._nextchild = child_id
            self._nextparent = group_end

        def id(self):
            return self._nextchild

        def all_ids(self):
            while self.is_active():
                yield self.id()
                self.next()

        def next(self):
            deleted = self.is_deleted
            group_end = self._nextparent

            # Step to the following document, passing over any deleted ones
            # within the current group
            pos = self._nextchild + 1
            while pos < group_end and deleted(pos):
                pos += 1
            self._nextchild = pos

            # Past the end of the index: leave the matcher exhausted
            if pos >= self.limit:
                return
            # Ran off the end of this group: move on to the next one
            if pos >= group_end:
                self._find_next_children()

        def skip_to(self, docid):
            # self._nextchild is the "current" matching child ID; targets at
            # or before it are a no-op
            if docid <= self._nextchild:
                return

            if docid < self._nextparent:
                # The target is inside the current group: just step forward
                while self.is_active() and self.id() < docid:
                    self.next()
            elif self.child.is_active():
                # Jump the parent matcher to the parent preceding the target,
                # then re-derive the child position from there
                self.child.skip_to(self.parent_comb.before(docid))
                if self.child.is_active():
                    self._find_next_children()
                else:
                    # No more matching parents: exhaust this matcher
                    self._nextchild = self._nextparent = self.limit
            else:
                self._nextchild = self._nextparent = self.limit

        def value(self):
            raise NotImplementedError(self.__class__)

        def score(self):
            # Children carry no score of their own; return the query boost
            return self.boost

        def spans(self):
            return []
|