nested.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. # Copyright 2012 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. from whoosh import matching
  28. from whoosh.compat import text_type, u, xrange
  29. from whoosh.query import qcore
  30. from whoosh.query.wrappers import WrappingQuery
  31. class NestedParent(WrappingQuery):
  32. """A query that allows you to search for "nested" documents, where you can
  33. index (possibly multiple levels of) "parent" and "child" documents using
  34. the :meth:`~whoosh.writing.IndexWriter.group` and/or
  35. :meth:`~whoosh.writing.IndexWriter.start_group` methods of a
  36. :class:`whoosh.writing.IndexWriter` to indicate that hierarchically related
  37. documents should be kept together::
  38. schema = fields.Schema(type=fields.ID, text=fields.TEXT(stored=True))
  39. with ix.writer() as w:
  40. # Say we're indexing chapters (type=chap) and each chapter has a
  41. # number of paragraphs (type=p)
  42. with w.group():
  43. w.add_document(type="chap", text="Chapter 1")
  44. w.add_document(type="p", text="Able baker")
  45. w.add_document(type="p", text="Bright morning")
  46. with w.group():
  47. w.add_document(type="chap", text="Chapter 2")
  48. w.add_document(type="p", text="Car trip")
  49. w.add_document(type="p", text="Dog eared")
  50. w.add_document(type="p", text="Every day")
  51. with w.group():
  52. w.add_document(type="chap", text="Chapter 3")
  53. w.add_document(type="p", text="Fine day")
  54. The ``NestedParent`` query wraps two sub-queries: the "parent query"
  55. matches a class of "parent documents". The "sub query" matches nested
  56. documents you want to find. For each "sub document" the "sub query" finds,
  57. this query acts as if it found the corresponding "parent document".
  58. >>> with ix.searcher() as s:
  59. ... r = s.search(query.Term("text", "day"))
  60. ... for hit in r:
  61. ... print(hit["text"])
  62. ...
  63. Chapter 2
  64. Chapter 3
  65. """
  66. def __init__(self, parents, subq, per_parent_limit=None, score_fn=sum):
  67. """
  68. :param parents: a query, DocIdSet object, or Results object
  69. representing the documents you want to use as the "parent"
  70. documents. Where the sub-query matches, the corresponding document
  71. in these results will be returned as the match.
  72. :param subq: a query matching the information you want to find.
  73. :param per_parent_limit: a maximum number of "sub documents" to search
  74. per parent. The default is None, meaning no limit.
  75. :param score_fn: a function to use to combine the scores of matching
  76. sub-documents to calculate the score returned for the parent
  77. document. The default is ``sum``, that is, add up the scores of the
  78. sub-documents.
  79. """
  80. self.parents = parents
  81. self.child = subq
  82. self.per_parent_limit = per_parent_limit
  83. self.score_fn = score_fn
  84. def normalize(self):
  85. p = self.parents
  86. if isinstance(p, qcore.Query):
  87. p = p.normalize()
  88. q = self.child.normalize()
  89. if p is qcore.NullQuery or q is qcore.NullQuery:
  90. return qcore.NullQuery
  91. return self.__class__(p, q)
  92. def requires(self):
  93. return self.child.requires()
  94. def matcher(self, searcher, context=None):
  95. bits = searcher._filter_to_comb(self.parents)
  96. if not bits:
  97. return matching.NullMatcher
  98. m = self.child.matcher(searcher, context)
  99. if not m.is_active():
  100. return matching.NullMatcher
  101. return self.NestedParentMatcher(bits, m, self.per_parent_limit,
  102. searcher.doc_count_all(),
  103. self.score_fn)
  104. def deletion_docs(self, searcher):
  105. bits = searcher._filter_to_comb(self.parents)
  106. if not bits:
  107. return
  108. m = self.child.matcher(searcher, searcher.boolean_context())
  109. maxdoc = searcher.doc_count_all()
  110. while m.is_active():
  111. docnum = m.id()
  112. parentdoc = bits.before(docnum + 1)
  113. nextparent = bits.after(docnum) or maxdoc
  114. for i in xrange(parentdoc, nextparent):
  115. yield i
  116. m.skip_to(nextparent)
  117. class NestedParentMatcher(matching.Matcher):
  118. def __init__(self, comb, child, per_parent_limit, maxdoc, score_fn):
  119. self.comb = comb
  120. self.child = child
  121. self.per_parent_limit = per_parent_limit
  122. self.maxdoc = maxdoc
  123. self.score_fn = score_fn
  124. self._nextdoc = None
  125. if self.child.is_active():
  126. self._gather()
  127. def is_active(self):
  128. return self._nextdoc is not None
  129. def supports_block_quality(self):
  130. return False
  131. def _gather(self):
  132. # This is where the magic happens ;)
  133. child = self.child
  134. pplimit = self.per_parent_limit
  135. # The next document returned by this matcher is the parent of the
  136. # child's current document. We don't have to worry about whether
  137. # the parent is deleted, because the query that gave us the parents
  138. # wouldn't return deleted documents.
  139. self._nextdoc = self.comb.before(child.id() + 1)
  140. # The next parent after the child matcher's current document
  141. nextparent = self.comb.after(child.id()) or self.maxdoc
  142. # Sum the scores of all matching documents under the parent
  143. count = 1
  144. scores = []
  145. while child.is_active() and child.id() < nextparent:
  146. if pplimit and count > pplimit:
  147. child.skip_to(nextparent)
  148. break
  149. scores.append(child.score())
  150. child.next()
  151. count += 1
  152. score = self.score_fn(scores) if scores else 0
  153. self._nextscore = score
  154. def id(self):
  155. return self._nextdoc
  156. def score(self):
  157. return self._nextscore
  158. def reset(self):
  159. self.child.reset()
  160. self._gather()
  161. def next(self):
  162. if self.child.is_active():
  163. self._gather()
  164. else:
  165. if self._nextdoc is None:
  166. raise matching.ReadTooFar
  167. else:
  168. self._nextdoc = None
  169. def skip_to(self, id):
  170. self.child.skip_to(id)
  171. self._gather()
  172. def value(self):
  173. raise NotImplementedError(self.__class__)
  174. def spans(self):
  175. return []
  176. class NestedChildren(WrappingQuery):
  177. """This is the reverse of a :class:`NestedParent` query: instead of taking
  178. a query that matches children but returns the parent, this query matches
  179. parents but returns the children.
  180. This is useful, for example, to search for an album title and return the
  181. songs in the album::
  182. schema = fields.Schema(type=fields.ID(stored=True),
  183. album_name=fields.TEXT(stored=True),
  184. track_num=fields.NUMERIC(stored=True),
  185. track_name=fields.TEXT(stored=True),
  186. lyrics=fields.TEXT)
  187. ix = RamStorage().create_index(schema)
  188. # Indexing
  189. with ix.writer() as w:
  190. # For each album, index a "group" of a parent "album" document and
  191. # multiple child "track" documents.
  192. with w.group():
  193. w.add_document(type="album",
  194. artist="The Cure", album_name="Disintegration")
  195. w.add_document(type="track", track_num=1,
  196. track_name="Plainsong")
  197. w.add_document(type="track", track_num=2,
  198. track_name="Pictures of You")
  199. # ...
  200. # ...
  201. # Find songs where the song name has "heaven" in the title and the
  202. # album the song is on has "hell" in the title
  203. qp = QueryParser("lyrics", ix.schema)
  204. with ix.searcher() as s:
  205. # A query that matches all parents
  206. all_albums = qp.parse("type:album")
  207. # A query that matches the parents we want
  208. albums_with_hell = qp.parse("album_name:hell")
  209. # A query that matches the desired albums but returns the tracks
  210. songs_on_hell_albums = NestedChildren(all_albums, albums_with_hell)
  211. # A query that matches tracks with heaven in the title
  212. songs_with_heaven = qp.parse("track_name:heaven")
  213. # A query that finds tracks with heaven in the title on albums
  214. # with hell in the title
  215. q = query.And([songs_on_hell_albums, songs_with_heaven])
  216. """
  217. def __init__(self, parents, subq, boost=1.0):
  218. self.parents = parents
  219. self.child = subq
  220. self.boost = boost
  221. def matcher(self, searcher, context=None):
  222. bits = searcher._filter_to_comb(self.parents)
  223. if not bits:
  224. return matching.NullMatcher
  225. m = self.child.matcher(searcher, context)
  226. if not m.is_active():
  227. return matching.NullMatcher
  228. return self.NestedChildMatcher(bits, m, searcher.doc_count_all(),
  229. searcher.reader().is_deleted,
  230. boost=self.boost)
  231. class NestedChildMatcher(matching.WrappingMatcher):
  232. def __init__(self, parent_comb, wanted_parent_matcher, limit,
  233. is_deleted, boost=1.0):
  234. self.parent_comb = parent_comb
  235. self.child = wanted_parent_matcher
  236. self.limit = limit
  237. self.is_deleted = is_deleted
  238. self.boost = boost
  239. self._nextchild = -1
  240. self._nextparent = -1
  241. self._find_next_children()
  242. def __repr__(self):
  243. return "%s(%r, %r)" % (self.__class__.__name__,
  244. self.parent_comb,
  245. self.child)
  246. def reset(self):
  247. self.child.reset()
  248. self._reset()
  249. def _reset(self):
  250. self._nextchild = -1
  251. self._nextparent = -1
  252. self._find_next_children()
  253. def is_active(self):
  254. return self._nextchild < self._nextparent
  255. def replace(self, minquality=0):
  256. return self
  257. def _find_next_children(self):
  258. # "comb" contains the doc IDs of all parent documents
  259. comb = self.parent_comb
  260. # "m" is the matcher for "wanted" parents
  261. m = self.child
  262. # Last doc ID + 1
  263. limit = self.limit
  264. # A function that returns True if a doc ID is deleted
  265. is_deleted = self.is_deleted
  266. nextchild = self._nextchild
  267. nextparent = self._nextparent
  268. while m.is_active():
  269. # Move the "child id" to the document after the current match
  270. nextchild = m.id() + 1
  271. # Move the parent matcher to the next match
  272. m.next()
  273. # Find the next parent document (matching or not) after this
  274. nextparent = comb.after(nextchild)
  275. if nextparent is None:
  276. nextparent = limit
  277. # Skip any deleted child documents
  278. while is_deleted(nextchild):
  279. nextchild += 1
  280. # If skipping deleted documents put us to or past the next
  281. # parent doc, go again
  282. if nextchild >= nextparent:
  283. continue
  284. else:
  285. # Otherwise, we're done
  286. break
  287. self._nextchild = nextchild
  288. self._nextparent = nextparent
  289. def id(self):
  290. return self._nextchild
  291. def all_ids(self):
  292. while self.is_active():
  293. yield self.id()
  294. self.next()
  295. def next(self):
  296. is_deleted = self.is_deleted
  297. limit = self.limit
  298. nextparent = self._nextparent
  299. # Go to the next document
  300. nextchild = self._nextchild
  301. nextchild += 1
  302. # Skip over any deleted child documents
  303. while nextchild < nextparent and is_deleted(nextchild):
  304. nextchild += 1
  305. self._nextchild = nextchild
  306. # If we're at or past the next parent doc, go to the next set of
  307. # children
  308. if nextchild >= limit:
  309. return
  310. elif nextchild >= nextparent:
  311. self._find_next_children()
  312. def skip_to(self, docid):
  313. comb = self.parent_comb
  314. wanted = self.child
  315. # self._nextchild is the "current" matching child ID
  316. if docid <= self._nextchild:
  317. return
  318. # self._nextparent is the next parent ID (matching or not)
  319. if docid < self._nextparent:
  320. # Just iterate
  321. while self.is_active() and self.id() < docid:
  322. self.next()
  323. elif wanted.is_active():
  324. # Find the parent before the target ID
  325. pid = comb.before(docid)
  326. # Skip the parent matcher to that ID
  327. wanted.skip_to(pid)
  328. # If that made the matcher inactive, then we're done
  329. if not wanted.is_active():
  330. self._nextchild = self._nextparent = self.limit
  331. else:
  332. # Reestablish for the next child after the next matching
  333. # parent
  334. self._find_next_children()
  335. else:
  336. self._nextchild = self._nextparent = self.limit
  337. def value(self):
  338. raise NotImplementedError(self.__class__)
  339. def score(self):
  340. return self.boost
  341. def spans(self):
  342. return []