# positional.py
# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from __future__ import division

import copy

from whoosh import matching
from whoosh.analysis import Token
from whoosh.compat import u
from whoosh.query import qcore, terms, compound
  33. class Sequence(compound.CompoundQuery):
  34. """Matches documents containing a list of sub-queries in adjacent
  35. positions.
  36. This object has no sanity check to prevent you from using queries in
  37. different fields.
  38. """
  39. JOINT = " NEAR "
  40. intersect_merge = True
  41. def __init__(self, subqueries, slop=1, ordered=True, boost=1.0):
  42. """
  43. :param subqueries: a list of :class:`whoosh.query.Query` objects to
  44. match in sequence.
  45. :param slop: the maximum difference in position allowed between the
  46. subqueries.
  47. :param ordered: if True, the position differences between subqueries
  48. must be positive (that is, each subquery in the list must appear
  49. after the previous subquery in the document).
  50. :param boost: a boost factor to add to the score of documents matching
  51. this query.
  52. """
  53. compound.CompoundQuery.__init__(self, subqueries, boost=boost)
  54. self.slop = slop
  55. self.ordered = ordered
  56. def __eq__(self, other):
  57. return (other and type(self) is type(other)
  58. and self.subqueries == other.subqueries
  59. and self.boost == other.boost)
  60. def __repr__(self):
  61. return "%s(%r, slop=%d, boost=%f)" % (self.__class__.__name__,
  62. self.subqueries, self.slop,
  63. self.boost)
  64. def __hash__(self):
  65. h = hash(self.slop) ^ hash(self.boost)
  66. for q in self.subqueries:
  67. h ^= hash(q)
  68. return h
  69. def normalize(self):
  70. # Because the subqueries are in sequence, we can't do the fancy merging
  71. # that CompoundQuery does
  72. return self.__class__([q.normalize() for q in self.subqueries],
  73. self.slop, self.ordered, self.boost)
  74. def _and_query(self):
  75. return compound.And(self.subqueries)
  76. def estimate_size(self, ixreader):
  77. return self._and_query().estimate_size(ixreader)
  78. def estimate_min_size(self, ixreader):
  79. return self._and_query().estimate_min_size(ixreader)
  80. def _matcher(self, subs, searcher, context):
  81. from whoosh.query.spans import SpanNear
  82. # Tell the sub-queries this matcher will need the current match to get
  83. # spans
  84. context = context.set(needs_current=True)
  85. m = self._tree_matcher(subs, SpanNear.SpanNearMatcher, searcher,
  86. context, None, slop=self.slop,
  87. ordered=self.ordered)
  88. return m
  89. class Ordered(Sequence):
  90. """Matches documents containing a list of sub-queries in the given order.
  91. """
  92. JOINT = " BEFORE "
  93. def _matcher(self, subs, searcher, context):
  94. from whoosh.query.spans import SpanBefore
  95. return self._tree_matcher(subs, SpanBefore._Matcher, searcher,
  96. context, None)
  97. class Phrase(qcore.Query):
  98. """Matches documents containing a given phrase."""
  99. def __init__(self, fieldname, words, slop=1, boost=1.0, char_ranges=None):
  100. """
  101. :param fieldname: the field to search.
  102. :param words: a list of words (unicode strings) in the phrase.
  103. :param slop: the number of words allowed between each "word" in the
  104. phrase; the default of 1 means the phrase must match exactly.
  105. :param boost: a boost factor that to apply to the raw score of
  106. documents matched by this query.
  107. :param char_ranges: if a Phrase object is created by the query parser,
  108. it will set this attribute to a list of (startchar, endchar) pairs
  109. corresponding to the words in the phrase
  110. """
  111. self.fieldname = fieldname
  112. self.words = words
  113. self.slop = slop
  114. self.boost = boost
  115. self.char_ranges = char_ranges
  116. def __eq__(self, other):
  117. return (other and self.__class__ is other.__class__
  118. and self.fieldname == other.fieldname
  119. and self.words == other.words
  120. and self.slop == other.slop
  121. and self.boost == other.boost)
  122. def __repr__(self):
  123. return "%s(%r, %r, slop=%s, boost=%f)" % (self.__class__.__name__,
  124. self.fieldname, self.words,
  125. self.slop, self.boost)
  126. def __unicode__(self):
  127. return u('%s:"%s"') % (self.fieldname, u(" ").join(self.words))
  128. __str__ = __unicode__
  129. def __hash__(self):
  130. h = hash(self.fieldname) ^ hash(self.slop) ^ hash(self.boost)
  131. for w in self.words:
  132. h ^= hash(w)
  133. return h
  134. def has_terms(self):
  135. return True
  136. def terms(self, phrases=False):
  137. if phrases and self.field():
  138. for word in self.words:
  139. yield (self.field(), word)
  140. def tokens(self, boost=1.0):
  141. char_ranges = self.char_ranges
  142. startchar = endchar = None
  143. for i, word in enumerate(self.words):
  144. if char_ranges:
  145. startchar, endchar = char_ranges[i]
  146. yield Token(fieldname=self.fieldname, text=word,
  147. boost=boost * self.boost, startchar=startchar,
  148. endchar=endchar, chars=True)
  149. def normalize(self):
  150. if not self.words:
  151. return qcore.NullQuery
  152. if len(self.words) == 1:
  153. t = terms.Term(self.fieldname, self.words[0])
  154. if self.char_ranges:
  155. t.startchar, t.endchar = self.char_ranges[0]
  156. return t
  157. words = [w for w in self.words if w is not None]
  158. return self.__class__(self.fieldname, words, slop=self.slop,
  159. boost=self.boost, char_ranges=self.char_ranges)
  160. def replace(self, fieldname, oldtext, newtext):
  161. q = copy.copy(self)
  162. if q.fieldname == fieldname:
  163. for i, word in enumerate(q.words):
  164. if word == oldtext:
  165. q.words[i] = newtext
  166. return q
  167. def _and_query(self):
  168. return compound.And([terms.Term(self.fieldname, word)
  169. for word in self.words])
  170. def estimate_size(self, ixreader):
  171. return self._and_query().estimate_size(ixreader)
  172. def estimate_min_size(self, ixreader):
  173. return self._and_query().estimate_min_size(ixreader)
  174. def matcher(self, searcher, context=None):
  175. from whoosh.query import Term, SpanNear2
  176. fieldname = self.fieldname
  177. if fieldname not in searcher.schema:
  178. return matching.NullMatcher()
  179. field = searcher.schema[fieldname]
  180. if not field.format or not field.format.supports("positions"):
  181. raise qcore.QueryError("Phrase search: %r field has no positions"
  182. % self.fieldname)
  183. terms = []
  184. # Build a list of Term queries from the words in the phrase
  185. reader = searcher.reader()
  186. for word in self.words:
  187. try:
  188. word = field.to_bytes(word)
  189. except ValueError:
  190. return matching.NullMatcher()
  191. if (fieldname, word) not in reader:
  192. # Shortcut the query if one of the words doesn't exist.
  193. return matching.NullMatcher()
  194. terms.append(Term(fieldname, word))
  195. # Create the equivalent SpanNear2 query from the terms
  196. q = SpanNear2(terms, slop=self.slop, ordered=True, mindist=1)
  197. # Get the matcher
  198. m = q.matcher(searcher, context)
  199. if self.boost != 1.0:
  200. m = matching.WrappingMatcher(m, boost=self.boost)
  201. return m