ranges.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347
  1. # Copyright 2007 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. from __future__ import division
  28. from whoosh.compat import b, u
  29. from whoosh.query import qcore, terms, compound, wrappers
  30. from whoosh.util.times import datetime_to_long
  31. class RangeMixin(object):
  32. # Contains methods shared by TermRange and NumericRange
  33. def __repr__(self):
  34. return ('%s(%r, %r, %r, %s, %s, boost=%s, constantscore=%s)'
  35. % (self.__class__.__name__, self.fieldname, self.start,
  36. self.end, self.startexcl, self.endexcl, self.boost,
  37. self.constantscore))
  38. def __unicode__(self):
  39. startchar = "{" if self.startexcl else "["
  40. endchar = "}" if self.endexcl else "]"
  41. start = '' if self.start is None else self.start
  42. end = '' if self.end is None else self.end
  43. return u("%s:%s%s TO %s%s") % (self.fieldname, startchar, start, end,
  44. endchar)
  45. __str__ = __unicode__
  46. def __eq__(self, other):
  47. return (other and self.__class__ is other.__class__
  48. and self.fieldname == other.fieldname
  49. and self.start == other.start and self.end == other.end
  50. and self.startexcl == other.startexcl
  51. and self.endexcl == other.endexcl
  52. and self.boost == other.boost
  53. and self.constantscore == other.constantscore)
  54. def __hash__(self):
  55. return (hash(self.fieldname) ^ hash(self.start) ^ hash(self.startexcl)
  56. ^ hash(self.end) ^ hash(self.endexcl) ^ hash(self.boost))
  57. def is_range(self):
  58. return True
  59. def _comparable_start(self):
  60. if self.start is None:
  61. return (qcore.Lowest, 0)
  62. else:
  63. second = 1 if self.startexcl else 0
  64. return (self.start, second)
  65. def _comparable_end(self):
  66. if self.end is None:
  67. return (qcore.Highest, 0)
  68. else:
  69. second = -1 if self.endexcl else 0
  70. return (self.end, second)
  71. def overlaps(self, other):
  72. if not isinstance(other, TermRange):
  73. return False
  74. if self.fieldname != other.fieldname:
  75. return False
  76. start1 = self._comparable_start()
  77. start2 = other._comparable_start()
  78. end1 = self._comparable_end()
  79. end2 = other._comparable_end()
  80. return ((start1 >= start2 and start1 <= end2)
  81. or (end1 >= start2 and end1 <= end2)
  82. or (start2 >= start1 and start2 <= end1)
  83. or (end2 >= start1 and end2 <= end1))
  84. def merge(self, other, intersect=True):
  85. assert self.fieldname == other.fieldname
  86. start1 = self._comparable_start()
  87. start2 = other._comparable_start()
  88. end1 = self._comparable_end()
  89. end2 = other._comparable_end()
  90. if start1 >= start2 and end1 <= end2:
  91. start = start2
  92. end = end2
  93. elif start2 >= start1 and end2 <= end1:
  94. start = start1
  95. end = end1
  96. elif intersect:
  97. start = max(start1, start2)
  98. end = min(end1, end2)
  99. else:
  100. start = min(start1, start2)
  101. end = max(end1, end2)
  102. startval = None if start[0] is qcore.Lowest else start[0]
  103. startexcl = start[1] == 1
  104. endval = None if end[0] is qcore.Highest else end[0]
  105. endexcl = end[1] == -1
  106. boost = max(self.boost, other.boost)
  107. constantscore = self.constantscore or other.constantscore
  108. return self.__class__(self.fieldname, startval, endval, startexcl,
  109. endexcl, boost=boost,
  110. constantscore=constantscore)
  111. class TermRange(RangeMixin, terms.MultiTerm):
  112. """Matches documents containing any terms in a given range.
  113. >>> # Match documents where the indexed "id" field is greater than or equal
  114. >>> # to 'apple' and less than or equal to 'pear'.
  115. >>> TermRange("id", u"apple", u"pear")
  116. """
  117. def __init__(self, fieldname, start, end, startexcl=False, endexcl=False,
  118. boost=1.0, constantscore=True):
  119. """
  120. :param fieldname: The name of the field to search.
  121. :param start: Match terms equal to or greater than this.
  122. :param end: Match terms equal to or less than this.
  123. :param startexcl: If True, the range start is exclusive. If False, the
  124. range start is inclusive.
  125. :param endexcl: If True, the range end is exclusive. If False, the
  126. range end is inclusive.
  127. :param boost: Boost factor that should be applied to the raw score of
  128. results matched by this query.
  129. """
  130. self.fieldname = fieldname
  131. self.start = start
  132. self.end = end
  133. self.startexcl = startexcl
  134. self.endexcl = endexcl
  135. self.boost = boost
  136. self.constantscore = constantscore
  137. def normalize(self):
  138. if self.start in ('', None) and self.end in (u('\uffff'), None):
  139. from whoosh.query import Every
  140. return Every(self.fieldname, boost=self.boost)
  141. elif self.start == self.end:
  142. if self.startexcl or self.endexcl:
  143. return qcore.NullQuery
  144. return terms.Term(self.fieldname, self.start, boost=self.boost)
  145. else:
  146. return TermRange(self.fieldname, self.start, self.end,
  147. self.startexcl, self.endexcl,
  148. boost=self.boost)
  149. #def replace(self, fieldname, oldtext, newtext):
  150. # q = self.copy()
  151. # if q.fieldname == fieldname:
  152. # if q.start == oldtext:
  153. # q.start = newtext
  154. # if q.end == oldtext:
  155. # q.end = newtext
  156. # return q
  157. def _btexts(self, ixreader):
  158. fieldname = self.fieldname
  159. field = ixreader.schema[fieldname]
  160. startexcl = self.startexcl
  161. endexcl = self.endexcl
  162. if self.start is None:
  163. start = b("")
  164. else:
  165. try:
  166. start = field.to_bytes(self.start)
  167. except ValueError:
  168. return
  169. if self.end is None:
  170. end = b("\xFF\xFF\xFF\xFF")
  171. else:
  172. try:
  173. end = field.to_bytes(self.end)
  174. except ValueError:
  175. return
  176. for fname, t in ixreader.terms_from(fieldname, start):
  177. if fname != fieldname:
  178. break
  179. if t == start and startexcl:
  180. continue
  181. if t == end and endexcl:
  182. break
  183. if t > end:
  184. break
  185. yield t
  186. class NumericRange(RangeMixin, qcore.Query):
  187. """A range query for NUMERIC fields. Takes advantage of tiered indexing
  188. to speed up large ranges by matching at a high resolution at the edges of
  189. the range and a low resolution in the middle.
  190. >>> # Match numbers from 10 to 5925 in the "number" field.
  191. >>> nr = NumericRange("number", 10, 5925)
  192. """
  193. def __init__(self, fieldname, start, end, startexcl=False, endexcl=False,
  194. boost=1.0, constantscore=True):
  195. """
  196. :param fieldname: The name of the field to search.
  197. :param start: Match terms equal to or greater than this number. This
  198. should be a number type, not a string.
  199. :param end: Match terms equal to or less than this number. This should
  200. be a number type, not a string.
  201. :param startexcl: If True, the range start is exclusive. If False, the
  202. range start is inclusive.
  203. :param endexcl: If True, the range end is exclusive. If False, the
  204. range end is inclusive.
  205. :param boost: Boost factor that should be applied to the raw score of
  206. results matched by this query.
  207. :param constantscore: If True, the compiled query returns a constant
  208. score (the value of the ``boost`` keyword argument) instead of
  209. actually scoring the matched terms. This gives a nice speed boost
  210. and won't affect the results in most cases since numeric ranges
  211. will almost always be used as a filter.
  212. """
  213. self.fieldname = fieldname
  214. self.start = start
  215. self.end = end
  216. self.startexcl = startexcl
  217. self.endexcl = endexcl
  218. self.boost = boost
  219. self.constantscore = constantscore
  220. def simplify(self, ixreader):
  221. return self._compile_query(ixreader).simplify(ixreader)
  222. def estimate_size(self, ixreader):
  223. return self._compile_query(ixreader).estimate_size(ixreader)
  224. def estimate_min_size(self, ixreader):
  225. return self._compile_query(ixreader).estimate_min_size(ixreader)
  226. def docs(self, searcher):
  227. q = self._compile_query(searcher.reader())
  228. return q.docs(searcher)
  229. def _compile_query(self, ixreader):
  230. from whoosh.fields import NUMERIC
  231. from whoosh.util.numeric import tiered_ranges
  232. field = ixreader.schema[self.fieldname]
  233. if not isinstance(field, NUMERIC):
  234. raise Exception("NumericRange: field %r is not numeric"
  235. % self.fieldname)
  236. start = self.start
  237. if start is not None:
  238. start = field.prepare_number(start)
  239. end = self.end
  240. if end is not None:
  241. end = field.prepare_number(end)
  242. subqueries = []
  243. stb = field.sortable_to_bytes
  244. # Get the term ranges for the different resolutions
  245. ranges = tiered_ranges(field.numtype, field.bits, field.signed,
  246. start, end, field.shift_step,
  247. self.startexcl, self.endexcl)
  248. for startnum, endnum, shift in ranges:
  249. if startnum == endnum:
  250. subq = terms.Term(self.fieldname, stb(startnum, shift))
  251. else:
  252. startbytes = stb(startnum, shift)
  253. endbytes = stb(endnum, shift)
  254. subq = TermRange(self.fieldname, startbytes, endbytes)
  255. subqueries.append(subq)
  256. if len(subqueries) == 1:
  257. q = subqueries[0]
  258. elif subqueries:
  259. q = compound.Or(subqueries, boost=self.boost)
  260. else:
  261. return qcore.NullQuery
  262. if self.constantscore:
  263. q = wrappers.ConstantScoreQuery(q, self.boost)
  264. return q
  265. def matcher(self, searcher, context=None):
  266. q = self._compile_query(searcher.reader())
  267. return q.matcher(searcher, context)
  268. class DateRange(NumericRange):
  269. """This is a very thin subclass of :class:`NumericRange` that only
  270. overrides the initializer and ``__repr__()`` methods to work with datetime
  271. objects instead of numbers. Internally this object converts the datetime
  272. objects it's created with to numbers and otherwise acts like a
  273. ``NumericRange`` query.
  274. >>> DateRange("date", datetime(2010, 11, 3, 3, 0),
  275. ... datetime(2010, 11, 3, 17, 59))
  276. """
  277. def __init__(self, fieldname, start, end, startexcl=False, endexcl=False,
  278. boost=1.0, constantscore=True):
  279. self.startdate = start
  280. self.enddate = end
  281. if start:
  282. start = datetime_to_long(start)
  283. if end:
  284. end = datetime_to_long(end)
  285. super(DateRange, self).__init__(fieldname, start, end,
  286. startexcl=startexcl, endexcl=endexcl,
  287. boost=boost,
  288. constantscore=constantscore)
  289. def __repr__(self):
  290. return '%s(%r, %r, %r, %s, %s, boost=%s)' % (self.__class__.__name__,
  291. self.fieldname,
  292. self.startdate, self.enddate,
  293. self.startexcl, self.endexcl,
  294. self.boost)