numeric.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317
  1. # Copyright 2010 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. import math, struct
  28. from array import array
  29. from bisect import bisect_left
  30. from struct import pack, unpack
  31. from whoosh.compat import b, long_type
  32. from whoosh.system import pack_byte, unpack_byte, pack_ushort, unpack_ushort
  33. from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint
  34. from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong
  35. from whoosh.system import pack_float, unpack_float, pack_double, unpack_double
  36. NaN = struct.unpack("<d", b('\xff\xff\xff\xff\xff\xff\xff\xff'))[0]
  37. typecode_max = {"b": 127, "B": 255, "h": 2 ** 15 - 1, "H": 2 ** 16 - 1,
  38. "i": 2 ** 31 - 1, "I": 2 ** 32 - 1,
  39. "q": 2 ** 63 - 1, "Q": 2 ** 64 - 1}
  40. typecode_min = {"b": 0 - 128, "B": 0, "h": 0 - 2 ** 15, "H": 0,
  41. "i": 0 - 2 ** 31, "I": 0,
  42. "q": 0 - 2 ** 63, "Q": 0}
  43. typecode_pack = {"B": pack_byte, "H": pack_ushort, "i": pack_int,
  44. "I": pack_uint, "q": pack_long, "Q": pack_ulong,
  45. "f": pack_float, "d": pack_double}
  46. typecode_unpack = {"B": unpack_byte, "H": unpack_ushort, "i": unpack_int,
  47. "I": unpack_uint, "q": unpack_long, "Q": unpack_ulong,
  48. "f": unpack_float, "d": unpack_double}
  49. # Functions related to binary representations
  50. def bits_required(maxnum):
  51. """Returns the number of bits required to represent the given (unsigned)
  52. integer.
  53. """
  54. return max(1, math.ceil(math.log(maxnum, 2)))
  55. def typecode_required(maxnum):
  56. if maxnum < 256:
  57. return "B"
  58. elif maxnum < 2 ** 16:
  59. return "H"
  60. elif maxnum < 2 ** 31 - 1:
  61. return "i"
  62. elif maxnum < 2 ** 32:
  63. return "I"
  64. elif maxnum < 2 ** 63 - 1:
  65. return "q"
  66. else:
  67. return "Q"
  68. def max_value(bitcount):
  69. """Returns the maximum (unsigned) integer representable in the given number
  70. of bits.
  71. """
  72. return ~(~0 << bitcount)
  73. def bytes_for_bits(bitcount):
  74. r = int(math.ceil((bitcount + 1) / 8.0))
  75. return r
  76. # Functions for converting numbers to and from sortable representations
  77. _istruct = struct.Struct(">i")
  78. _qstruct = struct.Struct(">q")
  79. _dstruct = struct.Struct(">d")
  80. _ipack, _iunpack = _istruct.pack, _istruct.unpack
  81. _qpack, _qunpack = _qstruct.pack, _qstruct.unpack
  82. _dpack, _dunpack = _dstruct.pack, _dstruct.unpack
  83. def to_sortable(numtype, intsize, signed, x):
  84. if numtype is int or numtype is long_type:
  85. if signed:
  86. x += (1 << intsize - 1)
  87. return x
  88. else:
  89. return float_to_sortable_long(x, signed)
  90. def from_sortable(numtype, intsize, signed, x):
  91. if numtype is int or numtype is long_type:
  92. if signed:
  93. x -= (1 << intsize - 1)
  94. return x
  95. else:
  96. return sortable_long_to_float(x, signed)
  97. def float_to_sortable_long(x, signed):
  98. x = _qunpack(_dpack(x))[0]
  99. if x < 0:
  100. x ^= 0x7fffffffffffffff
  101. if signed:
  102. x += 1 << 63
  103. assert x >= 0
  104. return x
  105. def sortable_long_to_float(x, signed):
  106. if signed:
  107. x -= 1 << 63
  108. if x < 0:
  109. x ^= 0x7fffffffffffffff
  110. x = _dunpack(_qpack(x))[0]
  111. return x
  112. # Functions for generating tiered ranges
  113. def split_ranges(intsize, step, start, end):
  114. """Splits a range of numbers (from ``start`` to ``end``, inclusive)
  115. into a sequence of trie ranges of the form ``(start, end, shift)``. The
  116. consumer of these tuples is expected to shift the ``start`` and ``end``
  117. right by ``shift``.
  118. This is used for generating term ranges for a numeric field. The queries
  119. for the edges of the range are generated at high precision and large blocks
  120. in the middle are generated at low precision.
  121. """
  122. shift = 0
  123. while True:
  124. diff = 1 << (shift + step)
  125. mask = ((1 << step) - 1) << shift
  126. setbits = lambda x: x | ((1 << shift) - 1)
  127. haslower = (start & mask) != 0
  128. hasupper = (end & mask) != mask
  129. not_mask = ~mask & ((1 << intsize + 1) - 1)
  130. nextstart = (start + diff if haslower else start) & not_mask
  131. nextend = (end - diff if hasupper else end) & not_mask
  132. if shift + step >= intsize or nextstart > nextend:
  133. yield (start, setbits(end), shift)
  134. break
  135. if haslower:
  136. yield (start, setbits(start | mask), shift)
  137. if hasupper:
  138. yield (end & not_mask, setbits(end), shift)
  139. start = nextstart
  140. end = nextend
  141. shift += step
  142. def tiered_ranges(numtype, intsize, signed, start, end, shift_step,
  143. startexcl, endexcl):
  144. assert numtype in (int, float)
  145. assert intsize in (8, 16, 32, 64)
  146. # Convert start and end values to sortable ints
  147. if start is None:
  148. start = 0
  149. else:
  150. start = to_sortable(numtype, intsize, signed, start)
  151. if startexcl:
  152. start += 1
  153. if end is None:
  154. end = 2 ** intsize - 1
  155. else:
  156. end = to_sortable(numtype, intsize, signed, end)
  157. if endexcl:
  158. end -= 1
  159. if not shift_step:
  160. return ((start, end, 0),)
  161. # Yield (rstart, rend, shift) ranges for the different resolutions
  162. return split_ranges(intsize, shift_step, start, end)
  163. # Float-to-byte encoding/decoding
  164. def float_to_byte(value, mantissabits=5, zeroexp=2):
  165. """Encodes a floating point number in a single byte.
  166. """
  167. # Assume int size == float size
  168. fzero = (63 - zeroexp) << mantissabits
  169. bits = unpack("i", pack("f", value))[0]
  170. smallfloat = bits >> (24 - mantissabits)
  171. if smallfloat < fzero:
  172. # Map negative numbers and 0 to 0
  173. # Map underflow to next smallest non-zero number
  174. if bits <= 0:
  175. result = chr(0)
  176. else:
  177. result = chr(1)
  178. elif smallfloat >= fzero + 0x100:
  179. # Map overflow to largest number
  180. result = chr(255)
  181. else:
  182. result = chr(smallfloat - fzero)
  183. return b(result)
  184. def byte_to_float(b, mantissabits=5, zeroexp=2):
  185. """Decodes a floating point number stored in a single byte.
  186. """
  187. if type(b) is not int:
  188. b = ord(b)
  189. if b == 0:
  190. return 0.0
  191. bits = (b & 0xff) << (24 - mantissabits)
  192. bits += (63 - zeroexp) << 24
  193. return unpack("f", pack("i", bits))[0]
  194. # Length-to-byte approximation functions
  195. # Old implementation:
  196. #def length_to_byte(length):
  197. # """Returns a logarithmic approximation of the given number, in the range
  198. # 0-255. The approximation has high precision at the low end (e.g.
  199. # 1 -> 0, 2 -> 1, 3 -> 2 ...) and low precision at the high end. Numbers
  200. # equal to or greater than 108116 all approximate to 255.
  201. #
  202. # This is useful for storing field lengths, where the general case is small
  203. # documents and very large documents are more rare.
  204. # """
  205. #
  206. # # This encoding formula works up to 108116 -> 255, so if the length is
  207. # # equal to or greater than that limit, just return 255.
  208. # if length >= 108116:
  209. # return 255
  210. #
  211. # # The parameters of this formula where chosen heuristically so that low
  212. # # numbers would approximate closely, and the byte range 0-255 would cover
  213. # # a decent range of document lengths (i.e. 1 to ~100000).
  214. # return int(round(log((length / 27.0) + 1, 1.033)))
  215. #def _byte_to_length(n):
  216. # return int(round((pow(1.033, n) - 1) * 27))
  217. #_b2l_cache = array("i", (_byte_to_length(i) for i in xrange(256)))
  218. #byte_to_length = _b2l_cache.__getitem__
  219. # New implementation
  220. # Instead of computing the actual formula to get the byte for any given length,
  221. # precompute the length associated with each byte, and use bisect to find the
  222. # nearest value. This gives quite a large speed-up.
  223. #
  224. # Note that this does not give all the same answers as the old, "real"
  225. # implementation since this implementation always "rounds down" (thanks to the
  226. # bisect_left) while the old implementation would "round up" or "round down"
  227. # depending on the input. Since this is a fairly gross approximation anyway,
  228. # I don't think it matters much.
  229. # Values generated using the formula from the "old" implementation above
  230. _length_byte_cache = array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14,
  231. 16, 17, 18, 20, 21, 23, 25, 26, 28, 30, 32, 34, 36, 38, 40, 42, 45, 47, 49, 52,
  232. 54, 57, 60, 63, 66, 69, 72, 75, 79, 82, 86, 89, 93, 97, 101, 106, 110, 114,
  233. 119, 124, 129, 134, 139, 145, 150, 156, 162, 169, 175, 182, 189, 196, 203, 211,
  234. 219, 227, 235, 244, 253, 262, 271, 281, 291, 302, 313, 324, 336, 348, 360, 373,
  235. 386, 399, 414, 428, 443, 459, 475, 491, 508, 526, 544, 563, 583, 603, 623, 645,
  236. 667, 690, 714, 738, 763, 789, 816, 844, 873, 903, 933, 965, 998, 1032, 1066,
  237. 1103, 1140, 1178, 1218, 1259, 1302, 1345, 1391, 1438, 1486, 1536, 1587, 1641,
  238. 1696, 1753, 1811, 1872, 1935, 1999, 2066, 2135, 2207, 2280, 2356, 2435, 2516,
  239. 2600, 2687, 2777, 2869, 2965, 3063, 3165, 3271, 3380, 3492, 3608, 3728, 3852,
  240. 3980, 4112, 4249, 4390, 4536, 4686, 4842, 5002, 5168, 5340, 5517, 5700, 5889,
  241. 6084, 6286, 6494, 6709, 6932, 7161, 7398, 7643, 7897, 8158, 8428, 8707, 8995,
  242. 9293, 9601, 9918, 10247, 10586, 10936, 11298, 11671, 12057, 12456, 12868,
  243. 13294, 13733, 14187, 14656, 15141, 15641, 16159, 16693, 17244, 17814, 18403,
  244. 19011, 19640, 20289, 20959, 21652, 22367, 23106, 23869, 24658, 25472, 26314,
  245. 27183, 28081, 29009, 29967, 30957, 31979, 33035, 34126, 35254, 36418, 37620,
  246. 38863, 40146, 41472, 42841, 44256, 45717, 47227, 48786, 50397, 52061, 53780,
  247. 55556, 57390, 59285, 61242, 63264, 65352, 67510, 69739, 72041, 74419, 76876,
  248. 79414, 82035, 84743, 87541, 90430, 93416, 96499, 99684, 102975, 106374])
  249. def length_to_byte(length):
  250. if length is None:
  251. return 0
  252. if length >= 106374:
  253. return 255
  254. else:
  255. return bisect_left(_length_byte_cache, length)
  256. byte_to_length = _length_byte_cache.__getitem__