numlists.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373
  1. from array import array
  2. from whoosh.compat import xrange
  3. from whoosh.system import emptybytes
  4. from whoosh.system import pack_byte, unpack_byte
  5. from whoosh.system import pack_ushort_le, unpack_ushort_le
  6. from whoosh.system import pack_uint_le, unpack_uint_le
  7. def delta_encode(nums):
  8. base = 0
  9. for n in nums:
  10. yield n - base
  11. base = n
  12. def delta_decode(nums):
  13. base = 0
  14. for n in nums:
  15. base += n
  16. yield base
  17. class GrowableArray(object):
  18. def __init__(self, inittype="B", allow_longs=True):
  19. self.array = array(inittype)
  20. self._allow_longs = allow_longs
  21. def __repr__(self):
  22. return "%s(%r)" % (self.__class__.__name__, self.array)
  23. def __len__(self):
  24. return len(self.array)
  25. def __iter__(self):
  26. return iter(self.array)
  27. def _retype(self, maxnum):
  28. if maxnum < 2 ** 16:
  29. newtype = "H"
  30. elif maxnum < 2 ** 31:
  31. newtype = "i"
  32. elif maxnum < 2 ** 32:
  33. newtype = "I"
  34. elif self._allow_longs:
  35. newtype = "q"
  36. else:
  37. raise OverflowError("%r is too big to fit in an array" % maxnum)
  38. try:
  39. self.array = array(newtype, iter(self.array))
  40. except ValueError:
  41. self.array = list(self.array)
  42. def append(self, n):
  43. try:
  44. self.array.append(n)
  45. except OverflowError:
  46. self._retype(n)
  47. self.array.append(n)
  48. def extend(self, ns):
  49. append = self.append
  50. for n in ns:
  51. append(n)
  52. @property
  53. def typecode(self):
  54. if isinstance(self.array, array):
  55. return self.array.typecode
  56. else:
  57. return "q"
  58. def to_file(self, dbfile):
  59. if isinstance(self.array, array):
  60. dbfile.write_array(self.array)
  61. else:
  62. write_long = dbfile.write_long
  63. for n in self.array:
  64. write_long(n)
  65. # Number list encoding base class
  66. class NumberEncoding(object):
  67. maxint = None
  68. def write_nums(self, f, numbers):
  69. raise NotImplementedError
  70. def read_nums(self, f, n):
  71. raise NotImplementedError
  72. def write_deltas(self, f, numbers):
  73. return self.write_nums(f, list(delta_encode(numbers)))
  74. def read_deltas(self, f, n):
  75. return delta_decode(self.read_nums(f, n))
  76. def get(self, f, pos, i):
  77. f.seek(pos)
  78. n = None
  79. for n in self.read_nums(f, i + 1):
  80. pass
  81. return n
  82. # Fixed width encodings
  83. class FixedEncoding(NumberEncoding):
  84. _encode = None
  85. _decode = None
  86. size = None
  87. def write_nums(self, f, numbers):
  88. _encode = self._encode
  89. for n in numbers:
  90. f.write(_encode(n))
  91. def read_nums(self, f, n):
  92. _decode = self._decode
  93. for _ in xrange(n):
  94. yield _decode(f.read(self.size))
  95. def get(self, f, pos, i):
  96. f.seek(pos + i * self.size)
  97. return self._decode(f.read(self.size))
  98. class ByteEncoding(FixedEncoding):
  99. size = 1
  100. maxint = 255
  101. _encode = pack_byte
  102. _decode = unpack_byte
  103. class UShortEncoding(FixedEncoding):
  104. size = 2
  105. maxint = 2 ** 16 - 1
  106. _encode = pack_ushort_le
  107. _decode = unpack_ushort_le
  108. class UIntEncoding(FixedEncoding):
  109. size = 4
  110. maxint = 2 ** 32 - 1
  111. _encode = pack_uint_le
  112. _decode = unpack_uint_le
  113. # High-bit encoded variable-length integer
  114. class Varints(NumberEncoding):
  115. maxint = None
  116. def write_nums(self, f, numbers):
  117. for n in numbers:
  118. f.write_varint(n)
  119. def read_nums(self, f, n):
  120. for _ in xrange(n):
  121. yield f.read_varint()
  122. # Simple16 algorithm for storing arrays of positive integers (usually delta
  123. # encoded lists of sorted integers)
  124. #
  125. # 1. http://www2008.org/papers/pdf/p387-zhangA.pdf
  126. # 2. http://www2009.org/proceedings/pdf/p401.pdf
  127. class Simple16(NumberEncoding):
  128. # The maximum possible integer value Simple16 can encode is < 2^28.
  129. # Therefore, in order to use Simple16, the application must have its own
  130. # code to encode numbers in the range of [2^28, 2^32). A simple way is just
  131. # write those numbers as 32-bit integers (that is, no compression for very
  132. # big numbers).
  133. _numsize = 16
  134. _bitsize = 28
  135. maxint = 2 ** _bitsize - 1
  136. # Number of stored numbers per code
  137. _num = [28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1]
  138. # Number of bits for each number per code
  139. _bits = [
  140. (1,) * 28,
  141. (2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
  142. (1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1),
  143. (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2),
  144. (2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2),
  145. (4, 3, 3, 3, 3, 3, 3, 3, 3),
  146. (3, 4, 4, 4, 4, 3, 3, 3),
  147. (4, 4, 4, 4, 4, 4, 4),
  148. (5, 5, 5, 5, 4, 4),
  149. (4, 4, 5, 5, 5, 5),
  150. (6, 6, 6, 5, 5),
  151. (5, 5, 6, 6, 6),
  152. (7, 7, 7, 7),
  153. (10, 9, 9),
  154. (14, 14),
  155. (28,),
  156. ]
  157. def write_nums(self, f, numbers):
  158. _compress = self._compress
  159. i = 0
  160. while i < len(numbers):
  161. value, taken = _compress(numbers, i, len(numbers) - i)
  162. f.write_uint_le(value)
  163. i += taken
  164. def _compress(self, inarray, inoffset, n):
  165. _numsize = self._numsize
  166. _bitsize = self._bitsize
  167. _num = self._num
  168. _bits = self._bits
  169. for key in xrange(_numsize):
  170. value = key << _bitsize
  171. num = _num[key] if _num[key] < n else n
  172. bits = 0
  173. j = 0
  174. while j < num and inarray[inoffset + j] < (1 << _bits[key][j]):
  175. x = inarray[inoffset + j]
  176. value |= x << bits
  177. bits += _bits[key][j]
  178. j += 1
  179. if j == num:
  180. return value, num
  181. raise Exception
  182. def read_nums(self, f, n):
  183. _decompress = self._decompress
  184. i = 0
  185. while i < n:
  186. value = unpack_uint_le(f.read(4))[0]
  187. for v in _decompress(value, n - i):
  188. yield v
  189. i += 1
  190. def _decompress(self, value, n):
  191. _numsize = self._numsize
  192. _bitsize = self._bitsize
  193. _num = self._num
  194. _bits = self._bits
  195. key = value >> _bitsize
  196. num = _num[key] if _num[key] < n else n
  197. bits = 0
  198. for j in xrange(num):
  199. v = value >> bits
  200. yield v & (0xffffffff >> (32 - _bits[key][j]))
  201. bits += _bits[key][j]
  202. def get(self, f, pos, i):
  203. f.seek(pos)
  204. base = 0
  205. value = unpack_uint_le(f.read(4))
  206. key = value >> self._bitsize
  207. num = self._num[key]
  208. while i > base + num:
  209. base += num
  210. value = unpack_uint_le(f.read(4))
  211. key = value >> self._bitsize
  212. num = self._num[key]
  213. offset = i - base
  214. if offset:
  215. value = value >> sum(self._bits[key][:offset])
  216. return value & (2 ** self._bits[key][offset] - 1)
  217. # Google Packed Ints algorithm: a set of four numbers is preceded by a "key"
  218. # byte, which encodes how many bytes each of the next four integers use
  219. # (stored in the byte as four 2-bit numbers)
  220. class GInts(NumberEncoding):
  221. maxint = 2 ** 32 - 1
  222. # Number of future bytes to expect after a "key" byte value of N -- used to
  223. # skip ahead from a key byte
  224. _lens = array("B", [4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 5, 6,
  225. 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9,
  226. 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11,
  227. 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7,
  228. 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10,
  229. 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11,
  230. 12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9,
  231. 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11,
  232. 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12,
  233. 13, 14, 12, 13, 14, 15, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11,
  234. 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10,
  235. 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11,
  236. 12, 13, 14, 12, 13, 14, 15, 13, 14, 15, 16])
  237. def key_to_sizes(self, key):
  238. """Returns a list of the sizes of the next four numbers given a key
  239. byte.
  240. """
  241. return [(key >> (i * 2) & 3) + 1 for i in xrange(4)]
  242. def write_nums(self, f, numbers):
  243. buf = emptybytes
  244. count = 0
  245. key = 0
  246. for v in numbers:
  247. shift = count * 2
  248. if v < 256:
  249. buf += pack_byte(v)
  250. elif v < 65536:
  251. key |= 1 << shift
  252. buf += pack_ushort_le(v)
  253. elif v < 16777216:
  254. key |= 2 << shift
  255. buf += pack_uint_le(v)[:3]
  256. else:
  257. key |= 3 << shift
  258. buf += pack_uint_le(v)
  259. count += 1
  260. if count == 4:
  261. f.write_byte(key)
  262. f.write(buf)
  263. count = 0
  264. key = 0
  265. buf = emptybytes # Clear the buffer
  266. # Write out leftovers in the buffer
  267. if count:
  268. f.write_byte(key)
  269. f.write(buf)
  270. def read_nums(self, f, n):
  271. """Read N integers from the bytes stream dbfile. Expects that the file
  272. is positioned at a key byte.
  273. """
  274. count = 0
  275. key = None
  276. for _ in xrange(n):
  277. if count == 0:
  278. key = f.read_byte()
  279. code = key >> (count * 2) & 3
  280. if code == 0:
  281. yield f.read_byte()
  282. elif code == 1:
  283. yield f.read_ushort_le()
  284. elif code == 2:
  285. yield unpack_uint_le(f.read(3) + "\x00")[0]
  286. else:
  287. yield f.read_uint_le()
  288. count = (count + 1) % 4
  289. # def get(self, f, pos, i):
  290. # f.seek(pos)
  291. # base = 0
  292. # key = f.read_byte()
  293. # while i > base + 4:
  294. # base += 4
  295. # f.seek(self._lens[key], 1)
  296. # key = f.read_byte()
  297. #
  298. # for n in self.read_nums(f, (i + 1) - base):
  299. # pass
  300. # return n