| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373 |
- from array import array
- from whoosh.compat import xrange
- from whoosh.system import emptybytes
- from whoosh.system import pack_byte, unpack_byte
- from whoosh.system import pack_ushort_le, unpack_ushort_le
- from whoosh.system import pack_uint_le, unpack_uint_le
- def delta_encode(nums):
- base = 0
- for n in nums:
- yield n - base
- base = n
- def delta_decode(nums):
- base = 0
- for n in nums:
- base += n
- yield base
- class GrowableArray(object):
- def __init__(self, inittype="B", allow_longs=True):
- self.array = array(inittype)
- self._allow_longs = allow_longs
- def __repr__(self):
- return "%s(%r)" % (self.__class__.__name__, self.array)
- def __len__(self):
- return len(self.array)
- def __iter__(self):
- return iter(self.array)
- def _retype(self, maxnum):
- if maxnum < 2 ** 16:
- newtype = "H"
- elif maxnum < 2 ** 31:
- newtype = "i"
- elif maxnum < 2 ** 32:
- newtype = "I"
- elif self._allow_longs:
- newtype = "q"
- else:
- raise OverflowError("%r is too big to fit in an array" % maxnum)
- try:
- self.array = array(newtype, iter(self.array))
- except ValueError:
- self.array = list(self.array)
- def append(self, n):
- try:
- self.array.append(n)
- except OverflowError:
- self._retype(n)
- self.array.append(n)
- def extend(self, ns):
- append = self.append
- for n in ns:
- append(n)
- @property
- def typecode(self):
- if isinstance(self.array, array):
- return self.array.typecode
- else:
- return "q"
- def to_file(self, dbfile):
- if isinstance(self.array, array):
- dbfile.write_array(self.array)
- else:
- write_long = dbfile.write_long
- for n in self.array:
- write_long(n)
- # Number list encoding base class
- class NumberEncoding(object):
- maxint = None
- def write_nums(self, f, numbers):
- raise NotImplementedError
- def read_nums(self, f, n):
- raise NotImplementedError
- def write_deltas(self, f, numbers):
- return self.write_nums(f, list(delta_encode(numbers)))
- def read_deltas(self, f, n):
- return delta_decode(self.read_nums(f, n))
- def get(self, f, pos, i):
- f.seek(pos)
- n = None
- for n in self.read_nums(f, i + 1):
- pass
- return n
- # Fixed width encodings
- class FixedEncoding(NumberEncoding):
- _encode = None
- _decode = None
- size = None
- def write_nums(self, f, numbers):
- _encode = self._encode
- for n in numbers:
- f.write(_encode(n))
- def read_nums(self, f, n):
- _decode = self._decode
- for _ in xrange(n):
- yield _decode(f.read(self.size))
- def get(self, f, pos, i):
- f.seek(pos + i * self.size)
- return self._decode(f.read(self.size))
- class ByteEncoding(FixedEncoding):
- size = 1
- maxint = 255
- _encode = pack_byte
- _decode = unpack_byte
- class UShortEncoding(FixedEncoding):
- size = 2
- maxint = 2 ** 16 - 1
- _encode = pack_ushort_le
- _decode = unpack_ushort_le
- class UIntEncoding(FixedEncoding):
- size = 4
- maxint = 2 ** 32 - 1
- _encode = pack_uint_le
- _decode = unpack_uint_le
- # High-bit encoded variable-length integer
- class Varints(NumberEncoding):
- maxint = None
- def write_nums(self, f, numbers):
- for n in numbers:
- f.write_varint(n)
- def read_nums(self, f, n):
- for _ in xrange(n):
- yield f.read_varint()
- # Simple16 algorithm for storing arrays of positive integers (usually delta
- # encoded lists of sorted integers)
- #
- # 1. http://www2008.org/papers/pdf/p387-zhangA.pdf
- # 2. http://www2009.org/proceedings/pdf/p401.pdf
- class Simple16(NumberEncoding):
- # The maximum possible integer value Simple16 can encode is < 2^28.
- # Therefore, in order to use Simple16, the application must have its own
- # code to encode numbers in the range of [2^28, 2^32). A simple way is just
- # write those numbers as 32-bit integers (that is, no compression for very
- # big numbers).
- _numsize = 16
- _bitsize = 28
- maxint = 2 ** _bitsize - 1
- # Number of stored numbers per code
- _num = [28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1]
- # Number of bits for each number per code
- _bits = [
- (1,) * 28,
- (2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
- (1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1),
- (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2),
- (2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2),
- (4, 3, 3, 3, 3, 3, 3, 3, 3),
- (3, 4, 4, 4, 4, 3, 3, 3),
- (4, 4, 4, 4, 4, 4, 4),
- (5, 5, 5, 5, 4, 4),
- (4, 4, 5, 5, 5, 5),
- (6, 6, 6, 5, 5),
- (5, 5, 6, 6, 6),
- (7, 7, 7, 7),
- (10, 9, 9),
- (14, 14),
- (28,),
- ]
- def write_nums(self, f, numbers):
- _compress = self._compress
- i = 0
- while i < len(numbers):
- value, taken = _compress(numbers, i, len(numbers) - i)
- f.write_uint_le(value)
- i += taken
- def _compress(self, inarray, inoffset, n):
- _numsize = self._numsize
- _bitsize = self._bitsize
- _num = self._num
- _bits = self._bits
- for key in xrange(_numsize):
- value = key << _bitsize
- num = _num[key] if _num[key] < n else n
- bits = 0
- j = 0
- while j < num and inarray[inoffset + j] < (1 << _bits[key][j]):
- x = inarray[inoffset + j]
- value |= x << bits
- bits += _bits[key][j]
- j += 1
- if j == num:
- return value, num
- raise Exception
- def read_nums(self, f, n):
- _decompress = self._decompress
- i = 0
- while i < n:
- value = unpack_uint_le(f.read(4))[0]
- for v in _decompress(value, n - i):
- yield v
- i += 1
- def _decompress(self, value, n):
- _numsize = self._numsize
- _bitsize = self._bitsize
- _num = self._num
- _bits = self._bits
- key = value >> _bitsize
- num = _num[key] if _num[key] < n else n
- bits = 0
- for j in xrange(num):
- v = value >> bits
- yield v & (0xffffffff >> (32 - _bits[key][j]))
- bits += _bits[key][j]
- def get(self, f, pos, i):
- f.seek(pos)
- base = 0
- value = unpack_uint_le(f.read(4))
- key = value >> self._bitsize
- num = self._num[key]
- while i > base + num:
- base += num
- value = unpack_uint_le(f.read(4))
- key = value >> self._bitsize
- num = self._num[key]
- offset = i - base
- if offset:
- value = value >> sum(self._bits[key][:offset])
- return value & (2 ** self._bits[key][offset] - 1)
- # Google Packed Ints algorithm: a set of four numbers is preceded by a "key"
- # byte, which encodes how many bytes each of the next four integers use
- # (stored in the byte as four 2-bit numbers)
- class GInts(NumberEncoding):
- maxint = 2 ** 32 - 1
- # Number of future bytes to expect after a "key" byte value of N -- used to
- # skip ahead from a key byte
- _lens = array("B", [4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 5, 6,
- 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9,
- 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11,
- 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7,
- 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10,
- 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11,
- 12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9,
- 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11,
- 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12,
- 13, 14, 12, 13, 14, 15, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11,
- 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10,
- 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11,
- 12, 13, 14, 12, 13, 14, 15, 13, 14, 15, 16])
- def key_to_sizes(self, key):
- """Returns a list of the sizes of the next four numbers given a key
- byte.
- """
- return [(key >> (i * 2) & 3) + 1 for i in xrange(4)]
- def write_nums(self, f, numbers):
- buf = emptybytes
- count = 0
- key = 0
- for v in numbers:
- shift = count * 2
- if v < 256:
- buf += pack_byte(v)
- elif v < 65536:
- key |= 1 << shift
- buf += pack_ushort_le(v)
- elif v < 16777216:
- key |= 2 << shift
- buf += pack_uint_le(v)[:3]
- else:
- key |= 3 << shift
- buf += pack_uint_le(v)
- count += 1
- if count == 4:
- f.write_byte(key)
- f.write(buf)
- count = 0
- key = 0
- buf = emptybytes # Clear the buffer
- # Write out leftovers in the buffer
- if count:
- f.write_byte(key)
- f.write(buf)
- def read_nums(self, f, n):
- """Read N integers from the bytes stream dbfile. Expects that the file
- is positioned at a key byte.
- """
- count = 0
- key = None
- for _ in xrange(n):
- if count == 0:
- key = f.read_byte()
- code = key >> (count * 2) & 3
- if code == 0:
- yield f.read_byte()
- elif code == 1:
- yield f.read_ushort_le()
- elif code == 2:
- yield unpack_uint_le(f.read(3) + "\x00")[0]
- else:
- yield f.read_uint_le()
- count = (count + 1) % 4
- # def get(self, f, pos, i):
- # f.seek(pos)
- # base = 0
- # key = f.read_byte()
- # while i > base + 4:
- # base += 4
- # f.seek(self._lens[key], 1)
- # key = f.read_byte()
- #
- # for n in self.read_nums(f, (i + 1) - base):
- # pass
- # return n
|