123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703 |
- """
- An implementation of an object that acts like a collection of on/off bits.
- """
- import operator
- from array import array
- from bisect import bisect_left, bisect_right, insort
- from whoosh.compat import integer_types, izip, izip_longest, next, xrange
- from whoosh.util.numeric import bytes_for_bits
- # Number of '1' bits in each byte (0-255)
- _1SPERBYTE = array('B', [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2,
- 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4,
- 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3,
- 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5,
- 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4,
- 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5,
- 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5,
- 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4,
- 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7,
- 6, 7, 7, 8])
- class DocIdSet(object):
- """Base class for a set of positive integers, implementing a subset of the
- built-in ``set`` type's interface with extra docid-related methods.
- This is a superclass for alternative set implementations to the built-in
- ``set`` which are more memory-efficient and specialized toward storing
- sorted lists of positive integers, though they will inevitably be slower
- than ``set`` for most operations since they're pure Python.
- """
- def __eq__(self, other):
- for a, b in izip(self, other):
- if a != b:
- return False
- return True
- def __neq__(self, other):
- return not self.__eq__(other)
- def __len__(self):
- raise NotImplementedError
- def __iter__(self):
- raise NotImplementedError
- def __contains__(self, i):
- raise NotImplementedError
- def __or__(self, other):
- return self.union(other)
- def __and__(self, other):
- return self.intersection(other)
- def __sub__(self, other):
- return self.difference(other)
- def copy(self):
- raise NotImplementedError
- def add(self, n):
- raise NotImplementedError
- def discard(self, n):
- raise NotImplementedError
- def update(self, other):
- add = self.add
- for i in other:
- add(i)
- def intersection_update(self, other):
- for n in self:
- if n not in other:
- self.discard(n)
- def difference_update(self, other):
- for n in other:
- self.discard(n)
- def invert_update(self, size):
- """Updates the set in-place to contain numbers in the range
- ``[0 - size)`` except numbers that are in this set.
- """
- for i in xrange(size):
- if i in self:
- self.discard(i)
- else:
- self.add(i)
- def intersection(self, other):
- c = self.copy()
- c.intersection_update(other)
- return c
- def union(self, other):
- c = self.copy()
- c.update(other)
- return c
- def difference(self, other):
- c = self.copy()
- c.difference_update(other)
- return c
- def invert(self, size):
- c = self.copy()
- c.invert_update(size)
- return c
- def isdisjoint(self, other):
- a = self
- b = other
- if len(other) < len(self):
- a, b = other, self
- for num in a:
- if num in b:
- return False
- return True
- def before(self, i):
- """Returns the previous integer in the set before ``i``, or None.
- """
- raise NotImplementedError
- def after(self, i):
- """Returns the next integer in the set after ``i``, or None.
- """
- raise NotImplementedError
- def first(self):
- """Returns the first (lowest) integer in the set.
- """
- raise NotImplementedError
- def last(self):
- """Returns the last (highest) integer in the set.
- """
- raise NotImplementedError
- class BaseBitSet(DocIdSet):
- # Methods to override
- def byte_count(self):
- raise NotImplementedError
- def _get_byte(self, i):
- raise NotImplementedError
- def _iter_bytes(self):
- raise NotImplementedError
- # Base implementations
- def __len__(self):
- return sum(_1SPERBYTE[b] for b in self._iter_bytes())
- def __iter__(self):
- base = 0
- for byte in self._iter_bytes():
- for i in xrange(8):
- if byte & (1 << i):
- yield base + i
- base += 8
- def __nonzero__(self):
- return any(n for n in self._iter_bytes())
- __bool__ = __nonzero__
- def __contains__(self, i):
- bucket = i // 8
- if bucket >= self.byte_count():
- return False
- return bool(self._get_byte(bucket) & (1 << (i & 7)))
- def first(self):
- return self.after(-1)
- def last(self):
- return self.before(self.byte_count() * 8 + 1)
- def before(self, i):
- _get_byte = self._get_byte
- size = self.byte_count() * 8
- if i <= 0:
- return None
- elif i >= size:
- i = size - 1
- else:
- i -= 1
- bucket = i // 8
- while i >= 0:
- byte = _get_byte(bucket)
- if not byte:
- bucket -= 1
- i = bucket * 8 + 7
- continue
- if byte & (1 << (i & 7)):
- return i
- if i % 8 == 0:
- bucket -= 1
- i -= 1
- return None
- def after(self, i):
- _get_byte = self._get_byte
- size = self.byte_count() * 8
- if i >= size:
- return None
- elif i < 0:
- i = 0
- else:
- i += 1
- bucket = i // 8
- while i < size:
- byte = _get_byte(bucket)
- if not byte:
- bucket += 1
- i = bucket * 8
- continue
- if byte & (1 << (i & 7)):
- return i
- i += 1
- if i % 8 == 0:
- bucket += 1
- return None
- class OnDiskBitSet(BaseBitSet):
- """A DocIdSet backed by an array of bits on disk.
- >>> st = RamStorage()
- >>> f = st.create_file("test.bin")
- >>> bs = BitSet([1, 10, 15, 7, 2])
- >>> bytecount = bs.to_disk(f)
- >>> f.close()
- >>> # ...
- >>> f = st.open_file("test.bin")
- >>> odbs = OnDiskBitSet(f, bytecount)
- >>> list(odbs)
- [1, 2, 7, 10, 15]
- """
- def __init__(self, dbfile, basepos, bytecount):
- """
- :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
- to read from.
- :param basepos: the base position of the bytes in the given file.
- :param bytecount: the number of bytes to use for the bit array.
- """
- self._dbfile = dbfile
- self._basepos = basepos
- self._bytecount = bytecount
- def __repr__(self):
- return "%s(%s, %d, %d)" % (self.__class__.__name__, self.dbfile,
- self._basepos, self.bytecount)
- def byte_count(self):
- return self._bytecount
- def _get_byte(self, n):
- return self._dbfile.get_byte(self._basepos + n)
- def _iter_bytes(self):
- dbfile = self._dbfile
- dbfile.seek(self._basepos)
- for _ in xrange(self._bytecount):
- yield dbfile.read_byte()
- class BitSet(BaseBitSet):
- """A DocIdSet backed by an array of bits. This can also be useful as a bit
- array (e.g. for a Bloom filter). It is much more memory efficient than a
- large built-in set of integers, but wastes memory for sparse sets.
- """
- def __init__(self, source=None, size=0):
- """
- :param maxsize: the maximum size of the bit array.
- :param source: an iterable of positive integers to add to this set.
- :param bits: an array of unsigned bytes ("B") to use as the underlying
- bit array. This is used by some of the object's methods.
- """
- # If the source is a list, tuple, or set, we can guess the size
- if not size and isinstance(source, (list, tuple, set, frozenset)):
- size = max(source)
- bytecount = bytes_for_bits(size)
- self.bits = array("B", (0 for _ in xrange(bytecount)))
- if source:
- add = self.add
- for num in source:
- add(num)
- def __repr__(self):
- return "%s(%r)" % (self.__class__.__name__, list(self))
- def byte_count(self):
- return len(self.bits)
- def _get_byte(self, n):
- return self.bits[n]
- def _iter_bytes(self):
- return iter(self.bits)
- def _trim(self):
- bits = self.bits
- last = len(self.bits) - 1
- while last >= 0 and not bits[last]:
- last -= 1
- del self.bits[last + 1:]
- def _resize(self, tosize):
- curlength = len(self.bits)
- newlength = bytes_for_bits(tosize)
- if newlength > curlength:
- self.bits.extend((0,) * (newlength - curlength))
- elif newlength < curlength:
- del self.bits[newlength + 1:]
- def _zero_extra_bits(self, size):
- bits = self.bits
- spill = size - ((len(bits) - 1) * 8)
- if spill:
- mask = 2 ** spill - 1
- bits[-1] = bits[-1] & mask
- def _logic(self, obj, op, other):
- objbits = obj.bits
- for i, (byte1, byte2) in enumerate(izip_longest(objbits, other.bits,
- fillvalue=0)):
- value = op(byte1, byte2) & 0xFF
- if i >= len(objbits):
- objbits.append(value)
- else:
- objbits[i] = value
- obj._trim()
- return obj
- def to_disk(self, dbfile):
- dbfile.write_array(self.bits)
- return len(self.bits)
- @classmethod
- def from_bytes(cls, bs):
- b = cls()
- b.bits = array("B", bs)
- return b
- @classmethod
- def from_disk(cls, dbfile, bytecount):
- return cls.from_bytes(dbfile.read_array("B", bytecount))
- def copy(self):
- b = self.__class__()
- b.bits = array("B", iter(self.bits))
- return b
- def clear(self):
- for i in xrange(len(self.bits)):
- self.bits[i] = 0
- def add(self, i):
- bucket = i >> 3
- if bucket >= len(self.bits):
- self._resize(i + 1)
- self.bits[bucket] |= 1 << (i & 7)
- def discard(self, i):
- bucket = i >> 3
- self.bits[bucket] &= ~(1 << (i & 7))
- def _resize_to_other(self, other):
- if isinstance(other, (list, tuple, set, frozenset)):
- maxbit = max(other)
- if maxbit // 8 > len(self.bits):
- self._resize(maxbit)
- def update(self, iterable):
- self._resize_to_other(iterable)
- DocIdSet.update(self, iterable)
- def intersection_update(self, other):
- if isinstance(other, BitSet):
- return self._logic(self, operator.__and__, other)
- discard = self.discard
- for n in self:
- if n not in other:
- discard(n)
- def difference_update(self, other):
- if isinstance(other, BitSet):
- return self._logic(self, lambda x, y: x & ~y, other)
- discard = self.discard
- for n in other:
- discard(n)
- def invert_update(self, size):
- bits = self.bits
- for i in xrange(len(bits)):
- bits[i] = ~bits[i] & 0xFF
- self._zero_extra_bits(size)
- def union(self, other):
- if isinstance(other, BitSet):
- return self._logic(self.copy(), operator.__or__, other)
- b = self.copy()
- b.update(other)
- return b
- def intersection(self, other):
- if isinstance(other, BitSet):
- return self._logic(self.copy(), operator.__and__, other)
- return BitSet(source=(n for n in self if n in other))
- def difference(self, other):
- if isinstance(other, BitSet):
- return self._logic(self.copy(), lambda x, y: x & ~y, other)
- return BitSet(source=(n for n in self if n not in other))
- class SortedIntSet(DocIdSet):
- """A DocIdSet backed by a sorted array of integers.
- """
- def __init__(self, source=None, typecode="I"):
- if source:
- self.data = array(typecode, sorted(source))
- else:
- self.data = array(typecode)
- self.typecode = typecode
- def copy(self):
- sis = SortedIntSet()
- sis.data = array(self.typecode, self.data)
- return sis
- def size(self):
- return len(self.data) * self.data.itemsize
- def __repr__(self):
- return "%s(%r)" % (self.__class__.__name__, self.data)
- def __len__(self):
- return len(self.data)
- def __iter__(self):
- return iter(self.data)
- def __nonzero__(self):
- return bool(self.data)
- __bool__ = __nonzero__
- def __contains__(self, i):
- data = self.data
- if not data or i < data[0] or i > data[-1]:
- return False
- pos = bisect_left(data, i)
- if pos == len(data):
- return False
- return data[pos] == i
- def add(self, i):
- data = self.data
- if not data or i > data[-1]:
- data.append(i)
- else:
- mn = data[0]
- mx = data[-1]
- if i == mn or i == mx:
- return
- elif i > mx:
- data.append(i)
- elif i < mn:
- data.insert(0, i)
- else:
- pos = bisect_left(data, i)
- if data[pos] != i:
- data.insert(pos, i)
- def discard(self, i):
- data = self.data
- pos = bisect_left(data, i)
- if data[pos] == i:
- data.pop(pos)
- def clear(self):
- self.data = array(self.typecode)
- def intersection_update(self, other):
- self.data = array(self.typecode, (num for num in self if num in other))
- def difference_update(self, other):
- self.data = array(self.typecode,
- (num for num in self if num not in other))
- def intersection(self, other):
- return SortedIntSet((num for num in self if num in other))
- def difference(self, other):
- return SortedIntSet((num for num in self if num not in other))
- def first(self):
- return self.data[0]
- def last(self):
- return self.data[-1]
- def before(self, i):
- data = self.data
- pos = bisect_left(data, i)
- if pos < 1:
- return None
- else:
- return data[pos - 1]
- def after(self, i):
- data = self.data
- if not data or i >= data[-1]:
- return None
- elif i < data[0]:
- return data[0]
- pos = bisect_right(data, i)
- return data[pos]
- class ReverseIdSet(DocIdSet):
- """
- Wraps a DocIdSet object and reverses its semantics, so docs in the wrapped
- set are not in this set, and vice-versa.
- """
- def __init__(self, idset, limit):
- """
- :param idset: the DocIdSet object to wrap.
- :param limit: the highest possible ID plus one.
- """
- self.idset = idset
- self.limit = limit
- def __len__(self):
- return self.limit - len(self.idset)
- def __contains__(self, i):
- return i not in self.idset
- def __iter__(self):
- ids = iter(self.idset)
- try:
- nx = next(ids)
- except StopIteration:
- nx = -1
- for i in xrange(self.limit):
- if i == nx:
- try:
- nx = next(ids)
- except StopIteration:
- nx = -1
- else:
- yield i
- def add(self, n):
- self.idset.discard(n)
- def discard(self, n):
- self.idset.add(n)
- def first(self):
- for i in self:
- return i
- def last(self):
- idset = self.idset
- maxid = self.limit - 1
- if idset.last() < maxid - 1:
- return maxid
- for i in xrange(maxid, -1, -1):
- if i not in idset:
- return i
- ROARING_CUTOFF = 1 << 12
- class RoaringIdSet(DocIdSet):
- """
- Separates IDs into ranges of 2^16 bits, and stores each range in the most
- efficient type of doc set, either a BitSet (if the range has >= 2^12 IDs)
- or a sorted ID set of 16-bit shorts.
- """
- cutoff = 2**12
- def __init__(self, source=None):
- self.idsets = []
- if source:
- self.update(source)
- def __len__(self):
- if not self.idsets:
- return 0
- return sum(len(idset) for idset in self.idsets)
- def __contains__(self, n):
- bucket = n >> 16
- if bucket >= len(self.idsets):
- return False
- return (n - (bucket << 16)) in self.idsets[bucket]
- def __iter__(self):
- for i, idset in self.idsets:
- floor = i << 16
- for n in idset:
- yield floor + n
- def _find(self, n):
- bucket = n >> 16
- floor = n << 16
- if bucket >= len(self.idsets):
- self.idsets.extend([SortedIntSet() for _
- in xrange(len(self.idsets), bucket + 1)])
- idset = self.idsets[bucket]
- return bucket, floor, idset
- def add(self, n):
- bucket, floor, idset = self._find(n)
- oldlen = len(idset)
- idset.add(n - floor)
- if oldlen <= ROARING_CUTOFF < len(idset):
- self.idsets[bucket] = BitSet(idset)
- def discard(self, n):
- bucket, floor, idset = self._find(n)
- oldlen = len(idset)
- idset.discard(n - floor)
- if oldlen > ROARING_CUTOFF >= len(idset):
- self.idsets[bucket] = SortedIntSet(idset)
- class MultiIdSet(DocIdSet):
- """Wraps multiple SERIAL sub-DocIdSet objects and presents them as an
- aggregated, read-only set.
- """
- def __init__(self, idsets, offsets):
- """
- :param idsets: a list of DocIdSet objects.
- :param offsets: a list of offsets corresponding to the DocIdSet objects
- in ``idsets``.
- """
- assert len(idsets) == len(offsets)
- self.idsets = idsets
- self.offsets = offsets
- def _document_set(self, n):
- offsets = self.offsets
- return max(bisect_left(offsets, n), len(self.offsets) - 1)
- def _set_and_docnum(self, n):
- setnum = self._document_set(n)
- offset = self.offsets[setnum]
- return self.idsets[setnum], n - offset
- def __len__(self):
- return sum(len(idset) for idset in self.idsets)
- def __iter__(self):
- for idset, offset in izip(self.idsets, self.offsets):
- for docnum in idset:
- yield docnum + offset
- def __contains__(self, item):
- idset, n = self._set_and_docnum(item)
- return n in idset
|