12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735 |
- # Copyright 2009 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- """This module defines writer and reader classes for a fast, immutable
- on-disk key-value database format. The current format is based heavily on
- D. J. Bernstein's CDB format (http://cr.yp.to/cdb.html).
- """
- import os, struct
- from binascii import crc32
- from bisect import bisect_left
- from hashlib import md5 # @UnresolvedImport
- from whoosh.compat import b, bytes_type
- from whoosh.compat import xrange
- from whoosh.util.numlists import GrowableArray
- from whoosh.system import _INT_SIZE, emptybytes
- # Exceptions
class FileFormatError(Exception):
    """Raised when a file does not begin with the expected magic bytes."""
- # Hash functions
def cdb_hash(key):
    """Returns D. J. Bernstein's CDB hash of ``key``, masked to 32 bits.

    :param key: the key to hash, as ``bytes`` or ``str``.

    Bug fix: iterating a ``bytes`` object yields ``int`` items on Python 3
    (but one-character strings on Python 2), so calling ``ord()``
    unconditionally raised ``TypeError`` for the bytes keys this module
    actually stores (see the ``bytes_type`` assertions in
    ``HashWriter.add``). Normalize each item to an integer before mixing.
    """
    h = 5381
    for c in key:
        if not isinstance(c, int):
            c = ord(c)
        # h = (h * 33) & 0xffffffff, then XOR in the byte.  Note that &
        # binds tighter than ^, so the mask is applied before the XOR.
        h = (h + (h << 5)) & 0xffffffff ^ c
    return h
def md5_hash(key):
    """Returns the low 32 bits of the MD5 digest of ``key``.

    :param key: the key to hash, as ``bytes``.
    """
    digest = md5(key).hexdigest()
    return int(digest, 16) & 0xffffffff
def crc_hash(key):
    """Returns the CRC-32 checksum of ``key`` as an unsigned 32-bit value.

    :param key: the key to hash, as ``bytes``.
    """
    # zlib/binascii crc32 can return a signed value on Python 2; masking
    # normalizes the result to an unsigned 32-bit integer on all versions
    return 0xffffffff & crc32(key)
# Supported hash functions, indexed by the "hashtype" byte stored in the
# file header (0 = MD5, 1 = CRC32, 2 = CDB hash)
_hash_functions = (md5_hash, crc_hash, cdb_hash)

# Structs

# Two uints before the key/value pair giving the length of the key and value
_lengths = struct.Struct("!ii")
# A pointer in a hash table, giving the hash value and the key position
_pointer = struct.Struct("!Iq")
# A pointer in the hash table directory, giving the position and number of slots
_dir_entry = struct.Struct("!qi")
# Total byte size of the directory: one _dir_entry per hash table, 256 tables
_directory_size = 256 * _dir_entry.size
- # Basic hash file
class HashWriter(object):
    """Implements a fast on-disk key-value store. This hash uses a two-level
    hashing scheme, where a key is hashed, the low eight bits of the hash value
    are used to index into one of 256 hash tables. This is basically the CDB
    algorithm, but unlike CDB this object writes all data serially (it doesn't
    seek backwards to overwrite information at the end).

    Also unlike CDB, this format uses 64-bit file pointers, so the file length
    is essentially unlimited. However, each key and value must be less than
    2 GB in length.

    File layout (written strictly in this order): a 4-byte magic tag, a
    hash-type byte, two reserved ints, then the key/value pairs, then (at
    :meth:`close`) 256 hash tables, a directory of pointers to those tables,
    a pickled "extras" dict, and finally an int giving the pickle's length.
    """

    def __init__(self, dbfile, magic=b("HSH3"), hashtype=0):
        """
        :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
            to write to.
        :param magic: the format tag bytes to write at the start of the file.
        :param hashtype: an integer indicating which hashing algorithm to use.
            Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash).
        """
        self.dbfile = dbfile
        self.hashtype = hashtype
        # Resolve the hash-type code to the actual hash callable
        self.hashfn = _hash_functions[self.hashtype]
        # A place for subclasses to put extra metadata; pickled at close time
        self.extras = {}

        self.startoffset = dbfile.tell()
        # Write format tag
        dbfile.write(magic)
        # Write hash type
        dbfile.write_byte(self.hashtype)
        # Unused future expansion bits
        dbfile.write_int(0)
        dbfile.write_int(0)

        # 256 lists of hashed keys and positions
        self.buckets = [[] for _ in xrange(256)]
        # List to remember the positions of the hash tables
        self.directory = []

    def tell(self):
        # Returns the current write position in the underlying file
        return self.dbfile.tell()

    def add(self, key, value):
        """Adds a key/value pair to the file. Note that keys DO NOT need to be
        unique. You can store multiple values under the same key and retrieve
        them using :meth:`HashReader.all`.

        :param key: the key, as a bytes object.
        :param value: the value, as a bytes object.
        """
        assert isinstance(key, bytes_type)
        assert isinstance(value, bytes_type)

        dbfile = self.dbfile
        # Remember where this pair starts so the hash table can point to it
        pos = dbfile.tell()
        # Write the key and value lengths, then the raw bytes
        dbfile.write(_lengths.pack(len(key), len(value)))
        dbfile.write(key)
        dbfile.write(value)

        # Get hash value for the key
        h = self.hashfn(key)
        # Add hash and on-disk position to appropriate bucket; the low eight
        # bits of the hash select one of the 256 buckets
        self.buckets[h & 255].append((h, pos))

    def add_all(self, items):
        """Convenience method to add a sequence of ``(key, value)`` pairs. This
        is the same as calling :meth:`HashWriter.add` on each pair in the
        sequence.

        :param items: an iterable of ``(key, value)`` byte-string pairs.
        """
        add = self.add
        for key, value in items:
            add(key, value)

    def _write_hashes(self):
        # Writes 256 hash tables containing pointers to the key/value pairs
        dbfile = self.dbfile

        # Represent an empty slot in the hash table using 0,0 (no key can
        # start at position 0 because of the header)
        null = (0, 0)

        for entries in self.buckets:
            # Start position of this bucket's hash table
            pos = dbfile.tell()
            # Remember the start position and the number of slots
            numslots = 2 * len(entries)
            self.directory.append((pos, numslots))

            # Create the empty hash table (50% load factor: twice as many
            # slots as entries, so open-addressing probes stay short)
            hashtable = [null] * numslots
            # For each (hash value, key position) tuple in the bucket
            for hashval, position in entries:
                # Bitshift and wrap to get the slot for this entry (the low
                # eight bits were already consumed choosing the bucket)
                slot = (hashval >> 8) % numslots
                # If the slot is taken, keep going until we find an empty slot
                # (linear probing)
                while hashtable[slot] != null:
                    slot = (slot + 1) % numslots
                # Insert the entry into the hashtable
                hashtable[slot] = (hashval, position)

            # Write the hash table for this bucket to disk
            for hashval, position in hashtable:
                dbfile.write(_pointer.pack(hashval, position))

    def _write_directory(self):
        # Writes a directory of pointers to the 256 hash tables
        dbfile = self.dbfile
        for position, numslots in self.directory:
            dbfile.write(_dir_entry.pack(position, numslots))

    def _write_extras(self):
        # Pickles the extras dict to the file; subclasses override this to
        # append additional data after the pickle
        self.dbfile.write_pickle(self.extras)

    def close(self):
        """Finishes the file by writing the hash tables, the directory, and
        the pickled extras, then closes the underlying file.

        :returns: the end position of the written data.
        """
        dbfile = self.dbfile

        # Write hash tables
        self._write_hashes()
        # Write directory of pointers to hash tables
        self._write_directory()

        expos = dbfile.tell()
        # Write extra information
        self._write_extras()
        # Write length of pickle so the reader can seek back to its start
        dbfile.write_int(dbfile.tell() - expos)

        endpos = dbfile.tell()
        dbfile.close()
        return endpos
class HashReader(object):
    """Reader for the fast on-disk key-value files created by
    :class:`HashWriter`.
    """

    def __init__(self, dbfile, length=None, magic=b("HSH3"), startoffset=0):
        """
        :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
            to read from.
        :param length: the length of the file data. This is necessary since the
            hashing information is written at the end of the file.
        :param magic: the format tag bytes to look for at the start of the
            file. If the file's format tag does not match these bytes, the
            object raises a :class:`FileFormatError` exception.
        :param startoffset: the starting point of the file data.
        """
        self.dbfile = dbfile
        self.startoffset = startoffset
        self.is_closed = False

        if length is None:
            # No length given; measure to the end of the file
            dbfile.seek(0, os.SEEK_END)
            length = dbfile.tell() - startoffset
        dbfile.seek(startoffset)

        # Check format tag
        filemagic = dbfile.read(4)
        if filemagic != magic:
            raise FileFormatError("Unknown file header %r" % filemagic)

        # Read hash type and resolve it to the matching hash callable
        self.hashtype = dbfile.read_byte()
        self.hashfn = _hash_functions[self.hashtype]

        # Skip unused future expansion bits
        dbfile.read_int()
        dbfile.read_int()
        # Key/value pairs start here, immediately after the header
        self.startofdata = dbfile.tell()

        # The last int in the file is the length of the pickled extras
        # (written by HashWriter.close)
        exptr = startoffset + length - _INT_SIZE
        # Get the length of extras from the end of the file
        exlen = dbfile.get_int(exptr)
        # Read the extras
        expos = exptr - exlen
        dbfile.seek(expos)
        self._read_extras()

        # Calculate the directory base from the beginning of the extras
        dbfile.seek(expos - _directory_size)
        # Read directory of hash tables: 256 (position, numslots) entries
        self.tables = []
        entrysize = _dir_entry.size
        unpackentry = _dir_entry.unpack
        for _ in xrange(256):
            # position, numslots
            self.tables.append(unpackentry(dbfile.read(entrysize)))
        # The position of the first hash table is the end of the key/value pairs
        self.endofdata = self.tables[0][0]

    @classmethod
    def open(cls, storage, name):
        """Convenience method to open a hash file given a
        :class:`whoosh.filedb.filestore.Storage` object and a name. This takes
        care of opening the file and passing its length to the initializer.
        """
        length = storage.file_length(name)
        dbfile = storage.open_file(name)
        return cls(dbfile, length)

    def file(self):
        # Returns the underlying file object
        return self.dbfile

    def _read_extras(self):
        # Unpickles the extras dict written by HashWriter._write_extras();
        # subclasses override this to read additional data after the pickle
        try:
            self.extras = self.dbfile.read_pickle()
        except EOFError:
            self.extras = {}

    def close(self):
        """Closes the underlying file. Raises an exception if the reader was
        already closed.
        """
        if self.is_closed:
            raise Exception("Tried to close %r twice" % self)
        self.dbfile.close()
        self.is_closed = True

    def key_at(self, pos):
        # Returns the key bytes at the given position
        dbfile = self.dbfile
        keylen = dbfile.get_uint(pos)
        return dbfile.get(pos + _lengths.size, keylen)

    def key_and_range_at(self, pos):
        # Returns a (keybytes, datapos, datalen) tuple for the key at the given
        # position, or None if pos is at or past the end of the data
        dbfile = self.dbfile
        lenssize = _lengths.size

        if pos >= self.endofdata:
            return None

        keylen, datalen = _lengths.unpack(dbfile.get(pos, lenssize))
        keybytes = dbfile.get(pos + lenssize, keylen)
        datapos = pos + lenssize + keylen
        return keybytes, datapos, datalen

    def _ranges(self, pos=None, eod=None):
        # Yields a series of (keypos, keylength, datapos, datalength) tuples
        # for the key/value pairs in the file, walking them serially from
        # pos (default: start of data) to eod (default: end of data)
        dbfile = self.dbfile
        pos = pos or self.startofdata
        eod = eod or self.endofdata
        lenssize = _lengths.size
        unpacklens = _lengths.unpack

        while pos < eod:
            keylen, datalen = unpacklens(dbfile.get(pos, lenssize))
            keypos = pos + lenssize
            datapos = keypos + keylen
            yield (keypos, keylen, datapos, datalen)
            # Advance past this pair's value to the next pair
            pos = datapos + datalen

    def __getitem__(self, key):
        # Returns the first value stored under the key; raises KeyError if
        # the key is not in the file
        for value in self.all(key):
            return value
        raise KeyError(key)

    def __iter__(self):
        # Yields (key, value) byte-string pairs in on-disk order
        dbfile = self.dbfile
        for keypos, keylen, datapos, datalen in self._ranges():
            key = dbfile.get(keypos, keylen)
            value = dbfile.get(datapos, datalen)
            yield (key, value)

    def __contains__(self, key):
        # True if at least one value is stored under the key
        for _ in self.ranges_for_key(key):
            return True
        return False

    def keys(self):
        # Yields the keys in on-disk order (duplicates included)
        dbfile = self.dbfile
        for keypos, keylen, _, _ in self._ranges():
            yield dbfile.get(keypos, keylen)

    def values(self):
        # Yields the values in on-disk order
        dbfile = self.dbfile
        for _, _, datapos, datalen in self._ranges():
            yield dbfile.get(datapos, datalen)

    def items(self):
        # Yields (key, value) pairs in on-disk order
        dbfile = self.dbfile
        for keypos, keylen, datapos, datalen in self._ranges():
            yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen))

    def get(self, key, default=None):
        # Returns the first value stored under the key, or default if the
        # key is not in the file
        for value in self.all(key):
            return value
        return default

    def all(self, key):
        """Yields a sequence of values associated with the given key.
        """
        dbfile = self.dbfile
        for datapos, datalen in self.ranges_for_key(key):
            yield dbfile.get(datapos, datalen)

    def ranges_for_key(self, key):
        """Yields a sequence of ``(datapos, datalength)`` tuples associated
        with the given key.

        :param key: the key to look up, as a bytes object.
        :raises TypeError: if ``key`` is not bytes.
        """
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)
        dbfile = self.dbfile

        # Hash the key
        keyhash = self.hashfn(key)
        # Get the position and number of slots for the hash table in which the
        # key may be found
        tablestart, numslots = self.tables[keyhash & 255]
        # If the hash table is empty, we know the key doesn't exists
        if not numslots:
            return

        ptrsize = _pointer.size
        unpackptr = _pointer.unpack
        lenssize = _lengths.size
        unpacklens = _lengths.unpack

        # Calculate where the key's slot should be
        slotpos = tablestart + (((keyhash >> 8) % numslots) * ptrsize)
        # Read slots looking for our key's hash value (linear probing,
        # mirroring HashWriter._write_hashes)
        for _ in xrange(numslots):
            slothash, itempos = unpackptr(dbfile.get(slotpos, ptrsize))
            # If this slot is empty, we're done
            if not itempos:
                return

            # If the key hash in this slot matches our key's hash, we might have
            # a match, so read the actual key and see if it's our key
            if slothash == keyhash:
                # Read the key and value lengths
                keylen, datalen = unpacklens(dbfile.get(itempos, lenssize))
                # Only bother reading the actual key if the lengths match
                if keylen == len(key):
                    keystart = itempos + lenssize
                    if key == dbfile.get(keystart, keylen):
                        # The keys match, so yield (datapos, datalen)
                        yield (keystart + keylen, datalen)

            slotpos += ptrsize
            # If we reach the end of the hashtable, wrap around
            if slotpos == tablestart + (numslots * ptrsize):
                slotpos = tablestart

    def range_for_key(self, key):
        # Returns the first (datapos, datalen) range stored under the key;
        # raises KeyError if the key is not in the file
        for item in self.ranges_for_key(key):
            return item
        raise KeyError(key)
- # Ordered hash file
class OrderedHashWriter(HashWriter):
    """Implements an on-disk hash, but requires that keys be added in order.
    An :class:`OrderedHashReader` can then look up "nearest keys" based on
    the ordering.
    """

    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # File positions of every key, recorded in insertion order
        self.index = GrowableArray("H")
        # The most recently added key, used to enforce the ordering
        self.lastkey = emptybytes

    def add(self, key, value):
        """Adds a key/value pair, recording the key's position. The key must
        sort strictly after every previously added key.
        """
        prev = self.lastkey
        if key <= prev:
            raise ValueError("Keys must increase: %r..%r"
                             % (prev, key))
        self.index.append(self.dbfile.tell())
        HashWriter.add(self, key, value)
        self.lastkey = key

    def _write_extras(self):
        # Record the key-position array's typecode and length in the extras
        # dict, write the pickled extras, then write the array itself
        dbfile = self.dbfile
        positions = self.index

        self.extras["indextype"] = positions.typecode
        self.extras["indexlen"] = len(positions)

        HashWriter._write_extras(self)
        positions.to_file(dbfile)
class OrderedHashReader(HashReader):
    """Reader for the ordered hash files created by
    :class:`OrderedHashWriter`, adding "nearest key" lookups.
    """

    def closest_key(self, key):
        """Returns the closest key equal to or greater than the given key. If
        there is no key in the file equal to or greater than the given key,
        returns None.
        """
        keypos = self.closest_key_pos(key)
        return None if keypos is None else self.key_at(keypos)

    def ranges_from(self, key):
        """Yields a series of ``(keypos, keylen, datapos, datalen)`` tuples
        for the ordered series of keys equal or greater than the given key.
        """
        startpos = self.closest_key_pos(key)
        if startpos is not None:
            for rng in self._ranges(pos=startpos):
                yield rng

    def keys_from(self, key):
        """Yields an ordered series of keys equal to or greater than the given
        key.
        """
        get = self.dbfile.get
        for keypos, keylen, _, _ in self.ranges_from(key):
            yield get(keypos, keylen)

    def items_from(self, key):
        """Yields an ordered series of ``(key, value)`` tuples for keys equal
        to or greater than the given key.
        """
        get = self.dbfile.get
        for keypos, keylen, datapos, datalen in self.ranges_from(key):
            yield (get(keypos, keylen), get(datapos, datalen))

    def _read_extras(self):
        # Read the pickled extras dict, then set up to read the key-position
        # index array that follows it in the file
        dbfile = self.dbfile
        HashReader._read_extras(self)

        indextype = self.extras["indextype"]
        self.indexbase = dbfile.tell()
        self.indexlen = self.extras["indexlen"]
        self.indexsize = struct.calcsize(indextype)

        # Pick the reader function matching the index array's typecode
        readers = {
            "B": dbfile.get_byte,
            "H": dbfile.get_ushort,
            "i": dbfile.get_int,
            "I": dbfile.get_uint,
            "q": dbfile.get_long,
        }
        if indextype not in readers:
            raise Exception("Unknown index type %r" % indextype)
        self._get_pos = readers[indextype]

    def closest_key_pos(self, key):
        """Returns the file position of the given key, or of the next highest
        key if the given key does not exist. Returns None if every key in the
        file sorts before the given key.
        """
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)

        base = self.indexbase
        itemsize = self.indexsize
        key_at = self.key_at
        read_pos = self._get_pos

        # Binary search over the sorted key positions in the index array
        left, right = 0, self.indexlen
        while left < right:
            middle = (left + right) // 2
            if key_at(read_pos(base + middle * itemsize)) < key:
                left = middle + 1
            else:
                right = middle

        # If we went off the end, there is no key >= the given key
        if left == self.indexlen:
            return None
        # Return the position of the closest key
        return read_pos(base + left * itemsize)
- # Fielded Ordered hash file
class FieldedOrderedHashWriter(HashWriter):
    """Implements an on-disk hash, but writes separate position indexes for
    each field.
    """

    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # Map field names to (startpos, indexpos, length, typecode)
        self.fieldmap = self.extras["fieldmap"] = {}
        # The most recently added key, used to enforce per-field ordering
        self.lastkey = emptybytes

    def start_field(self, fieldname):
        """Begins writing the ordered keys for ``fieldname``."""
        self.fieldstart = self.dbfile.tell()
        self.fieldname = fieldname
        # Offsets of this field's keys, relative to the field's start
        self.poses = GrowableArray("H")
        self.lastkey = emptybytes

    def add(self, key, value):
        """Adds a key/value pair to the current field. The key must sort
        strictly after every previously added key in this field.
        """
        prev = self.lastkey
        if key <= prev:
            raise ValueError("Keys must increase: %r..%r"
                             % (prev, key))
        self.poses.append(self.dbfile.tell() - self.fieldstart)
        HashWriter.add(self, key, value)
        self.lastkey = key

    def end_field(self):
        """Finishes the current field: records its fieldmap entry and writes
        out the accumulated key-offset array.
        """
        dbfile = self.dbfile
        offsets = self.poses
        self.fieldmap[self.fieldname] = (self.fieldstart, dbfile.tell(),
                                         len(offsets), offsets.typecode)
        offsets.to_file(dbfile)
class FieldedOrderedHashReader(HashReader):
    """Reader for the per-field ordered hash files created by
    :class:`FieldedOrderedHashWriter`.
    """

    def __init__(self, *args, **kwargs):
        HashReader.__init__(self, *args, **kwargs)
        # Map of field names to (startpos, indexpos, length, typecode),
        # written by FieldedOrderedHashWriter.end_field
        self.fieldmap = self.extras["fieldmap"]
        # Make a sorted list of the field names with their start and end ranges
        self.fieldlist = []
        for fieldname in sorted(self.fieldmap.keys()):
            startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname]
            self.fieldlist.append((fieldname, startpos, ixpos))

    def field_start(self, fieldname):
        """Returns the file position of the first key in ``fieldname``."""
        return self.fieldmap[fieldname][0]

    def fielded_ranges(self, pos=None, eod=None):
        """Yields ``(fieldname, keypos, keylen, datapos, datalen)`` tuples for
        the key/value pairs in the file, tagging each pair with the field
        whose range contains it.
        """
        flist = self.fieldlist
        fpos = 0
        fieldname, start, end = flist[fpos]
        for keypos, keylen, datapos, datalen in self._ranges(pos, eod):
            # Advance to the next field once we pass the end of this one
            if keypos >= end:
                fpos += 1
                fieldname, start, end = flist[fpos]
            yield fieldname, keypos, keylen, datapos, datalen

    def iter_terms(self):
        """Yields ``(fieldname, keybytes)`` pairs in on-disk order."""
        get = self.dbfile.get
        for fieldname, keypos, keylen, _, _ in self.fielded_ranges():
            yield fieldname, get(keypos, keylen)

    def iter_term_items(self):
        """Yields ``(fieldname, keybytes, valuebytes)`` tuples in on-disk
        order.
        """
        get = self.dbfile.get
        for item in self.fielded_ranges():
            fieldname, keypos, keylen, datapos, datalen = item
            yield fieldname, get(keypos, keylen), get(datapos, datalen)

    def contains_term(self, fieldname, btext):
        """Returns True if ``btext`` exists as a key in ``fieldname``."""
        try:
            # We only care whether the lookup succeeds, not its result
            self.range_for_term(fieldname, btext)
            return True
        except KeyError:
            return False

    def range_for_term(self, fieldname, btext):
        """Returns the ``(datapos, datalen)`` range of the value stored under
        ``btext`` in ``fieldname``. Raises :exc:`KeyError` if the term is not
        in that field.
        """
        start, ixpos, ixsize, code = self.fieldmap[fieldname]
        # The same key bytes may appear in several fields; pick the match
        # whose data falls inside this field's range
        for datapos, datalen in self.ranges_for_key(btext):
            if start < datapos < ixpos:
                return datapos, datalen
        raise KeyError((fieldname, btext))

    def term_data(self, fieldname, btext):
        """Returns the value bytes stored under ``btext`` in ``fieldname``.
        Raises :exc:`KeyError` if the term is not in that field.
        """
        datapos, datalen = self.range_for_term(fieldname, btext)
        return self.dbfile.get(datapos, datalen)

    def term_get(self, fieldname, btext, default=None):
        """Returns the value bytes stored under ``btext`` in ``fieldname``,
        or ``default`` if the term is not in that field.
        """
        try:
            return self.term_data(fieldname, btext)
        except KeyError:
            return default

    def closest_term_pos(self, fieldname, key):
        """Returns the file position of the given key in ``fieldname``, or of
        the next highest key in that field if the given key does not exist.
        Returns None if every key in the field sorts before the given key.
        """
        # Given a key, return the position of that key OR the next highest key
        # if the given key does not exist
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)

        dbfile = self.dbfile
        key_at = self.key_at
        startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname]

        # Pick the reader function matching the index array's typecode
        if ixtype == "B":
            get_pos = dbfile.get_byte
        elif ixtype == "H":
            get_pos = dbfile.get_ushort
        elif ixtype == "i":
            get_pos = dbfile.get_int
        elif ixtype == "I":
            get_pos = dbfile.get_uint
        elif ixtype == "q":
            get_pos = dbfile.get_long
        else:
            raise Exception("Unknown index type %r" % ixtype)

        # BUG FIX: ixsize is the NUMBER of entries in the index array (the
        # writer stores len(poses) in the fieldmap), but the previous code
        # also used it as the byte stride between entries. The stride must be
        # the byte size of one typecode item, as in
        # OrderedHashReader.closest_key_pos.
        itemsize = struct.calcsize(ixtype)

        # Do a binary search of the positions in the index array
        lo = 0
        hi = ixsize
        while lo < hi:
            mid = (lo + hi) // 2
            midkey = key_at(startpos + get_pos(ixpos + mid * itemsize))
            if midkey < key:
                lo = mid + 1
            else:
                hi = mid

        # If we went off the end, return None
        if lo == ixsize:
            return None
        # Return the position of the closest key
        return startpos + get_pos(ixpos + lo * itemsize)

    def closest_term(self, fieldname, btext):
        """Returns the closest key in ``fieldname`` equal to or greater than
        ``btext``, or None if there is no such key.
        """
        pos = self.closest_term_pos(fieldname, btext)
        if pos is None:
            return None
        return self.key_at(pos)

    def term_ranges_from(self, fieldname, btext):
        """Yields ``(keypos, keylen, datapos, datalen)`` tuples for the keys
        in ``fieldname`` equal to or greater than ``btext``.
        """
        pos = self.closest_term_pos(fieldname, btext)
        if pos is None:
            return

        startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname]
        # Stop at ixpos, the end of this field's key/value data
        for item in self._ranges(pos, ixpos):
            yield item

    def terms_from(self, fieldname, btext):
        """Yields the keys in ``fieldname`` equal to or greater than
        ``btext``.
        """
        dbfile = self.dbfile
        for keypos, keylen, _, _ in self.term_ranges_from(fieldname, btext):
            yield dbfile.get(keypos, keylen)

    def term_items_from(self, fieldname, btext):
        """Yields ``(key, value)`` tuples for the keys in ``fieldname`` equal
        to or greater than ``btext``.
        """
        dbfile = self.dbfile
        for item in self.term_ranges_from(fieldname, btext):
            keypos, keylen, datapos, datalen = item
            yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen))
|