filetables.py 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735
  1. # Copyright 2009 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. """This module defines writer and reader classes for a fast, immutable
  28. on-disk key-value database format. The current format is based heavily on
  29. D. J. Bernstein's CDB format (http://cr.yp.to/cdb.html).
  30. """
  31. import os, struct
  32. from binascii import crc32
  33. from bisect import bisect_left
  34. from hashlib import md5 # @UnresolvedImport
  35. from whoosh.compat import b, bytes_type
  36. from whoosh.compat import xrange
  37. from whoosh.util.numlists import GrowableArray
  38. from whoosh.system import _INT_SIZE, emptybytes
  39. # Exceptions
  40. class FileFormatError(Exception):
  41. pass
  42. # Hash functions
  43. def cdb_hash(key):
  44. h = 5381
  45. for c in key:
  46. h = (h + (h << 5)) & 0xffffffff ^ ord(c)
  47. return h
  48. def md5_hash(key):
  49. return int(md5(key).hexdigest(), 16) & 0xffffffff
  50. def crc_hash(key):
  51. return crc32(key) & 0xffffffff
  52. _hash_functions = (md5_hash, crc_hash, cdb_hash)
  53. # Structs
  54. # Two uints before the key/value pair giving the length of the key and value
  55. _lengths = struct.Struct("!ii")
  56. # A pointer in a hash table, giving the hash value and the key position
  57. _pointer = struct.Struct("!Iq")
  58. # A pointer in the hash table directory, giving the position and number of slots
  59. _dir_entry = struct.Struct("!qi")
  60. _directory_size = 256 * _dir_entry.size
  61. # Basic hash file
  62. class HashWriter(object):
  63. """Implements a fast on-disk key-value store. This hash uses a two-level
  64. hashing scheme, where a key is hashed, the low eight bits of the hash value
  65. are used to index into one of 256 hash tables. This is basically the CDB
  66. algorithm, but unlike CDB this object writes all data serially (it doesn't
  67. seek backwards to overwrite information at the end).
  68. Also unlike CDB, this format uses 64-bit file pointers, so the file length
  69. is essentially unlimited. However, each key and value must be less than
  70. 2 GB in length.
  71. """
  72. def __init__(self, dbfile, magic=b("HSH3"), hashtype=0):
  73. """
  74. :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
  75. to write to.
  76. :param magic: the format tag bytes to write at the start of the file.
  77. :param hashtype: an integer indicating which hashing algorithm to use.
  78. Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash).
  79. """
  80. self.dbfile = dbfile
  81. self.hashtype = hashtype
  82. self.hashfn = _hash_functions[self.hashtype]
  83. # A place for subclasses to put extra metadata
  84. self.extras = {}
  85. self.startoffset = dbfile.tell()
  86. # Write format tag
  87. dbfile.write(magic)
  88. # Write hash type
  89. dbfile.write_byte(self.hashtype)
  90. # Unused future expansion bits
  91. dbfile.write_int(0)
  92. dbfile.write_int(0)
  93. # 256 lists of hashed keys and positions
  94. self.buckets = [[] for _ in xrange(256)]
  95. # List to remember the positions of the hash tables
  96. self.directory = []
  97. def tell(self):
  98. return self.dbfile.tell()
  99. def add(self, key, value):
  100. """Adds a key/value pair to the file. Note that keys DO NOT need to be
  101. unique. You can store multiple values under the same key and retrieve
  102. them using :meth:`HashReader.all`.
  103. """
  104. assert isinstance(key, bytes_type)
  105. assert isinstance(value, bytes_type)
  106. dbfile = self.dbfile
  107. pos = dbfile.tell()
  108. dbfile.write(_lengths.pack(len(key), len(value)))
  109. dbfile.write(key)
  110. dbfile.write(value)
  111. # Get hash value for the key
  112. h = self.hashfn(key)
  113. # Add hash and on-disk position to appropriate bucket
  114. self.buckets[h & 255].append((h, pos))
  115. def add_all(self, items):
  116. """Convenience method to add a sequence of ``(key, value)`` pairs. This
  117. is the same as calling :meth:`HashWriter.add` on each pair in the
  118. sequence.
  119. """
  120. add = self.add
  121. for key, value in items:
  122. add(key, value)
  123. def _write_hashes(self):
  124. # Writes 256 hash tables containing pointers to the key/value pairs
  125. dbfile = self.dbfile
  126. # Represent and empty slot in the hash table using 0,0 (no key can
  127. # start at position 0 because of the header)
  128. null = (0, 0)
  129. for entries in self.buckets:
  130. # Start position of this bucket's hash table
  131. pos = dbfile.tell()
  132. # Remember the start position and the number of slots
  133. numslots = 2 * len(entries)
  134. self.directory.append((pos, numslots))
  135. # Create the empty hash table
  136. hashtable = [null] * numslots
  137. # For each (hash value, key position) tuple in the bucket
  138. for hashval, position in entries:
  139. # Bitshift and wrap to get the slot for this entry
  140. slot = (hashval >> 8) % numslots
  141. # If the slot is taken, keep going until we find an empty slot
  142. while hashtable[slot] != null:
  143. slot = (slot + 1) % numslots
  144. # Insert the entry into the hashtable
  145. hashtable[slot] = (hashval, position)
  146. # Write the hash table for this bucket to disk
  147. for hashval, position in hashtable:
  148. dbfile.write(_pointer.pack(hashval, position))
  149. def _write_directory(self):
  150. # Writes a directory of pointers to the 256 hash tables
  151. dbfile = self.dbfile
  152. for position, numslots in self.directory:
  153. dbfile.write(_dir_entry.pack(position, numslots))
  154. def _write_extras(self):
  155. self.dbfile.write_pickle(self.extras)
  156. def close(self):
  157. dbfile = self.dbfile
  158. # Write hash tables
  159. self._write_hashes()
  160. # Write directory of pointers to hash tables
  161. self._write_directory()
  162. expos = dbfile.tell()
  163. # Write extra information
  164. self._write_extras()
  165. # Write length of pickle
  166. dbfile.write_int(dbfile.tell() - expos)
  167. endpos = dbfile.tell()
  168. dbfile.close()
  169. return endpos
class HashReader(object):
    """Reader for the fast on-disk key-value files created by
    :class:`HashWriter`.
    """

    def __init__(self, dbfile, length=None, magic=b("HSH3"), startoffset=0):
        """
        :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
            to read from.
        :param length: the length of the file data. This is necessary since the
            hashing information is written at the end of the file.
        :param magic: the format tag bytes to look for at the start of the
            file. If the file's format tag does not match these bytes, the
            object raises a :class:`FileFormatError` exception.
        :param startoffset: the starting point of the file data.
        """
        self.dbfile = dbfile
        self.startoffset = startoffset
        self.is_closed = False

        if length is None:
            # No length given; measure from the start offset to the end of
            # the file
            dbfile.seek(0, os.SEEK_END)
            length = dbfile.tell() - startoffset

        dbfile.seek(startoffset)
        # Check format tag
        filemagic = dbfile.read(4)
        if filemagic != magic:
            raise FileFormatError("Unknown file header %r" % filemagic)
        # Read hash type byte and look up the corresponding hash function
        self.hashtype = dbfile.read_byte()
        self.hashfn = _hash_functions[self.hashtype]
        # Skip unused future expansion bits
        dbfile.read_int()
        dbfile.read_int()
        self.startofdata = dbfile.tell()

        # The last int in the file is the length of the pickled extras
        exptr = startoffset + length - _INT_SIZE
        exlen = dbfile.get_int(exptr)
        # The extras sit immediately before that length int
        expos = exptr - exlen
        dbfile.seek(expos)
        self._read_extras()

        # The directory of hash tables sits immediately before the extras
        dbfile.seek(expos - _directory_size)
        # Read the 256 (position, numslots) directory entries
        self.tables = []
        entrysize = _dir_entry.size
        unpackentry = _dir_entry.unpack
        for _ in xrange(256):
            # position, numslots
            self.tables.append(unpackentry(dbfile.read(entrysize)))
        # The position of the first hash table is the end of the key/value
        # pairs
        self.endofdata = self.tables[0][0]

    @classmethod
    def open(cls, storage, name):
        """Convenience method to open a hash file given a
        :class:`whoosh.filedb.filestore.Storage` object and a name. This takes
        care of opening the file and passing its length to the initializer.
        """
        length = storage.file_length(name)
        dbfile = storage.open_file(name)
        return cls(dbfile, length)

    def file(self):
        # Returns the underlying file object
        return self.dbfile

    def _read_extras(self):
        # Reads the pickled extras dict; subclasses override this to read
        # additional trailing data after calling up to here
        try:
            self.extras = self.dbfile.read_pickle()
        except EOFError:
            # Treat a missing/truncated pickle as "no extras"
            self.extras = {}

    def close(self):
        # Closes the underlying file; calling twice is an error
        if self.is_closed:
            raise Exception("Tried to close %r twice" % self)
        self.dbfile.close()
        self.is_closed = True

    def key_at(self, pos):
        # Returns the key bytes at the given position
        dbfile = self.dbfile
        keylen = dbfile.get_uint(pos)
        return dbfile.get(pos + _lengths.size, keylen)

    def key_and_range_at(self, pos):
        # Returns a (keybytes, datapos, datalen) tuple for the key at the given
        # position, or None if the position is past the end of the data
        dbfile = self.dbfile
        lenssize = _lengths.size

        if pos >= self.endofdata:
            return None

        keylen, datalen = _lengths.unpack(dbfile.get(pos, lenssize))
        keybytes = dbfile.get(pos + lenssize, keylen)
        datapos = pos + lenssize + keylen
        return keybytes, datapos, datalen

    def _ranges(self, pos=None, eod=None):
        # Yields a series of (keypos, keylength, datapos, datalength) tuples
        # for the key/value pairs in the file, walking record to record
        dbfile = self.dbfile
        pos = pos or self.startofdata
        eod = eod or self.endofdata
        lenssize = _lengths.size
        unpacklens = _lengths.unpack

        while pos < eod:
            keylen, datalen = unpacklens(dbfile.get(pos, lenssize))
            keypos = pos + lenssize
            datapos = keypos + keylen
            yield (keypos, keylen, datapos, datalen)
            # Next record starts right after this record's value
            pos = datapos + datalen

    def __getitem__(self, key):
        # Returns the first value stored under the given key
        for value in self.all(key):
            return value
        raise KeyError(key)

    def __iter__(self):
        # Yields (key, value) pairs in file order
        dbfile = self.dbfile
        for keypos, keylen, datapos, datalen in self._ranges():
            key = dbfile.get(keypos, keylen)
            value = dbfile.get(datapos, datalen)
            yield (key, value)

    def __contains__(self, key):
        for _ in self.ranges_for_key(key):
            return True
        return False

    def keys(self):
        # Yields all keys in file order (duplicates included)
        dbfile = self.dbfile
        for keypos, keylen, _, _ in self._ranges():
            yield dbfile.get(keypos, keylen)

    def values(self):
        # Yields all values in file order
        dbfile = self.dbfile
        for _, _, datapos, datalen in self._ranges():
            yield dbfile.get(datapos, datalen)

    def items(self):
        # Yields (key, value) pairs in file order
        dbfile = self.dbfile
        for keypos, keylen, datapos, datalen in self._ranges():
            yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen))

    def get(self, key, default=None):
        # Returns the first value stored under the given key, or default if
        # the key is not present
        for value in self.all(key):
            return value
        return default

    def all(self, key):
        """Yields a sequence of values associated with the given key.
        """
        dbfile = self.dbfile
        for datapos, datalen in self.ranges_for_key(key):
            yield dbfile.get(datapos, datalen)

    def ranges_for_key(self, key):
        """Yields a sequence of ``(datapos, datalength)`` tuples associated
        with the given key.
        """
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)
        dbfile = self.dbfile

        # Hash the key
        keyhash = self.hashfn(key)
        # Get the position and number of slots for the hash table in which the
        # key may be found (low 8 bits of the hash select the table)
        tablestart, numslots = self.tables[keyhash & 255]
        # If the hash table is empty, we know the key doesn't exist
        if not numslots:
            return

        ptrsize = _pointer.size
        unpackptr = _pointer.unpack
        lenssize = _lengths.size
        unpacklens = _lengths.unpack

        # Calculate where the key's slot should be
        slotpos = tablestart + (((keyhash >> 8) % numslots) * ptrsize)
        # Read slots looking for our key's hash value (linear probing)
        for _ in xrange(numslots):
            slothash, itempos = unpackptr(dbfile.get(slotpos, ptrsize))
            # If this slot is empty, we're done
            if not itempos:
                return

            # If the key hash in this slot matches our key's hash, we might
            # have a match, so read the actual key and see if it's our key
            if slothash == keyhash:
                # Read the key and value lengths
                keylen, datalen = unpacklens(dbfile.get(itempos, lenssize))
                # Only bother reading the actual key if the lengths match
                if keylen == len(key):
                    keystart = itempos + lenssize
                    if key == dbfile.get(keystart, keylen):
                        # The keys match, so yield (datapos, datalen)
                        yield (keystart + keylen, datalen)

            slotpos += ptrsize
            # If we reach the end of the hashtable, wrap around
            if slotpos == tablestart + (numslots * ptrsize):
                slotpos = tablestart

    def range_for_key(self, key):
        # Returns the first (datapos, datalen) range for the given key, or
        # raises KeyError if the key is not in the file
        for item in self.ranges_for_key(key):
            return item
        raise KeyError(key)
  354. # Ordered hash file
  355. class OrderedHashWriter(HashWriter):
  356. """Implements an on-disk hash, but requires that keys be added in order.
  357. An :class:`OrderedHashReader` can then look up "nearest keys" based on
  358. the ordering.
  359. """
  360. def __init__(self, dbfile):
  361. HashWriter.__init__(self, dbfile)
  362. # Keep an array of the positions of all keys
  363. self.index = GrowableArray("H")
  364. # Keep track of the last key added
  365. self.lastkey = emptybytes
  366. def add(self, key, value):
  367. if key <= self.lastkey:
  368. raise ValueError("Keys must increase: %r..%r"
  369. % (self.lastkey, key))
  370. self.index.append(self.dbfile.tell())
  371. HashWriter.add(self, key, value)
  372. self.lastkey = key
  373. def _write_extras(self):
  374. dbfile = self.dbfile
  375. index = self.index
  376. # Store metadata about the index array
  377. self.extras["indextype"] = index.typecode
  378. self.extras["indexlen"] = len(index)
  379. # Write the extras
  380. HashWriter._write_extras(self)
  381. # Write the index array
  382. index.to_file(dbfile)
  383. class OrderedHashReader(HashReader):
  384. def closest_key(self, key):
  385. """Returns the closest key equal to or greater than the given key. If
  386. there is no key in the file equal to or greater than the given key,
  387. returns None.
  388. """
  389. pos = self.closest_key_pos(key)
  390. if pos is None:
  391. return None
  392. return self.key_at(pos)
  393. def ranges_from(self, key):
  394. """Yields a series of ``(keypos, keylen, datapos, datalen)`` tuples
  395. for the ordered series of keys equal or greater than the given key.
  396. """
  397. pos = self.closest_key_pos(key)
  398. if pos is None:
  399. return
  400. for item in self._ranges(pos=pos):
  401. yield item
  402. def keys_from(self, key):
  403. """Yields an ordered series of keys equal to or greater than the given
  404. key.
  405. """
  406. dbfile = self.dbfile
  407. for keypos, keylen, _, _ in self.ranges_from(key):
  408. yield dbfile.get(keypos, keylen)
  409. def items_from(self, key):
  410. """Yields an ordered series of ``(key, value)`` tuples for keys equal
  411. to or greater than the given key.
  412. """
  413. dbfile = self.dbfile
  414. for keypos, keylen, datapos, datalen in self.ranges_from(key):
  415. yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen))
  416. def _read_extras(self):
  417. dbfile = self.dbfile
  418. # Read the extras
  419. HashReader._read_extras(self)
  420. # Set up for reading the index array
  421. indextype = self.extras["indextype"]
  422. self.indexbase = dbfile.tell()
  423. self.indexlen = self.extras["indexlen"]
  424. self.indexsize = struct.calcsize(indextype)
  425. # Set up the function to read values from the index array
  426. if indextype == "B":
  427. self._get_pos = dbfile.get_byte
  428. elif indextype == "H":
  429. self._get_pos = dbfile.get_ushort
  430. elif indextype == "i":
  431. self._get_pos = dbfile.get_int
  432. elif indextype == "I":
  433. self._get_pos = dbfile.get_uint
  434. elif indextype == "q":
  435. self._get_pos = dbfile.get_long
  436. else:
  437. raise Exception("Unknown index type %r" % indextype)
  438. def closest_key_pos(self, key):
  439. # Given a key, return the position of that key OR the next highest key
  440. # if the given key does not exist
  441. if not isinstance(key, bytes_type):
  442. raise TypeError("Key %r should be bytes" % key)
  443. indexbase = self.indexbase
  444. indexsize = self.indexsize
  445. key_at = self.key_at
  446. _get_pos = self._get_pos
  447. # Do a binary search of the positions in the index array
  448. lo = 0
  449. hi = self.indexlen
  450. while lo < hi:
  451. mid = (lo + hi) // 2
  452. midkey = key_at(_get_pos(indexbase + mid * indexsize))
  453. if midkey < key:
  454. lo = mid + 1
  455. else:
  456. hi = mid
  457. # If we went off the end, return None
  458. if lo == self.indexlen:
  459. return None
  460. # Return the closest key
  461. return _get_pos(indexbase + lo * indexsize)
  462. # Fielded Ordered hash file
  463. class FieldedOrderedHashWriter(HashWriter):
  464. """Implements an on-disk hash, but writes separate position indexes for
  465. each field.
  466. """
  467. def __init__(self, dbfile):
  468. HashWriter.__init__(self, dbfile)
  469. # Map field names to (startpos, indexpos, length, typecode)
  470. self.fieldmap = self.extras["fieldmap"] = {}
  471. # Keep track of the last key added
  472. self.lastkey = emptybytes
  473. def start_field(self, fieldname):
  474. self.fieldstart = self.dbfile.tell()
  475. self.fieldname = fieldname
  476. # Keep an array of the positions of all keys
  477. self.poses = GrowableArray("H")
  478. self.lastkey = emptybytes
  479. def add(self, key, value):
  480. if key <= self.lastkey:
  481. raise ValueError("Keys must increase: %r..%r"
  482. % (self.lastkey, key))
  483. self.poses.append(self.dbfile.tell() - self.fieldstart)
  484. HashWriter.add(self, key, value)
  485. self.lastkey = key
  486. def end_field(self):
  487. dbfile = self.dbfile
  488. fieldname = self.fieldname
  489. poses = self.poses
  490. self.fieldmap[fieldname] = (self.fieldstart, dbfile.tell(), len(poses),
  491. poses.typecode)
  492. poses.to_file(dbfile)
  493. class FieldedOrderedHashReader(HashReader):
  494. def __init__(self, *args, **kwargs):
  495. HashReader.__init__(self, *args, **kwargs)
  496. self.fieldmap = self.extras["fieldmap"]
  497. # Make a sorted list of the field names with their start and end ranges
  498. self.fieldlist = []
  499. for fieldname in sorted(self.fieldmap.keys()):
  500. startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname]
  501. self.fieldlist.append((fieldname, startpos, ixpos))
  502. def field_start(self, fieldname):
  503. return self.fieldmap[fieldname][0]
  504. def fielded_ranges(self, pos=None, eod=None):
  505. flist = self.fieldlist
  506. fpos = 0
  507. fieldname, start, end = flist[fpos]
  508. for keypos, keylen, datapos, datalen in self._ranges(pos, eod):
  509. if keypos >= end:
  510. fpos += 1
  511. fieldname, start, end = flist[fpos]
  512. yield fieldname, keypos, keylen, datapos, datalen
  513. def iter_terms(self):
  514. get = self.dbfile.get
  515. for fieldname, keypos, keylen, _, _ in self.fielded_ranges():
  516. yield fieldname, get(keypos, keylen)
  517. def iter_term_items(self):
  518. get = self.dbfile.get
  519. for item in self.fielded_ranges():
  520. fieldname, keypos, keylen, datapos, datalen = item
  521. yield fieldname, get(keypos, keylen), get(datapos, datalen)
  522. def contains_term(self, fieldname, btext):
  523. try:
  524. x = self.range_for_term(fieldname, btext)
  525. return True
  526. except KeyError:
  527. return False
  528. def range_for_term(self, fieldname, btext):
  529. start, ixpos, ixsize, code = self.fieldmap[fieldname]
  530. for datapos, datalen in self.ranges_for_key(btext):
  531. if start < datapos < ixpos:
  532. return datapos, datalen
  533. raise KeyError((fieldname, btext))
  534. def term_data(self, fieldname, btext):
  535. datapos, datalen = self.range_for_term(fieldname, btext)
  536. return self.dbfile.get(datapos, datalen)
  537. def term_get(self, fieldname, btext, default=None):
  538. try:
  539. return self.term_data(fieldname, btext)
  540. except KeyError:
  541. return default
  542. def closest_term_pos(self, fieldname, key):
  543. # Given a key, return the position of that key OR the next highest key
  544. # if the given key does not exist
  545. if not isinstance(key, bytes_type):
  546. raise TypeError("Key %r should be bytes" % key)
  547. dbfile = self.dbfile
  548. key_at = self.key_at
  549. startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname]
  550. if ixtype == "B":
  551. get_pos = dbfile.get_byte
  552. elif ixtype == "H":
  553. get_pos = dbfile.get_ushort
  554. elif ixtype == "i":
  555. get_pos = dbfile.get_int
  556. elif ixtype == "I":
  557. get_pos = dbfile.get_uint
  558. elif ixtype == "q":
  559. get_pos = dbfile.get_long
  560. else:
  561. raise Exception("Unknown index type %r" % ixtype)
  562. # Do a binary search of the positions in the index array
  563. lo = 0
  564. hi = ixsize
  565. while lo < hi:
  566. mid = (lo + hi) // 2
  567. midkey = key_at(startpos + get_pos(ixpos + mid * ixsize))
  568. if midkey < key:
  569. lo = mid + 1
  570. else:
  571. hi = mid
  572. # If we went off the end, return None
  573. if lo == ixsize:
  574. return None
  575. # Return the closest key
  576. return startpos + get_pos(ixpos + lo * ixsize)
  577. def closest_term(self, fieldname, btext):
  578. pos = self.closest_term_pos(fieldname, btext)
  579. if pos is None:
  580. return None
  581. return self.key_at(pos)
  582. def term_ranges_from(self, fieldname, btext):
  583. pos = self.closest_term_pos(fieldname, btext)
  584. if pos is None:
  585. return
  586. startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname]
  587. for item in self._ranges(pos, ixpos):
  588. yield item
  589. def terms_from(self, fieldname, btext):
  590. dbfile = self.dbfile
  591. for keypos, keylen, _, _ in self.term_ranges_from(fieldname, btext):
  592. yield dbfile.get(keypos, keylen)
  593. def term_items_from(self, fieldname, btext):
  594. dbfile = self.dbfile
  595. for item in self.term_ranges_from(fieldname, btext):
  596. keypos, keylen, datapos, datalen = item
  597. yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen))