# Copyright 2012 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
  27. """
  28. The API and implementation of columns may change in the next version of Whoosh!
  29. This module contains "Column" objects which you can use as the argument to a
  30. Field object's ``sortable=`` keyword argument. Each field defines a default
  31. column type for when the user specifies ``sortable=True`` (the object returned
  32. by the field's ``default_column()`` method).
  33. The default column type for most fields is ``VarBytesColumn``,
  34. although numeric and date fields use ``NumericColumn``. Expert users may use
  35. other field types that may be faster or more storage efficient based on the
  36. field contents. For example, if a field always contains one of a limited number
  37. of possible values, a ``RefBytesColumn`` will save space by only storing the
  38. values once. If a field's values are always a fixed length, the
  39. ``FixedBytesColumn`` saves space by not storing the length of each value.
  40. A ``Column`` object basically exists to store configuration information and
  41. provides two important methods: ``writer()`` to return a ``ColumnWriter`` object
  42. and ``reader()`` to return a ``ColumnReader`` object.
  43. """
from __future__ import division, with_statement

import struct, warnings
from array import array
from bisect import bisect_right

# zlib is optional: some minimal Python builds omit it, so the compression
# features below test "if zlib" before using it
try:
    import zlib
except ImportError:
    zlib = None

from whoosh.compat import b, bytes_type, BytesIO
from whoosh.compat import array_tobytes, xrange
from whoosh.compat import dumps, loads
from whoosh.filedb.structfile import StructFile
from whoosh.idsets import BitSet, OnDiskBitSet
from whoosh.system import emptybytes
from whoosh.util.cache import lru_cache
from whoosh.util.numeric import typecode_max, typecode_min
from whoosh.util.numlists import GrowableArray
from whoosh.util.varints import varint, read_varint
  62. # Utility functions
  63. def _mintype(maxn):
  64. if maxn < 2 ** 8:
  65. typecode = "B"
  66. elif maxn < 2 ** 16:
  67. typecode = "H"
  68. elif maxn < 2 ** 31:
  69. typecode = "i"
  70. else:
  71. typecode = "I"
  72. return typecode
  73. # Python does not support arrays of long long see Issue 1172711
  74. # These functions help write/read a simulated an array of q/Q using lists
  75. def write_qsafe_array(typecode, arry, dbfile):
  76. if typecode == "q":
  77. for num in arry:
  78. dbfile.write_long(num)
  79. elif typecode == "Q":
  80. for num in arry:
  81. dbfile.write_ulong(num)
  82. else:
  83. dbfile.write_array(arry)
  84. def read_qsafe_array(typecode, size, dbfile):
  85. if typecode == "q":
  86. arry = [dbfile.read_long() for _ in xrange(size)]
  87. elif typecode == "Q":
  88. arry = [dbfile.read_ulong() for _ in xrange(size)]
  89. else:
  90. arry = dbfile.read_array(typecode, size)
  91. return arry
  92. def make_array(typecode, size=0, default=None):
  93. if typecode.lower() == "q":
  94. # Python does not support arrays of long long see Issue 1172711
  95. if default is not None and size:
  96. arry = [default] * size
  97. else:
  98. arry = []
  99. else:
  100. if default is not None and size:
  101. arry = array(typecode, (default for _ in xrange(size)))
  102. else:
  103. arry = array(typecode)
  104. return arry
  105. # Base classes
  106. class Column(object):
  107. """Represents a "column" of rows mapping docnums to document values.
  108. The interface requires that you store the start offset of the column, the
  109. length of the column data, and the number of documents (rows) separately,
  110. and pass them to the reader object.
  111. """
  112. reversible = False
  113. def writer(self, dbfile):
  114. """Returns a :class:`ColumnWriter` object you can use to use to create
  115. a column of this type on disk.
  116. :param dbfile: the :class:`~whoosh.filedb.structfile.StructFile` to
  117. write to.
  118. """
  119. return self.Writer(dbfile)
  120. def reader(self, dbfile, basepos, length, doccount):
  121. """Returns a :class:`ColumnReader` object you can use to read a column
  122. of this type from disk.
  123. :param dbfile: the :class:`~whoosh.filedb.structfile.StructFile` to
  124. read from.
  125. :param basepos: the offset within the file at which the column starts.
  126. :param length: the length in bytes of the column occupies in the file.
  127. :param doccount: the number of rows (documents) in the column.
  128. """
  129. return self.Reader(dbfile, basepos, length, doccount)
  130. def default_value(self, reverse=False):
  131. """Returns the default value for this column type.
  132. """
  133. return self._default
  134. def stores_lists(self):
  135. """Returns True if the column stores a list of values for each document
  136. instead of a single value.
  137. """
  138. return False
  139. class ColumnWriter(object):
  140. def __init__(self, dbfile):
  141. self._dbfile = dbfile
  142. self._count = 0
  143. def fill(self, docnum):
  144. write = self._dbfile.write
  145. default = self._defaultbytes
  146. if docnum > self._count:
  147. for _ in xrange(docnum - self._count):
  148. write(default)
  149. def add(self, docnum, value):
  150. raise NotImplementedError
  151. def finish(self, docnum):
  152. pass
  153. class ColumnReader(object):
  154. def __init__(self, dbfile, basepos, length, doccount):
  155. self._dbfile = dbfile
  156. self._basepos = basepos
  157. self._length = length
  158. self._doccount = doccount
  159. def __len__(self):
  160. return self._doccount
  161. def __getitem__(self, docnum):
  162. raise NotImplementedError
  163. def sort_key(self, docnum):
  164. return self[docnum]
  165. def __iter__(self):
  166. for i in xrange(self._doccount):
  167. yield self[i]
  168. def load(self):
  169. return list(self)
  170. def set_reverse(self):
  171. raise NotImplementedError
# Arbitrary bytes column

class VarBytesColumn(Column):
    """Stores variable length byte strings. See also :class:`RefBytesColumn`.

    The current implementation limits the total length of all document values
    a segment to 2 GB.

    The default value (the value returned for a document that didn't have a
    value assigned to it at indexing time) is an empty bytestring (``b''``).
    """

    _default = emptybytes

    def __init__(self, allow_offsets=True, write_offsets_cutoff=2**15):
        """
        :param allow_offsets: Whether the column should write offsets when
            there are many rows in the column (this makes opening the column
            much faster). This argument is mostly for testing.
        :param write_offsets_cutoff: Write offsets (for speed) when there are
            more than this many rows in the column. This argument is mostly
            for testing.
        """

        self.allow_offsets = allow_offsets
        self.write_offsets_cutoff = write_offsets_cutoff

    def writer(self, dbfile):
        return self.Writer(dbfile, self.allow_offsets,
                           self.write_offsets_cutoff)

    class Writer(ColumnWriter):
        def __init__(self, dbfile, allow_offsets=True, cutoff=2**15):
            assert isinstance(dbfile, StructFile)
            self._dbfile = dbfile
            self._count = 0
            # Parallel arrays of per-document value lengths and start offsets
            self._lengths = GrowableArray(allow_longs=False)
            self._offsets = GrowableArray(allow_longs=False)
            # Running total of bytes written so far (= the next value's offset)
            self._offset_base = 0
            self.allow_offsets = allow_offsets
            self.cutoff = cutoff

        def __repr__(self):
            return "<VarBytes.Writer>"

        def fill(self, docnum):
            # Documents skipped between add() calls get zero-length values
            # that all point at the current end of the data
            base = self._offset_base
            if docnum > self._count:
                self._lengths.extend(0 for _ in xrange(docnum - self._count))
                self._offsets.extend(base for _ in xrange(docnum - self._count))

        def add(self, docnum, v):
            self.fill(docnum)
            self._dbfile.write(v)
            self._lengths.append(len(v))
            self._offsets.append(self._offset_base)
            self._offset_base += len(v)
            self._count = docnum + 1

        def finish(self, doccount):
            dbfile = self._dbfile
            lengths = self._lengths.array
            offsets = self._offsets.array
            # Pad out any trailing documents with empty values
            self.fill(doccount)

            dbfile.write_array(lengths)

            # Only write the offsets if there is a large number of items in the
            # column, otherwise it's fast enough to derive them from the lens
            write_offsets = self.allow_offsets and doccount > self.cutoff
            if write_offsets:
                dbfile.write_array(offsets)

            # Backwards compatibility: previous versions only wrote the
            # lengths, and the last byte of the column was the lengths type
            # code...
            dbfile.write(lengths.typecode.encode("ascii"))
            # ...but if we wrote offsets, make the last byte "X" so we know
            if write_offsets:
                dbfile.write(offsets.typecode.encode("ascii"))
                dbfile.write("X".encode("ascii"))

    class Reader(ColumnReader):
        def __init__(self, dbfile, basepos, length, doccount):
            self._dbfile = dbfile
            self._basepos = basepos
            self._length = length
            self._doccount = doccount

            self.had_stored_offsets = False  # for testing
            self._read_offsets_and_lengths()

        def __repr__(self):
            return "<VarBytes.Reader>"

        def _read_offsets_and_lengths(self):
            # Parses the trailer written by Writer.finish() to recover the
            # per-document offsets and lengths
            dbfile = self._dbfile
            basepos = self._basepos
            length = self._length
            doccount = self._doccount

            # The end of the lengths array is the end of the data minus the
            # typecode byte
            lastbyte = basepos + length - 1
            # Load the length typecode from the end
            lens_code = chr(dbfile.get_byte(lastbyte))

            offsets = None
            if lens_code == "X":
                self.had_stored_offsets = True
                # This indicates we wrote the offsets, so get the real lengths
                # type code
                lens_code = chr(dbfile.get_byte(lastbyte - 2))
                offsets_code = chr(dbfile.get_byte(lastbyte - 1))

                # Read the offsets from before the last byte
                itemsize = struct.calcsize(offsets_code)
                offsetstart = (lastbyte - 2) - doccount * itemsize
                offsets = dbfile.get_array(offsetstart, offsets_code, doccount)
                lastbyte = offsetstart

            # Load the length array
            itemsize = struct.calcsize(lens_code)
            lenstart = lastbyte - (itemsize * doccount)
            lengths = dbfile.get_array(lenstart, lens_code, doccount)

            # If we didn't write the offsets, derive them from the lengths
            if offsets is None:
                offsets = array("L")
                base = 0
                for length in lengths:
                    offsets.append(base)
                    base += length

            self._offsets = offsets
            self._lengths = lengths

        def __getitem__(self, docnum):
            length = self._lengths[docnum]
            # A zero length means the document had no stored value
            if not length:
                return emptybytes
            offset = self._offsets[docnum]
            return self._dbfile.get(self._basepos + offset, length)

        def __iter__(self):
            # Values were written contiguously, so a sequential scan only
            # needs the lengths
            get = self._dbfile.get
            pos = self._basepos
            for length in self._lengths:
                yield get(pos, length)
                pos += length
  294. class FixedBytesColumn(Column):
  295. """Stores fixed-length byte strings.
  296. """
  297. def __init__(self, fixedlen, default=None):
  298. """
  299. :param fixedlen: the fixed length of byte strings in this column.
  300. :param default: the default value to use for documents that don't
  301. specify a value. If you don't specify a default, the column will
  302. use ``b'\\x00' * fixedlen``.
  303. """
  304. self._fixedlen = fixedlen
  305. if default is None:
  306. default = b("\x00") * fixedlen
  307. elif len(default) != fixedlen:
  308. raise ValueError
  309. self._default = default
  310. def writer(self, dbfile):
  311. return self.Writer(dbfile, self._fixedlen, self._default)
  312. def reader(self, dbfile, basepos, length, doccount):
  313. return self.Reader(dbfile, basepos, length, doccount, self._fixedlen,
  314. self._default)
  315. class Writer(ColumnWriter):
  316. def __init__(self, dbfile, fixedlen, default):
  317. self._dbfile = dbfile
  318. self._fixedlen = fixedlen
  319. self._default = self._defaultbytes = default
  320. self._count = 0
  321. def __repr__(self):
  322. return "<FixedBytes.Writer>"
  323. def add(self, docnum, v):
  324. if v == self._default:
  325. return
  326. if docnum > self._count:
  327. self.fill(docnum)
  328. assert len(v) == self._fixedlen
  329. self._dbfile.write(v)
  330. self._count = docnum + 1
  331. class Reader(ColumnReader):
  332. def __init__(self, dbfile, basepos, length, doccount, fixedlen,
  333. default):
  334. self._dbfile = dbfile
  335. self._basepos = basepos
  336. self._doccount = doccount
  337. self._fixedlen = fixedlen
  338. self._default = self._defaultbytes = default
  339. self._count = length // fixedlen
  340. def __repr__(self):
  341. return "<FixedBytes.Reader>"
  342. def __getitem__(self, docnum):
  343. if docnum >= self._count:
  344. return self._defaultbytes
  345. pos = self._basepos + self._fixedlen * docnum
  346. return self._dbfile.get(pos, self._fixedlen)
  347. def __iter__(self):
  348. count = self._count
  349. default = self._default
  350. for i in xrange(self._doccount):
  351. if i < count:
  352. yield self[i]
  353. else:
  354. yield default
# Variable/fixed length reference (enum) column

class RefBytesColumn(Column):
    """Stores variable-length or fixed-length byte strings, similar to
    :class:`VarBytesColumn` and :class:`FixedBytesColumn`. However, where those
    columns stores a value for each document, this column keeps a list of all
    the unique values in the field, and for each document stores a short
    pointer into the unique list. For fields where the number of possible
    values is smaller than the number of documents (for example,
    "category" or "chapter"), this saves significant space.

    This column type supports a maximum of 65535 unique values across all
    documents in a segment. You should generally use this column type where the
    number of unique values is in no danger of approaching that number (for
    example, a "tags" field). If you try to index too many unique values, the
    column will convert additional unique values to the default value and issue
    a warning using the ``warnings`` module (this will usually be preferable to
    crashing the indexer and potentially losing indexed documents).
    """

    # NOTE that RefBytes is reversible within a single column (we could just
    # negate the reference number), but it's NOT reversible ACROSS SEGMENTS
    # (since different segments can have different uniques values in their
    # columns), so we have to say that the column type is not reversible
    reversible = False

    def __init__(self, fixedlen=0, default=None):
        """
        :param fixedlen: an optional fixed length for the values. If you
            specify a number other than 0, the column will require all values
            to be the specified length.
        :param default: a default value to use for documents that don't
            specify one. If you don't specify a default, the column will use
            an empty bytestring (``b''``), or if you specify a fixed length,
            ``b'\\x00' * fixedlen``.
        """

        self._fixedlen = fixedlen

        if default is None:
            default = b("\x00") * fixedlen if fixedlen else emptybytes
        elif fixedlen and len(default) != fixedlen:
            raise ValueError
        self._default = default

    def writer(self, dbfile):
        return self.Writer(dbfile, self._fixedlen, self._default)

    def reader(self, dbfile, basepos, length, doccount):
        return self.Reader(dbfile, basepos, length, doccount, self._fixedlen)

    class Writer(ColumnWriter):
        def __init__(self, dbfile, fixedlen, default):
            self._dbfile = dbfile
            self._fixedlen = fixedlen
            self._default = default

            # At first we'll buffer refs in a byte array. If the number of
            # uniques stays below 256, we can just write the byte array. As
            # soon as the ref count goes above 255, we know we're going to have
            # to write shorts, so we'll switch to writing directly.
            self._refs = array("B")
            # Maps each unique value to its reference number; the default
            # value is always ref 0
            self._uniques = {default: 0}
            self._count = 0

        def __repr__(self):
            return "<RefBytes.Writer>"

        def fill(self, docnum):
            # Pad skipped documents with references to the default value (0)
            if docnum > self._count:
                if self._refs is not None:
                    self._refs.extend(0 for _ in xrange(docnum - self._count))
                else:
                    dbfile = self._dbfile
                    for _ in xrange(docnum - self._count):
                        dbfile.write_ushort(0)

        def add(self, docnum, v):
            dbfile = self._dbfile
            refs = self._refs
            self.fill(docnum)

            uniques = self._uniques
            try:
                ref = uniques[v]
            except KeyError:
                uniques[v] = ref = len(uniques)
                if refs is not None and ref >= 256:
                    # We won't be able to use bytes, we have to switch to
                    # writing unbuffered ushorts
                    for n in refs:
                        dbfile.write_ushort(n)
                    refs = self._refs = None

            if refs is not None:
                self._refs.append(ref)
            else:
                # Refs that would overflow 16 bits are converted to the
                # default value (ref 0) with a warning rather than crashing
                if ref > 65535:
                    warnings.warn("RefBytesColumn dropped unique value %r" % v,
                                  UserWarning)
                    ref = 0
                dbfile.write_ushort(ref)

            self._count = docnum + 1

        def _write_uniques(self, typecode):
            # Writes the unique-value table: a count followed by each value
            # (length-prefixed unless the column is fixed-length)
            dbfile = self._dbfile
            fixedlen = self._fixedlen
            uniques = self._uniques

            dbfile.write_varint(len(uniques))
            # Sort unique values by position
            vs = sorted(uniques.keys(), key=lambda key: uniques[key])
            for v in vs:
                if not fixedlen:
                    dbfile.write_varint(len(v))
                dbfile.write(v)

        def finish(self, doccount):
            dbfile = self._dbfile
            refs = self._refs
            self.fill(doccount)

            # If the refs were buffered as bytes, flush them now; otherwise
            # ushorts were already written directly
            typecode = "H"
            if refs is not None:
                dbfile.write_array(refs)
                typecode = refs.typecode

            self._write_uniques(typecode)
            # The very last byte of the column records the refs typecode
            dbfile.write_byte(ord(typecode))

    class Reader(ColumnReader):
        def __init__(self, dbfile, basepos, length, doccount, fixedlen):
            self._dbfile = dbfile
            self._basepos = basepos
            self._doccount = doccount
            self._fixedlen = fixedlen

            # The last byte of the column data holds the refs typecode
            self._typecode = chr(dbfile.get_byte(basepos + length - 1))

            st = struct.Struct("!" + self._typecode)
            self._unpack = st.unpack
            self._itemsize = st.size

            # The unique-value table is stored directly after the refs array
            dbfile.seek(basepos + doccount * self._itemsize)
            self._uniques = self._read_uniques()

        def __repr__(self):
            return "<RefBytes.Reader>"

        def _read_uniques(self):
            # Reads the unique-value table written by Writer._write_uniques()
            dbfile = self._dbfile
            fixedlen = self._fixedlen
            ucount = dbfile.read_varint()

            length = fixedlen
            uniques = []
            for _ in xrange(ucount):
                if not fixedlen:
                    length = dbfile.read_varint()
                uniques.append(dbfile.read(length))
            return uniques

        def __getitem__(self, docnum):
            pos = self._basepos + docnum * self._itemsize
            ref = self._unpack(self._dbfile.get(pos, self._itemsize))[0]
            return self._uniques[ref]

        def __iter__(self):
            get = self._dbfile.get
            basepos = self._basepos
            uniques = self._uniques
            unpack = self._unpack
            itemsize = self._itemsize

            for i in xrange(self._doccount):
                pos = basepos + i * itemsize
                ref = unpack(get(pos, itemsize))[0]
                yield uniques[ref]
  503. # Numeric column
  504. class NumericColumn(FixedBytesColumn):
  505. """Stores numbers (integers and floats) as compact binary.
  506. """
  507. reversible = True
  508. def __init__(self, typecode, default=0):
  509. """
  510. :param typecode: a typecode character (as used by the ``struct``
  511. module) specifying the number type. For example, ``"i"`` for
  512. signed integers.
  513. :param default: the default value to use for documents that don't
  514. specify one.
  515. """
  516. self._typecode = typecode
  517. self._default = default
  518. def writer(self, dbfile):
  519. return self.Writer(dbfile, self._typecode, self._default)
  520. def reader(self, dbfile, basepos, length, doccount):
  521. return self.Reader(dbfile, basepos, length, doccount, self._typecode,
  522. self._default)
  523. def default_value(self, reverse=False):
  524. v = self._default
  525. if reverse:
  526. v = 0 - v
  527. return v
  528. class Writer(FixedBytesColumn.Writer):
  529. def __init__(self, dbfile, typecode, default):
  530. self._dbfile = dbfile
  531. self._pack = struct.Struct("!" + typecode).pack
  532. self._default = default
  533. self._defaultbytes = self._pack(default)
  534. self._fixedlen = struct.calcsize(typecode)
  535. self._count = 0
  536. def __repr__(self):
  537. return "<Numeric.Writer>"
  538. def add(self, docnum, v):
  539. if v == self._default:
  540. return
  541. if docnum > self._count:
  542. self.fill(docnum)
  543. self._dbfile.write(self._pack(v))
  544. self._count = docnum + 1
  545. class Reader(FixedBytesColumn.Reader):
  546. def __init__(self, dbfile, basepos, length, doccount, typecode,
  547. default):
  548. self._dbfile = dbfile
  549. self._basepos = basepos
  550. self._doccount = doccount
  551. self._default = default
  552. self._reverse = False
  553. self._typecode = typecode
  554. self._unpack = struct.Struct("!" + typecode).unpack
  555. self._defaultbytes = struct.pack("!" + typecode, default)
  556. self._fixedlen = struct.calcsize(typecode)
  557. self._count = length // self._fixedlen
  558. def __repr__(self):
  559. return "<Numeric.Reader>"
  560. def __getitem__(self, docnum):
  561. s = FixedBytesColumn.Reader.__getitem__(self, docnum)
  562. return self._unpack(s)[0]
  563. def sort_key(self, docnum):
  564. key = self[docnum]
  565. if self._reverse:
  566. key = 0 - key
  567. return key
  568. def load(self):
  569. if self._typecode in "qQ":
  570. return list(self)
  571. else:
  572. return array(self._typecode, self)
  573. def set_reverse(self):
  574. self._reverse = True
  575. # Column of boolean values
  576. class BitColumn(Column):
  577. """Stores a column of True/False values compactly.
  578. """
  579. reversible = True
  580. _default = False
  581. def __init__(self, compress_at=2048):
  582. """
  583. :param compress_at: columns with this number of values or fewer will
  584. be saved compressed on disk, and loaded into RAM for reading. Set
  585. this to 0 to disable compression.
  586. """
  587. self._compressat = compress_at
  588. def writer(self, dbfile):
  589. return self.Writer(dbfile, self._compressat)
  590. def default_value(self, reverse=False):
  591. return self._default ^ reverse
  592. class Writer(ColumnWriter):
  593. def __init__(self, dbfile, compressat):
  594. self._dbfile = dbfile
  595. self._compressat = compressat
  596. self._bitset = BitSet()
  597. def __repr__(self):
  598. return "<Bit.Writer>"
  599. def add(self, docnum, value):
  600. if value:
  601. self._bitset.add(docnum)
  602. def finish(self, doccount):
  603. dbfile = self._dbfile
  604. bits = self._bitset.bits
  605. if zlib and len(bits) <= self._compressat:
  606. compressed = zlib.compress(array_tobytes(bits), 3)
  607. dbfile.write(compressed)
  608. dbfile.write_byte(1)
  609. else:
  610. dbfile.write_array(bits)
  611. dbfile.write_byte(0)
  612. class Reader(ColumnReader):
  613. def __init__(self, dbfile, basepos, length, doccount):
  614. self._dbfile = dbfile
  615. self._basepos = basepos
  616. self._length = length
  617. self._doccount = doccount
  618. self._reverse = False
  619. compressed = dbfile.get_byte(basepos + (length - 1))
  620. if compressed:
  621. bbytes = zlib.decompress(dbfile.get(basepos, length - 1))
  622. bitset = BitSet.from_bytes(bbytes)
  623. else:
  624. dbfile.seek(basepos)
  625. bitset = OnDiskBitSet(dbfile, basepos, length - 1)
  626. self._bitset = bitset
  627. def id_set(self):
  628. return self._bitset
  629. def __repr__(self):
  630. return "<Bit.Reader>"
  631. def __getitem__(self, i):
  632. return i in self._bitset
  633. def sort_key(self, docnum):
  634. return int(self[docnum] ^ self._reverse)
  635. def __iter__(self):
  636. i = 0
  637. for num in self._bitset:
  638. if num > i:
  639. for _ in xrange(num - i):
  640. yield False
  641. yield True
  642. i = num + 1
  643. if self._doccount > i:
  644. for _ in xrange(self._doccount - i):
  645. yield False
  646. def load(self):
  647. if isinstance(self._bitset, OnDiskBitSet):
  648. bs = self._dbfile.get_array(self._basepos, "B",
  649. self._length - 1)
  650. self._bitset = BitSet.from_bytes(bs)
  651. return self
  652. def set_reverse(self):
  653. self._reverse = True
  654. # Compressed variants
  655. class CompressedBytesColumn(Column):
  656. """Stores variable-length byte strings compressed using deflate (by
  657. default).
  658. """
  659. def __init__(self, level=3, module="zlib"):
  660. """
  661. :param level: the compression level to use.
  662. :param module: a string containing the name of the compression module
  663. to use. The default is "zlib". The module should export "compress"
  664. and "decompress" functions.
  665. """
  666. self._level = level
  667. self._module = module
  668. def writer(self, dbfile):
  669. return self.Writer(dbfile, self._level, self._module)
  670. def reader(self, dbfile, basepos, length, doccount):
  671. return self.Reader(dbfile, basepos, length, doccount, self._module)
  672. class Writer(VarBytesColumn.Writer):
  673. def __init__(self, dbfile, level, module):
  674. VarBytesColumn.Writer.__init__(self, dbfile)
  675. self._level = level
  676. self._compress = __import__(module).compress
  677. def __repr__(self):
  678. return "<CompressedBytes.Writer>"
  679. def add(self, docnum, v):
  680. v = self._compress(v, self._level)
  681. VarBytesColumn.Writer.add(self, docnum, v)
  682. class Reader(VarBytesColumn.Reader):
  683. def __init__(self, dbfile, basepos, length, doccount, module):
  684. VarBytesColumn.Reader.__init__(self, dbfile, basepos, length,
  685. doccount)
  686. self._decompress = __import__(module).decompress
  687. def __repr__(self):
  688. return "<CompressedBytes.Reader>"
  689. def __getitem__(self, docnum):
  690. v = VarBytesColumn.Reader.__getitem__(self, docnum)
  691. if v:
  692. v = self._decompress(v)
  693. return v
  694. def __iter__(self):
  695. for v in VarBytesColumn.Reader.__iter__(self):
  696. yield self._decompress(v)
  697. def load(self):
  698. return list(self)
  699. class CompressedBlockColumn(Column):
  700. """An experimental column type that compresses and decompresses blocks of
  701. values at a time. This can lead to high compression and decent performance
  702. for columns with lots of very short values, but random access times are
  703. usually terrible.
  704. """
  705. def __init__(self, level=3, blocksize=32, module="zlib"):
  706. """
  707. :param level: the compression level to use.
  708. :param blocksize: the size (in KB) of each compressed block.
  709. :param module: a string containing the name of the compression module
  710. to use. The default is "zlib". The module should export "compress"
  711. and "decompress" functions.
  712. """
  713. self._level = level
  714. self._blocksize = blocksize
  715. self._module = module
  716. def writer(self, dbfile):
  717. return self.Writer(dbfile, self._level, self._blocksize, self._module)
  718. def reader(self, dbfile, basepos, length, doccount):
  719. return self.Reader(dbfile, basepos, length, doccount, self._module)
  720. class Writer(ColumnWriter):
  721. def __init__(self, dbfile, level, blocksize, module):
  722. self._dbfile = dbfile
  723. self._blocksize = blocksize * 1024
  724. self._level = level
  725. self._compress = __import__(module).compress
  726. self._reset()
  727. def __repr__(self):
  728. return "<CompressedBlock.Writer>"
  729. def _reset(self):
  730. self._startdoc = None
  731. self._block = emptybytes
  732. self._lengths = []
  733. def _emit(self):
  734. dbfile = self._dbfile
  735. block = self._compress(self._block, self._level)
  736. header = (self._startdoc, self._lastdoc, len(block),
  737. tuple(self._lengths))
  738. dbfile.write_pickle(header)
  739. dbfile.write(block)
  740. def add(self, docnum, v):
  741. if self._startdoc is None:
  742. self._startdoc = docnum
  743. self._lengths.append((docnum, len(v)))
  744. self._lastdoc = docnum
  745. self._block += v
  746. if len(self._block) >= self._blocksize:
  747. self._emit()
  748. self._reset()
  749. def finish(self, doccount):
  750. # If there's still a pending block, write it out
  751. if self._startdoc is not None:
  752. self._emit()
  753. class Reader(ColumnReader):
  754. def __init__(self, dbfile, basepos, length, doccount, module):
  755. ColumnReader.__init__(self, dbfile, basepos, length, doccount)
  756. self._decompress = __import__(module).decompress
  757. self._blocks = []
  758. dbfile.seek(basepos)
  759. pos = 0
  760. while pos < length:
  761. startdoc, enddoc, blocklen, lengths = dbfile.read_pickle()
  762. here = dbfile.tell()
  763. self._blocks.append((startdoc, enddoc, here, blocklen,
  764. lengths))
  765. dbfile.seek(blocklen, 1)
  766. pos = here + blocklen
  767. def __repr__(self):
  768. return "<CompressedBlock.Reader>"
  769. def _find_block(self, docnum):
  770. # TODO: use binary search instead of linear
  771. for i, b in enumerate(self._blocks):
  772. if docnum < b[0]:
  773. return None
  774. elif docnum <= b[1]:
  775. return i
  776. return None
  777. def _get_block(self, blocknum):
  778. block = self._blocks[blocknum]
  779. pos = block[2]
  780. blocklen = block[3]
  781. lengths = block[4]
  782. data = self._decompress(self._dbfile.get(self._basepos + pos,
  783. blocklen))
  784. values = {}
  785. base = 0
  786. for docnum, vlen in lengths:
  787. values[docnum] = data[base:base + vlen]
  788. base += vlen
  789. return values
  790. def __getitem__(self, docnum):
  791. i = self._find_block(docnum)
  792. if i is None:
  793. return emptybytes
  794. return self._get_block(i)[docnum]
  795. def __iter__(self):
  796. last = -1
  797. for i, block in enumerate(self._blocks):
  798. startdoc = block[0]
  799. enddoc = block[1]
  800. if startdoc > (last + 1):
  801. for _ in xrange(startdoc - last):
  802. yield emptybytes
  803. values = self._get_block(i)
  804. for docnum in xrange(startdoc, enddoc + 1):
  805. if docnum in values:
  806. yield values[docnum]
  807. else:
  808. yield emptybytes
  809. last = enddoc
  810. if enddoc < self._doccount - 1:
  811. for _ in xrange(self._doccount - enddoc):
  812. yield emptybytes
  813. class StructColumn(FixedBytesColumn):
  814. def __init__(self, spec, default):
  815. self._spec = spec
  816. self._fixedlen = struct.calcsize(spec)
  817. self._default = default
  818. def writer(self, dbfile):
  819. return self.Writer(dbfile, self._spec, self._default)
  820. def reader(self, dbfile, basepos, length, doccount):
  821. return self.Reader(dbfile, basepos, length, doccount, self._spec,
  822. self._default)
  823. class Writer(FixedBytesColumn.Writer):
  824. def __init__(self, dbfile, spec, default):
  825. self._dbfile = dbfile
  826. self._struct = struct.Struct(spec)
  827. self._fixedlen = self._struct.size
  828. self._default = default
  829. self._defaultbytes = self._struct.pack(*default)
  830. self._count = 0
  831. def __repr__(self):
  832. return "<Struct.Writer>"
  833. def add(self, docnum, v):
  834. b = self._struct.pack(*v)
  835. FixedBytesColumn.Writer.add(self, docnum, b)
  836. class Reader(FixedBytesColumn.Reader):
  837. def __init__(self, dbfile, basepos, length, doccount, spec, default):
  838. self._dbfile = dbfile
  839. self._basepos = basepos
  840. self._doccount = doccount
  841. self._struct = struct.Struct(spec)
  842. self._fixedlen = self._struct.size
  843. self._default = default
  844. self._defaultbytes = self._struct.pack(*default)
  845. self._count = length // self._fixedlen
  846. def __repr__(self):
  847. return "<Struct.Reader>"
  848. def __getitem__(self, docnum):
  849. v = FixedBytesColumn.Reader.__getitem__(self, docnum)
  850. return self._struct.unpack(v)
  851. # Utility readers
  852. class EmptyColumnReader(ColumnReader):
  853. """Acts like a reader for a column with no stored values. Always returns
  854. the default.
  855. """
  856. def __init__(self, default, doccount):
  857. """
  858. :param default: the value to return for all "get" requests.
  859. :param doccount: the number of documents in the nominal column.
  860. """
  861. self._default = default
  862. self._doccount = doccount
  863. def __getitem__(self, docnum):
  864. return self._default
  865. def __iter__(self):
  866. return (self._default for _ in xrange(self._doccount))
  867. def load(self):
  868. return self
  869. class MultiColumnReader(ColumnReader):
  870. """Serializes access to multiple column readers, making them appear to be
  871. one large column.
  872. """
  873. def __init__(self, readers, offsets=None):
  874. """
  875. :param readers: a sequence of column reader objects.
  876. """
  877. self._readers = readers
  878. self._doc_offsets = []
  879. self._doccount = 0
  880. if offsets is None:
  881. for r in readers:
  882. self._doc_offsets.append(self._doccount)
  883. self._doccount += len(r)
  884. else:
  885. assert len(offsets) == len(readers)
  886. self._doc_offsets = offsets
  887. def _document_reader(self, docnum):
  888. return max(0, bisect_right(self._doc_offsets, docnum) - 1)
  889. def _reader_and_docnum(self, docnum):
  890. rnum = self._document_reader(docnum)
  891. offset = self._doc_offsets[rnum]
  892. return rnum, docnum - offset
  893. def __getitem__(self, docnum):
  894. x, y = self._reader_and_docnum(docnum)
  895. return self._readers[x][y]
  896. def __iter__(self):
  897. for r in self._readers:
  898. for v in r:
  899. yield v
  900. class TranslatingColumnReader(ColumnReader):
  901. """Calls a function to "translate" values from an underlying column reader
  902. object before returning them.
  903. ``IndexReader`` objects can wrap a column reader with this object to call
  904. ``FieldType.from_column_value`` on the stored column value before returning
  905. it the the user.
  906. """
  907. def __init__(self, reader, translate):
  908. """
  909. :param reader: the underlying ColumnReader object to get values from.
  910. :param translate: a function that takes a value from the underlying
  911. reader and returns a translated value.
  912. """
  913. self._reader = reader
  914. self._translate = translate
  915. def raw_column(self):
  916. """Returns the underlying column reader.
  917. """
  918. return self._reader
  919. def __len__(self):
  920. return len(self._reader)
  921. def __getitem__(self, docnum):
  922. return self._translate(self._reader[docnum])
  923. def sort_key(self, docnum):
  924. return self._reader.sort_key(docnum)
  925. def __iter__(self):
  926. translate = self._translate
  927. return (translate(v) for v in self._reader)
  928. def set_reverse(self):
  929. self._reader.set_reverse()
  930. # Column wrappers
class WrappedColumn(Column):
    """Base class for column types that wrap another ("child") column and
    delegate writer/reader creation to it.
    """

    def __init__(self, child):
        self._child = child

    def writer(self, *args, **kwargs):
        # Wrap the child's writer in this column's Writer class
        return self.Writer(self._child.writer(*args, **kwargs))

    def reader(self, *args, **kwargs):
        # Wrap the child's reader in this column's Reader class
        return self.Reader(self._child.reader(*args, **kwargs))

    def stores_lists(self):
        return self._child.stores_lists()
class WrappedColumnWriter(ColumnWriter):
    """Base class for writers that delegate every operation to a wrapped
    ("child") writer.
    """

    def __init__(self, child):
        self._child = child

    def fill(self, docnum):
        return self._child.fill(docnum)

    def add(self, docnum, value):
        return self._child.add(docnum, value)

    def finish(self, docnum):
        # NOTE(review): the argument appears to be a document count (see
        # ColumnWriter.finish(doccount) usage elsewhere in this file) despite
        # the parameter name -- confirm before renaming
        return self._child.finish(docnum)
class WrappedColumnReader(ColumnReader):
    """Base class for readers that delegate every operation to a wrapped
    ("child") reader.
    """

    def __init__(self, child):
        self._child = child

    def __len__(self):
        return len(self._child)

    def __getitem__(self, docnum):
        return self._child[docnum]

    def sort_key(self, docnum):
        return self._child.sort_key(docnum)

    def __iter__(self):
        return iter(self._child)

    def load(self):
        # Materialize all (possibly transformed) values in memory
        return list(self)

    def set_reverse(self):
        self._child.set_reverse()
  964. class ClampedNumericColumn(WrappedColumn):
  965. """An experimental wrapper type for NumericColumn that clamps out-of-range
  966. values instead of raising an exception.
  967. """
  968. def reader(self, *args, **kwargs):
  969. return self._child.reader(*args, **kwargs)
  970. class Writer(WrappedColumnWriter):
  971. def __init__(self, child):
  972. self._child = child
  973. self._min = typecode_min[child._typecode]
  974. self._max = typecode_max[child._typecode]
  975. def add(self, docnum, v):
  976. v = min(v, self._min)
  977. v = max(v, self._max)
  978. self._child.add(docnum, v)
  979. class PickleColumn(WrappedColumn):
  980. """Converts arbitrary objects to pickled bytestrings and stores them using
  981. the wrapped column (usually a :class:`VarBytesColumn` or
  982. :class:`CompressedBytesColumn`).
  983. If you can express the value you want to store as a number or bytestring,
  984. you should use the appropriate column type to avoid the time and size
  985. overhead of pickling and unpickling.
  986. """
  987. class Writer(WrappedColumnWriter):
  988. def __repr__(self):
  989. return "<PickleWriter>"
  990. def add(self, docnum, v):
  991. if v is None:
  992. v = emptybytes
  993. else:
  994. v = dumps(v, 2)
  995. self._child.add(docnum, v)
  996. class Reader(WrappedColumnReader):
  997. def __repr__(self):
  998. return "<PickleReader>"
  999. def __getitem__(self, docnum):
  1000. v = self._child[docnum]
  1001. if not v:
  1002. return None
  1003. else:
  1004. return loads(v)
  1005. def __iter__(self):
  1006. for v in self._child:
  1007. if not v:
  1008. yield None
  1009. else:
  1010. yield loads(v)
  1011. # List columns
class ListColumn(WrappedColumn):
    # Base class for wrapped columns that store a list of values per document
    def stores_lists(self):
        """Return True: this column type stores lists of values."""
        return True
class ListColumnReader(ColumnReader):
    # Mixin providing list-oriented implementations of reader methods
    def sort_key(self, docnum):
        # Sort by the first item of the document's list.
        # NOTE(review): raises IndexError for a document whose stored list is
        # empty -- confirm callers only sort on fields where every document
        # has at least one value
        return self[docnum][0]

    def __iter__(self):
        for docnum in xrange(len(self)):
            yield self[docnum]
  1021. class VarBytesListColumn(ListColumn):
  1022. def __init__(self):
  1023. self._child = VarBytesColumn()
  1024. class Writer(WrappedColumnWriter):
  1025. def add(self, docnum, ls):
  1026. out = [varint(len(ls))]
  1027. for v in ls:
  1028. assert isinstance(v, bytes_type)
  1029. out.append(varint(len(v)))
  1030. out.append(v)
  1031. self._child.add(docnum, emptybytes.join(out))
  1032. class Reader(ListColumnReader, WrappedColumnReader):
  1033. def __getitem__(self, docnum):
  1034. data = self._child[docnum]
  1035. if not data:
  1036. return []
  1037. bio = BytesIO(data)
  1038. count = read_varint(bio.read)
  1039. out = []
  1040. for _ in xrange(count):
  1041. vlen = read_varint(bio.read)
  1042. v = bio.read(vlen)
  1043. out.append(v)
  1044. return out
  1045. class FixedBytesListColumn(ListColumn):
  1046. def __init__(self, fixedlen):
  1047. self._fixedlen = fixedlen
  1048. self._child = VarBytesColumn()
  1049. def writer(self, *args, **kwargs):
  1050. return self.Writer(self._child.writer(*args, **kwargs), self._fixedlen)
  1051. def reader(self, *args, **kwargs):
  1052. return self.Reader(self._child.reader(*args, **kwargs), self._fixedlen)
  1053. class Writer(WrappedColumnWriter):
  1054. def __init__(self, child, fixedlen):
  1055. self._child = child
  1056. self._fixedlen = fixedlen
  1057. self._lengths = GrowableArray()
  1058. self._count = 0
  1059. def add(self, docnum, ls):
  1060. out = []
  1061. for v in ls:
  1062. assert len(v) == self._fixedlen
  1063. out.append(v)
  1064. b = emptybytes.join(out)
  1065. self._child.add(docnum, b)
  1066. class Reader(ListColumnReader, WrappedColumnReader):
  1067. def __init__(self, child, fixedlen):
  1068. self._child = child
  1069. self._fixedlen = fixedlen
  1070. def __getitem__(self, docnum):
  1071. fixedlen = self._fixedlen
  1072. v = self._child[docnum]
  1073. if not v:
  1074. return []
  1075. ls = [v[i:i + fixedlen] for i in xrange(0, len(v), fixedlen)]
  1076. return ls
  1077. #class RefListColumn(Column):
  1078. # def __init__(self, fixedlen=0):
  1079. # """
  1080. # :param fixedlen: an optional fixed length for the values. If you
  1081. # specify a number other than 0, the column will require all values
  1082. # to be the specified length.
  1083. # :param default: a default value to use for documents that don't specify
  1084. # one. If you don't specify a default, the column will use an empty
  1085. # bytestring (``b''``), or if you specify a fixed length,
  1086. # ``b'\\x00' * fixedlen``.
  1087. # """
  1088. #
  1089. # self._fixedlen = fixedlen
  1090. #
  1091. # def stores_lists(self):
  1092. # return True
  1093. #
  1094. # def writer(self, dbfile):
  1095. # return self.Writer(dbfile, self._fixedlen)
  1096. #
  1097. # def reader(self, dbfile, basepos, length, doccount):
  1098. # return self.Reader(dbfile, basepos, length, doccount, self._fixedlen)
  1099. #
  1100. # class Writer(ColumnWriter):
  1101. # def __init__(self, dbfile, fixedlen):
  1102. # self._dbfile = dbfile
  1103. # self._fixedlen = fixedlen
  1104. #
  1105. # self._refs = GrowableArray(allow_longs=False)
  1106. # self._lengths = GrowableArray(allow_longs=False)
  1107. # self._count = 0
  1108. #
  1109. # def __repr__(self):
  1110. # return "<RefList.Writer>"
  1111. #
  1112. # def fill(self, docnum):
  1113. # if docnum > self._count:
  1114. # self._lengths.extend(0 for _ in xrange(docnum - self._count))
  1115. #
  1116. # def add(self, docnum, ls):
  1117. # uniques = self._uniques
  1118. # refs = self._refs
  1119. #
  1120. # self.fill(docnum)
  1121. # self._lengths.append(len(ls))
  1122. # for v in ls:
  1123. # try:
  1124. # i = uniques[v]
  1125. # except KeyError:
  1126. # uniques[v] = i = len(uniques)
  1127. # refs.append(i)
  1128. #
  1129. # self._count = docnum + 1
  1130. #
  1131. # def finish(self, doccount):
  1132. # dbfile = self._dbfile
  1133. # refs = self._refs.array
  1134. # lengths = self._lengths.array
  1135. #
  1136. # self.fill(doccount)
  1137. # dbfile.write_byte(ord(lengths.typecode))
  1138. # dbfile.write_array(lengths)
  1139. # dbfile.write_byte(ord(refs.typecode))
  1140. # self._write_uniques(refs.typecode)
  1141. # dbfile.write_array(refs)
  1142. #
  1143. # class Reader(ListColumnReader):
  1144. # def __init__(self, dbfile, basepos, length, doccount, fixedlen):
  1145. # self._dbfile = dbfile
  1146. # self._basepos = basepos
  1147. # self._doccount = doccount
  1148. # self._fixedlen = fixedlen
  1149. #
  1150. # dbfile.seek(basepos)
  1151. # lencode = chr(dbfile.read_byte())
  1152. # self._lengths = dbfile.read_array(lencode, doccount)
  1153. #
  1154. # self._typecode = chr(dbfile.read_byte())
  1155. # refst = struct.Struct("!" + self._typecode)
  1156. # self._unpack = refst.unpack
  1157. # self._itemsize = refst.size
  1158. #
  1159. # self._read_uniques()
  1160. # self._refbase = dbfile.tell()
  1161. #
  1162. # # Create an array of offsets into the references using the lengths
  1163. # offsets = array("i", (0,))
  1164. # for length in self._lengths:
  1165. # offsets.append(offsets[-1] + length)
  1166. # self._offsets = offsets
  1167. #
  1168. # def __repr__(self):
  1169. # return "<RefBytes.Reader>"
  1170. #
  1171. # def _get_ref(self, docnum):
  1172. # pos = self._basepos + 1 + docnum * self._itemsize
  1173. # return self._unpack(self._dbfile.get(pos, self._itemsize))[0]
  1174. #
  1175. # def __getitem__(self, docnum):
  1176. # offset = self._offsets[docnum]
  1177. # length = self._lengths[docnum]
  1178. #
  1179. # pos = self._refbase + offset * self._itemsize
  1180. # reflist = self._dbfile.get_array(pos, self._typecode, length)
  1181. # return [self._uniques[ref] for ref in reflist]