# Copyright 2012 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
"""
This module implements a "codec" for writing/reading Whoosh 3 indexes.
"""

import struct
from array import array
from collections import defaultdict

from whoosh import columns, formats
from whoosh.compat import b, bytes_type, string_type, integer_types
from whoosh.compat import dumps, loads, iteritems, xrange
from whoosh.codec import base
from whoosh.filedb import compound, filetables
from whoosh.matching import ListMatcher, ReadTooFar, LeafMatcher
from whoosh.reading import TermInfo, TermNotFound
from whoosh.system import emptybytes
from whoosh.system import _SHORT_SIZE, _INT_SIZE, _LONG_SIZE, _FLOAT_SIZE
from whoosh.system import pack_ushort, unpack_ushort
from whoosh.system import pack_int, unpack_int, pack_long, unpack_long
from whoosh.util.numlists import delta_encode, delta_decode
from whoosh.util.numeric import length_to_byte, byte_to_length

try:
    import zlib
except ImportError:
    zlib = None

# This byte sequence is written at the start of a posting list to identify the
# codec/version
WHOOSH3_HEADER_MAGIC = b("W3Bl")

# Column type to store field length info
LENGTHS_COLUMN = columns.NumericColumn("B", default=0)
# Column type to store pointers to vector posting lists
VECTOR_COLUMN = columns.NumericColumn("I")
# Column type to store vector posting list lengths
VECTOR_LEN_COLUMN = columns.NumericColumn("i")
# Column type to store values of stored fields
STORED_COLUMN = columns.PickleColumn(columns.CompressedBytesColumn())
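
# A minimal usage sketch (hedged: this codec is the default in Whoosh 2.x and
# is normally constructed by the indexing machinery rather than by user code;
# the schema, field name, and "indexdir" path below are illustrative):
#
#     from whoosh import fields, index
#     schema = fields.Schema(title=fields.TEXT(stored=True))
#     ix = index.create_in("indexdir", schema)  # ends up using W3Codec
#     with ix.writer() as w:
#         w.add_document(title=u"hello world")
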
class W3Codec(base.Codec):
    # File extensions
    TERMS_EXT = ".trm"  # Term index
    POSTS_EXT = ".pst"  # Term postings
    VPOSTS_EXT = ".vps"  # Vector postings
    COLUMN_EXT = ".col"  # Per-document value columns

    def __init__(self, blocklimit=128, compression=3, inlinelimit=1):
        self._blocklimit = blocklimit
        self._compression = compression
        self._inlinelimit = inlinelimit

    # def automata(self):

    # Per-document value writer
    def per_document_writer(self, storage, segment):
        return W3PerDocWriter(self, storage, segment)

    # Inverted index writer
    def field_writer(self, storage, segment):
        return W3FieldWriter(self, storage, segment)

    # Postings
    def postings_writer(self, dbfile, byteids=False):
        return W3PostingsWriter(dbfile, blocklimit=self._blocklimit,
                                byteids=byteids, compression=self._compression,
                                inlinelimit=self._inlinelimit)

    def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None):
        if terminfo.is_inlined():
            # If the postings were inlined into the terminfo object, pull them
            # out and use a ListMatcher to wrap them in a Matcher interface
            ids, weights, values = terminfo.inlined_postings()
            m = ListMatcher(ids, weights, values, format_, scorer=scorer,
                            term=term, terminfo=terminfo)
        else:
            offset, length = terminfo.extent()
            m = W3LeafMatcher(dbfile, offset, length, format_, term=term,
                              scorer=scorer)
        return m

    # Readers
    def per_document_reader(self, storage, segment):
        return W3PerDocReader(storage, segment)

    def terms_reader(self, storage, segment):
        tiname = segment.make_filename(self.TERMS_EXT)
        tilen = storage.file_length(tiname)
        tifile = storage.open_file(tiname)
        postfile = segment.open_file(storage, self.POSTS_EXT)
        return W3TermsReader(self, tifile, tilen, postfile)

    # Graph methods provided by CodecWithGraph

    # Columns
    def supports_columns(self):
        return True

    @classmethod
    def column_filename(cls, segment, fieldname):
        ext = "".join((".", fieldname, cls.COLUMN_EXT))
        return segment.make_filename(ext)

    # Segments and generations
    def new_segment(self, storage, indexname):
        return W3Segment(self, indexname)


# Common functions

def _vecfield(fieldname):
    return "_%s_vec" % fieldname


def _lenfield(fieldname):
    return "_%s_len" % fieldname
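
# For example (illustrative values): _vecfield("title") -> "_title_vec" and
# _lenfield("title") -> "_title_len". These synthesized per-field column names
# are how the writers and readers below locate vector offsets and field
# lengths without clashing with real field names.
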
# Per-doc information writer

class W3PerDocWriter(base.PerDocWriterWithColumns):
    def __init__(self, codec, storage, segment):
        self._codec = codec
        self._storage = storage
        self._segment = segment

        tempst = storage.temp_storage("%s.tmp" % segment.indexname)
        self._cols = compound.CompoundWriter(tempst)
        self._colwriters = {}
        self._create_column("_stored", STORED_COLUMN)

        self._fieldlengths = defaultdict(int)
        self._doccount = 0
        self._docnum = None
        self._storedfields = None
        self._indoc = False
        self.is_closed = False

        # We'll wait to create the vector file until someone actually tries
        # to add a vector
        self._vpostfile = None

    def _create_file(self, ext):
        return self._segment.create_file(self._storage, ext)

    def _has_column(self, fieldname):
        return fieldname in self._colwriters

    def _create_column(self, fieldname, column):
        writers = self._colwriters
        if fieldname in writers:
            raise Exception("Already added column %r" % fieldname)

        f = self._cols.create_file(fieldname)
        writers[fieldname] = column.writer(f)

    def _get_column(self, fieldname):
        return self._colwriters[fieldname]

    def _prep_vectors(self):
        self._vpostfile = self._create_file(W3Codec.VPOSTS_EXT)
        # We'll use offset==0 as a marker for "no vectors", so we can't start
        # postings at position 0, so just write a few header bytes :)
        self._vpostfile.write(b("VPST"))

    def start_doc(self, docnum):
        if self._indoc:
            raise Exception("Called start_doc when already in a doc")
        if docnum != self._doccount:
            raise Exception("Called start_doc(%r), expected docnum %r"
                            % (docnum, self._doccount))

        self._docnum = docnum
        self._doccount += 1
        self._storedfields = {}
        self._indoc = True

    def add_field(self, fieldname, fieldobj, value, length):
        if value is not None:
            self._storedfields[fieldname] = value
        if length:
            # Add byte to length column
            lenfield = _lenfield(fieldname)
            lb = length_to_byte(length)
            self.add_column_value(lenfield, LENGTHS_COLUMN, lb)
            # Add length to total field length
            self._fieldlengths[fieldname] += length

    def add_vector_items(self, fieldname, fieldobj, items):
        if not items:
            # Don't do anything if the list of items is empty
            return

        if self._vpostfile is None:
            self._prep_vectors()

        # Write vector postings
        vpostwriter = self._codec.postings_writer(self._vpostfile, byteids=True)
        vpostwriter.start_postings(fieldobj.vector, W3TermInfo())
        for text, weight, vbytes in items:
            vpostwriter.add_posting(text, weight, vbytes)
        # finish_postings() returns the terminfo object
        vinfo = vpostwriter.finish_postings()

        # Add row to vector lookup column
        vecfield = _vecfield(fieldname)  # Compute vector column name
        offset, length = vinfo.extent()
        assert offset != 0
        self.add_column_value(vecfield, VECTOR_COLUMN, offset)
        self.add_column_value(vecfield + "L", VECTOR_LEN_COLUMN, length)

    def finish_doc(self):
        sf = self._storedfields
        if sf:
            self.add_column_value("_stored", STORED_COLUMN, sf)
            sf.clear()
        self._indoc = False

    def _column_filename(self, fieldname):
        return W3Codec.column_filename(self._segment, fieldname)

    def close(self):
        if self._indoc:
            # Called close without calling finish_doc
            self.finish_doc()

        self._segment._fieldlengths = self._fieldlengths

        # Finish open columns and close the columns writer
        for writer in self._colwriters.values():
            writer.finish(self._doccount)
        self._cols.save_as_files(self._storage, self._column_filename)

        # If vectors were written, close the vector file
        if self._vpostfile:
            self._vpostfile.close()

        self.is_closed = True
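
# Sketch of the per-document write protocol this class implements (hedged:
# `codec`, `storage`, `segment`, and `fieldobj` stand in for objects normally
# supplied by the indexing machinery):
#
#     pdw = W3PerDocWriter(codec, storage, segment)
#     pdw.start_doc(0)
#     pdw.add_field("title", fieldobj, u"hello", length=1)
#     pdw.finish_doc()
#     pdw.close()
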
class W3FieldWriter(base.FieldWriter):
    def __init__(self, codec, storage, segment):
        self._codec = codec
        self._storage = storage
        self._segment = segment

        self._fieldname = None
        self._fieldid = None
        self._btext = None
        self._fieldobj = None
        self._format = None

        tifile = self._create_file(W3Codec.TERMS_EXT)
        self._tindex = filetables.OrderedHashWriter(tifile)
        self._fieldmap = self._tindex.extras["fieldmap"] = {}

        self._postfile = self._create_file(W3Codec.POSTS_EXT)

        self._postwriter = None
        self._infield = False
        self.is_closed = False

    def _create_file(self, ext):
        return self._segment.create_file(self._storage, ext)

    def start_field(self, fieldname, fieldobj):
        fmap = self._fieldmap
        if fieldname in fmap:
            self._fieldid = fmap[fieldname]
        else:
            self._fieldid = len(fmap)
            fmap[fieldname] = self._fieldid

        self._fieldname = fieldname
        self._fieldobj = fieldobj
        self._format = fieldobj.format
        self._infield = True

        # Start a new postwriter for this field
        self._postwriter = self._codec.postings_writer(self._postfile)

    def start_term(self, btext):
        if self._postwriter is None:
            raise Exception("Called start_term before start_field")
        self._btext = btext
        self._postwriter.start_postings(self._fieldobj.format, W3TermInfo())

    def add(self, docnum, weight, vbytes, length):
        self._postwriter.add_posting(docnum, weight, vbytes, length)

    def finish_term(self):
        terminfo = self._postwriter.finish_postings()

        # Add row to term info table
        keybytes = pack_ushort(self._fieldid) + self._btext
        valbytes = terminfo.to_bytes()
        self._tindex.add(keybytes, valbytes)

    # FieldWriterWithGraph.add_spell_word

    def finish_field(self):
        if not self._infield:
            raise Exception("Called finish_field before start_field")
        self._infield = False
        self._postwriter = None

    def close(self):
        self._tindex.close()
        self._postfile.close()
        self.is_closed = True
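
# Sketch of the inverted-index write protocol (hedged: illustrative objects
# and values; calls must come in this order, as the methods above enforce):
#
#     fw = W3FieldWriter(codec, storage, segment)
#     fw.start_field("title", fieldobj)
#     fw.start_term(b"hello")
#     fw.add(0, 1.0, b"", 1)  # docnum, weight, value bytes, field length
#     fw.finish_term()
#     fw.finish_field()
#     fw.close()
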
# Reader objects

class W3PerDocReader(base.PerDocumentReader):
    def __init__(self, storage, segment):
        self._storage = storage
        self._segment = segment
        self._doccount = segment.doc_count_all()

        self._vpostfile = None
        self._colfiles = {}
        self._readers = {}
        self._minlengths = {}
        self._maxlengths = {}

    def close(self):
        for colfile, _, _ in self._colfiles.values():
            colfile.close()
        if self._vpostfile:
            self._vpostfile.close()

    def doc_count(self):
        return self._doccount - self._segment.deleted_count()

    def doc_count_all(self):
        return self._doccount

    # Deletions
    def has_deletions(self):
        return self._segment.has_deletions()

    def is_deleted(self, docnum):
        return self._segment.is_deleted(docnum)

    def deleted_docs(self):
        return self._segment.deleted_docs()

    # Columns
    def has_column(self, fieldname):
        filename = W3Codec.column_filename(self._segment, fieldname)
        return self._storage.file_exists(filename)

    def _get_column_file(self, fieldname):
        filename = W3Codec.column_filename(self._segment, fieldname)
        length = self._storage.file_length(filename)
        colfile = self._storage.open_file(filename)
        return colfile, 0, length

    def column_reader(self, fieldname, column):
        if fieldname not in self._colfiles:
            self._colfiles[fieldname] = self._get_column_file(fieldname)
        colfile, offset, length = self._colfiles[fieldname]
        return column.reader(colfile, offset, length, self._doccount)

    # Lengths

    def _cached_reader(self, fieldname, column):
        if fieldname in self._readers:
            return self._readers[fieldname]
        else:
            if not self.has_column(fieldname):
                return None

            reader = self.column_reader(fieldname, column)
            self._readers[fieldname] = reader
            return reader

    def doc_field_length(self, docnum, fieldname, default=0):
        if docnum > self._doccount:
            raise IndexError("Asked for docnum %r of %d"
                             % (docnum, self._doccount))

        lenfield = _lenfield(fieldname)
        reader = self._cached_reader(lenfield, LENGTHS_COLUMN)
        if reader is None:
            return default

        lbyte = reader[docnum]
        if lbyte:
            return byte_to_length(lbyte)
        # A zero length byte means the field was empty in this document
        return default

    def field_length(self, fieldname):
        return self._segment._fieldlengths.get(fieldname, 0)

    def _minmax_length(self, fieldname, op, cache):
        if fieldname in cache:
            return cache[fieldname]

        lenfield = _lenfield(fieldname)
        reader = self._cached_reader(lenfield, LENGTHS_COLUMN)
        length = byte_to_length(op(reader))
        cache[fieldname] = length
        return length

    def min_field_length(self, fieldname):
        return self._minmax_length(fieldname, min, self._minlengths)

    def max_field_length(self, fieldname):
        return self._minmax_length(fieldname, max, self._maxlengths)

    # Vectors

    def _prep_vectors(self):
        f = self._segment.open_file(self._storage, W3Codec.VPOSTS_EXT)
        self._vpostfile = f

    def _vector_extent(self, docnum, fieldname):
        if docnum > self._doccount:
            raise IndexError("Asked for document %r of %d"
                             % (docnum, self._doccount))
        vecfield = _vecfield(fieldname)  # Compute vector column name

        # Get the offset from the vector offset column
        offset = self._cached_reader(vecfield, VECTOR_COLUMN)[docnum]

        # Get the length from the length column, if it exists, otherwise return
        # -1 for the length (backwards compatibility with old dev versions)
        lreader = self._cached_reader(vecfield + "L", VECTOR_LEN_COLUMN)
        if lreader:
            length = lreader[docnum]
        else:
            length = -1

        return offset, length

    def has_vector(self, docnum, fieldname):
        if self.has_column(_vecfield(fieldname)):
            offset, length = self._vector_extent(docnum, fieldname)
            return offset != 0
        return False

    def vector(self, docnum, fieldname, format_):
        if self._vpostfile is None:
            self._prep_vectors()
        offset, length = self._vector_extent(docnum, fieldname)
        if not offset:
            raise Exception("Field %r has no vector in docnum %s"
                            % (fieldname, docnum))
        m = W3LeafMatcher(self._vpostfile, offset, length, format_,
                          byteids=True)
        return m

    # Stored fields
    def stored_fields(self, docnum):
        reader = self._cached_reader("_stored", STORED_COLUMN)
        v = reader[docnum]
        if v is None:
            v = {}
        return v


class W3FieldCursor(base.FieldCursor):
    def __init__(self, tindex, fieldname, keycoder, keydecoder, fieldobj):
        self._tindex = tindex
        self._fieldname = fieldname
        self._keycoder = keycoder
        self._keydecoder = keydecoder
        self._fieldobj = fieldobj

        prefixbytes = keycoder(fieldname, emptybytes)
        self._startpos = self._tindex.closest_key_pos(prefixbytes)

        self._pos = self._startpos
        self._text = None
        self._datapos = None
        self._datalen = None
        self.next()

    def first(self):
        self._pos = self._startpos
        return self.next()

    def find(self, term):
        if not isinstance(term, bytes_type):
            term = self._fieldobj.to_bytes(term)
        key = self._keycoder(self._fieldname, term)
        self._pos = self._tindex.closest_key_pos(key)
        return self.next()

    def next(self):
        if self._pos is not None:
            keyrng = self._tindex.key_and_range_at(self._pos)
            if keyrng is not None:
                keybytes, datapos, datalen = keyrng
                fname, text = self._keydecoder(keybytes)
                if fname == self._fieldname:
                    self._pos = datapos + datalen
                    self._text = self._fieldobj.from_bytes(text)
                    self._datapos = datapos
                    self._datalen = datalen
                    return self._text

        self._text = self._pos = self._datapos = self._datalen = None
        return None

    def text(self):
        return self._text

    def term_info(self):
        if self._pos is None:
            return None

        databytes = self._tindex.dbfile.get(self._datapos, self._datalen)
        return W3TermInfo.from_bytes(databytes)

    def is_valid(self):
        return self._pos is not None


class W3TermsReader(base.TermsReader):
    def __init__(self, codec, dbfile, length, postfile):
        self._codec = codec
        self._dbfile = dbfile
        self._tindex = filetables.OrderedHashReader(dbfile, length)
        self._fieldmap = self._tindex.extras["fieldmap"]
        self._postfile = postfile

        self._fieldunmap = [None] * len(self._fieldmap)
        for fieldname, num in iteritems(self._fieldmap):
            self._fieldunmap[num] = fieldname

    def _keycoder(self, fieldname, tbytes):
        assert isinstance(tbytes, bytes_type), "tbytes=%r" % tbytes
        fnum = self._fieldmap.get(fieldname, 65535)
        return pack_ushort(fnum) + tbytes

    def _keydecoder(self, keybytes):
        fieldid = unpack_ushort(keybytes[:_SHORT_SIZE])[0]
        return self._fieldunmap[fieldid], keybytes[_SHORT_SIZE:]

    def _range_for_key(self, fieldname, tbytes):
        return self._tindex.range_for_key(self._keycoder(fieldname, tbytes))

    def __contains__(self, term):
        return self._keycoder(*term) in self._tindex

    def indexed_field_names(self):
        return self._fieldmap.keys()

    def cursor(self, fieldname, fieldobj):
        tindex = self._tindex
        coder = self._keycoder
        decoder = self._keydecoder
        return W3FieldCursor(tindex, fieldname, coder, decoder, fieldobj)

    def terms(self):
        keydecoder = self._keydecoder
        return (keydecoder(keybytes) for keybytes in self._tindex.keys())

    def terms_from(self, fieldname, prefix):
        prefixbytes = self._keycoder(fieldname, prefix)
        keydecoder = self._keydecoder
        return (keydecoder(keybytes) for keybytes
                in self._tindex.keys_from(prefixbytes))

    def items(self):
        tidecoder = W3TermInfo.from_bytes
        keydecoder = self._keydecoder
        return ((keydecoder(keybytes), tidecoder(valbytes))
                for keybytes, valbytes in self._tindex.items())

    def items_from(self, fieldname, prefix):
        prefixbytes = self._keycoder(fieldname, prefix)
        tidecoder = W3TermInfo.from_bytes
        keydecoder = self._keydecoder
        return ((keydecoder(keybytes), tidecoder(valbytes))
                for keybytes, valbytes in self._tindex.items_from(prefixbytes))

    def term_info(self, fieldname, tbytes):
        key = self._keycoder(fieldname, tbytes)
        try:
            return W3TermInfo.from_bytes(self._tindex[key])
        except KeyError:
            raise TermNotFound("No term %s:%r" % (fieldname, tbytes))

    def frequency(self, fieldname, tbytes):
        datapos = self._range_for_key(fieldname, tbytes)[0]
        return W3TermInfo.read_weight(self._dbfile, datapos)

    def doc_frequency(self, fieldname, tbytes):
        datapos = self._range_for_key(fieldname, tbytes)[0]
        return W3TermInfo.read_doc_freq(self._dbfile, datapos)

    def matcher(self, fieldname, tbytes, format_, scorer=None):
        terminfo = self.term_info(fieldname, tbytes)
        m = self._codec.postings_reader(self._postfile, terminfo, format_,
                                        term=(fieldname, tbytes), scorer=scorer)
        return m

    def close(self):
        self._tindex.close()
        self._postfile.close()


# Postings

class W3PostingsWriter(base.PostingsWriter):
    """This object writes posting lists to the postings file. It groups
    postings into blocks and tracks block-level statistics to make it easier
    to skip through the postings.
    """
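
    # Sketch of the write protocol (hedged: `postfile` is an open postings
    # file and `fmt` a formats.Format instance; both names are illustrative):
    #
    #     pw = W3PostingsWriter(postfile, blocklimit=128)
    #     pw.start_postings(fmt, W3TermInfo())
    #     pw.add_posting(0, 1.0, b"", length=5)
    #     terminfo = pw.finish_postings()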

    def __init__(self, postfile, blocklimit, byteids=False, compression=3,
                 inlinelimit=1):
        self._postfile = postfile
        self._blocklimit = blocklimit
        self._byteids = byteids
        self._compression = compression
        self._inlinelimit = inlinelimit

        self._blockcount = 0
        self._format = None
        self._terminfo = None

    def written(self):
        return self._blockcount > 0

    def start_postings(self, format_, terminfo):
        # Start a new term
        if self._terminfo:
            # If self._terminfo is not None, that means we are already in a term
            raise Exception("Called start_postings while already in a term")

        assert isinstance(format_, formats.Format)
        self._format = format_
        # Reset block count
        self._blockcount = 0
        # Reset block buffer
        self._new_block()
        # Remember terminfo object passed to us
        self._terminfo = terminfo
        # Remember where we started in the posting file
        self._startoffset = self._postfile.tell()

    def add_posting(self, id_, weight, vbytes, length=None):
        # Add a posting to the buffered block

        # If the number of buffered postings == the block limit, write out the
        # buffered block and reset before adding this one
        if len(self._ids) >= self._blocklimit:
            self._write_block()

        # Check types
        if self._byteids:
            assert isinstance(id_, string_type), "id_=%r" % id_
        else:
            assert isinstance(id_, integer_types), "id_=%r" % id_
        assert isinstance(weight, (int, float)), "weight=%r" % weight
        assert isinstance(vbytes, bytes_type), "vbytes=%r" % vbytes
        assert length is None or isinstance(length, integer_types)

        self._ids.append(id_)
        self._weights.append(weight)

        if weight > self._maxweight:
            self._maxweight = weight
        if vbytes:
            self._values.append(vbytes)
        if length:
            minlength = self._minlength
            if minlength is None or length < minlength:
                self._minlength = length
            if length > self._maxlength:
                self._maxlength = length

    def finish_postings(self):
        terminfo = self._terminfo
        # If we have fewer than "inlinelimit" postings in this posting list,
        # "inline" the postings into the terminfo instead of writing them to
        # the posting file
        if not self.written() and len(self) < self._inlinelimit:
            terminfo.add_block(self)
            terminfo.set_inlined(self._ids, self._weights, self._values)
        else:
            # If there are leftover items in the current block, write them out
            if self._ids:
                self._write_block(last=True)
            startoffset = self._startoffset
            length = self._postfile.tell() - startoffset
            terminfo.set_extent(startoffset, length)

        # Clear self._terminfo to indicate we're between terms
        self._terminfo = None
        # Return the current terminfo object
        return terminfo

    def _new_block(self):
        # Reset block buffer

        # List of IDs (docnums for regular posting list, terms for vector PL)
        self._ids = [] if self._byteids else array("I")
        # List of weights
        self._weights = array("f")
        # List of encoded payloads
        self._values = []

        # Statistics
        self._minlength = None
        self._maxlength = 0
        self._maxweight = 0

    def _write_block(self, last=False):
        # Write the buffered block to the postings file

        # If this is the first block, write a small header first
        if not self._blockcount:
            self._postfile.write(WHOOSH3_HEADER_MAGIC)

        # Add this block's statistics to the terminfo object, which tracks the
        # overall statistics for all term postings
        self._terminfo.add_block(self)

        # Minify the IDs, weights, and values, and put them in a tuple
        data = (self._mini_ids(), self._mini_weights(), self._mini_values())
        # Pickle the tuple
        databytes = dumps(data, 2)
        # If the pickle is less than 20 bytes, don't bother compressing
        if len(databytes) < 20:
            comp = 0
        # Otherwise, compress the pickle (if self._compression > 0)
        else:
            comp = self._compression
            if comp:
                databytes = zlib.compress(databytes, comp)

        # Make a tuple of block info. The posting reader can check this info
        # and decide whether to skip the block without having to decompress the
        # full block data
        #
        # - Number of postings in block
        # - Last ID in block
        # - Maximum weight in block
        # - Compression level
        # - Minimum length byte
        # - Maximum length byte
        ids = self._ids
        infobytes = dumps((len(ids), ids[-1], self._maxweight, comp,
                           length_to_byte(self._minlength),
                           length_to_byte(self._maxlength),
                           ), 2)

        # Write block length
        postfile = self._postfile
        blocklength = len(infobytes) + len(databytes)
        if last:
            # If this is the last block, use a negative number
            blocklength *= -1
        postfile.write_int(blocklength)
        # Write block info
        postfile.write(infobytes)
        # Write block data
        postfile.write(databytes)

        self._blockcount += 1
        # Reset block buffer
        self._new_block()
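
    # On-disk layout of each block, as written above:
    #
    #     int32 block length (negated for the final block of a posting list)
    #     pickled info tuple (count, last ID, max weight, compression level,
    #                         min/max length bytes)
    #     pickled (ids, weights, values) tuple, zlib-compressed when the
    #     compression level is nonzero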

    # Methods to reduce the byte size of the various lists

    def _mini_ids(self):
        # Minify IDs
        ids = self._ids
        if not self._byteids:
            ids = delta_encode(ids)
        return tuple(ids)

    def _mini_weights(self):
        # Minify weights
        weights = self._weights

        if all(w == 1.0 for w in weights):
            return None
        elif all(w == weights[0] for w in weights):
            return weights[0]
        else:
            return tuple(weights)

    def _mini_values(self):
        # Minify values
        fixedsize = self._format.fixed_value_size()
        values = self._values

        if fixedsize is None or fixedsize < 0:
            vs = tuple(values)
        elif fixedsize == 0:
            vs = None
        else:
            vs = emptybytes.join(values)
        return vs

    # Block stats methods

    def __len__(self):
        # Returns the number of unwritten buffered postings
        return len(self._ids)

    def min_id(self):
        # First ID in the buffered block
        return self._ids[0]

    def max_id(self):
        # Last ID in the buffered block
        return self._ids[-1]

    def min_length(self):
        # Shortest field length in the buffered block
        return self._minlength

    def max_length(self):
        # Longest field length in the buffered block
        return self._maxlength

    def max_weight(self):
        # Highest weight in the buffered block
        return self._maxweight


class W3LeafMatcher(LeafMatcher):
    """Reads on-disk postings from the postings file and presents the
    :class:`whoosh.matching.Matcher` interface.
    """
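
    # Reading sketch (hedged: `postfile` is an open postings file and `ti` a
    # W3TermInfo whose postings were written to disk rather than inlined;
    # `fmt` is the field's formats.Format instance):
    #
    #     offset, length = ti.extent()
    #     m = W3LeafMatcher(postfile, offset, length, fmt)
    #     while m.is_active():
    #         print(m.id(), m.weight())
    #         m.next()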

    def __init__(self, postfile, startoffset, length, format_, term=None,
                 byteids=None, scorer=None):
        self._postfile = postfile
        self._startoffset = startoffset
        self._length = length
        self.format = format_
        self._term = term
        self._byteids = byteids
        self.scorer = scorer

        self._fixedsize = self.format.fixed_value_size()
        # Read the header tag at the start of the postings
        self._read_header()
        # "Reset" to read the first block
        self.reset()

    def _read_header(self):
        # Seek to the start of the postings and check the header tag
        postfile = self._postfile

        postfile.seek(self._startoffset)
        magic = postfile.read(4)
        if magic != WHOOSH3_HEADER_MAGIC:
            raise Exception("Block tag error %r" % magic)

        # Remember the base offset (start of postings, after the header)
        self._baseoffset = postfile.tell()

    def reset(self):
        # Reset block stats
        self._blocklength = None
        self._maxid = None
        self._maxweight = None
        self._compression = None
        self._minlength = None
        self._maxlength = None

        self._lastblock = False
        self._atend = False
        # Consume first block
        self._goto(self._baseoffset)

    def _goto(self, position):
        # Read the posting block at the given position

        postfile = self._postfile

        # Reset block data -- we'll lazy load the data from the new block as
        # needed
        self._data = None
        self._ids = None
        self._weights = None
        self._values = None
        # Reset pointer into the block
        self._i = 0

        # Seek to the start of the block
        postfile.seek(position)
        # Read the block length
        length = postfile.read_int()
        # If the block length is negative, that means this is the last block
        if length < 0:
            self._lastblock = True
            length *= -1

        # Remember the offset of the next block
        self._nextoffset = position + _INT_SIZE + length
        # Read the pickled block info tuple
        info = postfile.read_pickle()
        # Remember the offset of the block's data
        self._dataoffset = postfile.tell()

        # Decompose the info tuple to set the current block info
        (self._blocklength, self._maxid, self._maxweight, self._compression,
         mnlen, mxlen) = info
        self._minlength = byte_to_length(mnlen)
        self._maxlength = byte_to_length(mxlen)

    def _next_block(self):
        if self._atend:
            # We were already at the end, and yet somebody called _next_block()
            # again, so something is wrong somewhere
            raise Exception("No next block")
        elif self._lastblock:
            # Reached the end of the postings
            self._atend = True
        else:
            # Go to the next block
            self._goto(self._nextoffset)

    def _skip_to_block(self, skipwhile):
        # Skip blocks as long as the skipwhile() function returns True

        skipped = 0
        while self.is_active() and skipwhile():
            self._next_block()
            skipped += 1
        return skipped

    def is_active(self):
        return not self._atend and self._i < self._blocklength

    def id(self):
        # Get the current ID (docnum for regular postings, term for vector)

        # If we haven't loaded the block IDs yet, load them now
        if self._ids is None:
            self._read_ids()

        return self._ids[self._i]

    def weight(self):
        # Get the weight for the current posting

        # If we haven't loaded the block weights yet, load them now
        if self._weights is None:
            self._read_weights()

        return self._weights[self._i]

    def value(self):
        # Get the value for the current posting

        # If we haven't loaded the block values yet, load them now
        if self._values is None:
            self._read_values()

        return self._values[self._i]

    def next(self):
        # Move to the next posting

        # Increment the in-block pointer
        self._i += 1
        # If we reached the end of the block, move to the next block
        if self._i == self._blocklength:
            self._next_block()
            return True
        else:
            return False

    def skip_to(self, targetid):
        # Skip to the next ID equal to or greater than the given target ID

        if not self.is_active():
            raise ReadTooFar

        # If we're already at or past the target ID, do nothing
        if targetid <= self.id():
            return

        # Skip to the block that would contain the target ID
        block_max_id = self.block_max_id
        if targetid > block_max_id():
            self._skip_to_block(lambda: targetid > block_max_id())

        # Iterate through the IDs in the block until we find or pass the
        # target
        while self.is_active() and self.id() < targetid:
            self.next()

    def skip_to_quality(self, minquality):
        # Skip blocks until we find one that might exceed the given minimum
        # quality
        block_quality = self.block_quality

        # If the quality of this block is already higher than the minimum,
        # do nothing
        if block_quality() > minquality:
            return 0

        # Skip blocks as long as the block quality is not greater than the
        # minimum
        return self._skip_to_block(lambda: block_quality() <= minquality)

    def block_min_id(self):
        if self._ids is None:
            self._read_ids()
        return self._ids[0]

    def block_max_id(self):
        return self._maxid

    def block_min_length(self):
        return self._minlength

    def block_max_length(self):
        return self._maxlength

    def block_max_weight(self):
        return self._maxweight

    def _read_data(self):
        # Load block data tuple from disk

        datalen = self._nextoffset - self._dataoffset
        b = self._postfile.get(self._dataoffset, datalen)

        # Decompress the pickled data if necessary
        if self._compression:
            b = zlib.decompress(b)

        # Unpickle the data tuple and save it in an attribute
        self._data = loads(b)

    def _read_ids(self):
        # If we haven't loaded the data from disk yet, load it now
        if self._data is None:
            self._read_data()
        ids = self._data[0]

        # De-minify the IDs
        if not self._byteids:
            ids = tuple(delta_decode(ids))

        self._ids = ids

    def _read_weights(self):
        # If we haven't loaded the data from disk yet, load it now
        if self._data is None:
            self._read_data()
        weights = self._data[1]

        # De-minify the weights
        postcount = self._blocklength
        if weights is None:
            self._weights = array("f", (1.0 for _ in xrange(postcount)))
        elif isinstance(weights, float):
            self._weights = array("f", (weights for _ in xrange(postcount)))
        else:
            self._weights = weights

    def _read_values(self):
        # If we haven't loaded the data from disk yet, load it now
        if self._data is None:
            self._read_data()

        # De-minify the values
        fixedsize = self._fixedsize
        vs = self._data[2]
        if fixedsize is None or fixedsize < 0:
            self._values = vs
        elif fixedsize == 0:
            self._values = (None,) * self._blocklength
        else:
            assert isinstance(vs, bytes_type)
            self._values = tuple(vs[i:i + fixedsize]
                                 for i in xrange(0, len(vs), fixedsize))


# Term info implementation

class W3TermInfo(TermInfo):
    # B | Flags
    # f | Total weight
    # I | Total doc freq
    # B | Min length (encoded as byte)
    # B | Max length (encoded as byte)
    # f | Max weight
    # I | Minimum (first) ID
    # I | Maximum (last) ID
    _struct = struct.Struct("!BfIBBfII")
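
    # The "!" (network) prefix means no padding, so the fixed header is
    # 1 + 4 + 4 + 1 + 1 + 4 + 4 + 4 = 23 bytes. The variable tail that follows
    # is either a pickled inline-postings tuple or a long/int extent pointer.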

    def __init__(self, *args, **kwargs):
        TermInfo.__init__(self, *args, **kwargs)
        self._offset = None
        self._length = None
        self._inlined = None

    def add_block(self, block):
        self._weight += sum(block._weights)
        self._df += len(block)

        ml = block.min_length()
        if self._minlength is None:
            self._minlength = ml
        else:
            self._minlength = min(self._minlength, ml)

        self._maxlength = max(self._maxlength, block.max_length())
        self._maxweight = max(self._maxweight, block.max_weight())
        if self._minid is None:
            self._minid = block.min_id()
        self._maxid = block.max_id()

    def set_extent(self, offset, length):
        self._offset = offset
        self._length = length

    def extent(self):
        return self._offset, self._length

    def set_inlined(self, ids, weights, values):
        self._inlined = (tuple(ids), tuple(weights), tuple(values))

    def is_inlined(self):
        return self._inlined is not None

    def inlined_postings(self):
        return self._inlined

    def to_bytes(self):
        isinlined = self.is_inlined()

        # Encode the lengths as 0-255 values
        minlength = (0 if self._minlength is None
                     else length_to_byte(self._minlength))
        maxlength = length_to_byte(self._maxlength)
        # Convert None values to the out-of-band value 0xffffffff so they can
        # be stored as unsigned ints
        minid = 0xffffffff if self._minid is None else self._minid
        maxid = 0xffffffff if self._maxid is None else self._maxid

        # Pack the term info into bytes
        st = self._struct.pack(isinlined, self._weight, self._df,
                               minlength, maxlength, self._maxweight,
                               minid, maxid)

        if isinlined:
            # Postings are inlined - dump them using the pickle protocol
            postbytes = dumps(self._inlined, 2)
        else:
            postbytes = pack_long(self._offset) + pack_int(self._length)
        st += postbytes
        return st

    @classmethod
    def from_bytes(cls, s):
        st = cls._struct
        vals = st.unpack(s[:st.size])
        terminfo = cls()

        flags = vals[0]
        terminfo._weight = vals[1]
        terminfo._df = vals[2]
        terminfo._minlength = byte_to_length(vals[3])
        terminfo._maxlength = byte_to_length(vals[4])
        terminfo._maxweight = vals[5]
        terminfo._minid = None if vals[6] == 0xffffffff else vals[6]
        terminfo._maxid = None if vals[7] == 0xffffffff else vals[7]

        if flags:
            # Postings are stored inline
            terminfo._inlined = loads(s[st.size:])
        else:
            # Last bytes are pointer into posting file and length
            offpos = st.size
            lenpos = st.size + _LONG_SIZE
            terminfo._offset = unpack_long(s[offpos:lenpos])[0]
            terminfo._length = unpack_int(s[lenpos:lenpos + _INT_SIZE])[0]

        return terminfo
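
    # Round-trip sketch (hedged, illustrative values): for an on-disk term,
    #
    #     ti = W3TermInfo()
    #     ti.set_extent(100, 64)
    #     W3TermInfo.from_bytes(ti.to_bytes()).extent() == (100, 64)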

    @classmethod
    def read_weight(cls, dbfile, datapos):
        return dbfile.get_float(datapos + 1)

    @classmethod
    def read_doc_freq(cls, dbfile, datapos):
        return dbfile.get_uint(datapos + 1 + _FLOAT_SIZE)

    @classmethod
    def read_min_and_max_length(cls, dbfile, datapos):
        lenpos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE
        ml = byte_to_length(dbfile.get_byte(lenpos))
        xl = byte_to_length(dbfile.get_byte(lenpos + 1))
        return ml, xl

    @classmethod
    def read_max_weight(cls, dbfile, datapos):
        weightspos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE + 2
        return dbfile.get_float(weightspos)


# Segment implementation

class W3Segment(base.Segment):
    def __init__(self, codec, indexname, doccount=0, segid=None, deleted=None):
        self.indexname = indexname
        self.segid = self._random_id() if segid is None else segid

        self._codec = codec
        self._doccount = doccount
        self._deleted = deleted
        self.compound = False

    def codec(self, **kwargs):
        return self._codec

    def set_doc_count(self, dc):
        self._doccount = dc

    def doc_count_all(self):
        return self._doccount

    def deleted_count(self):
        if self._deleted is None:
            return 0
        return len(self._deleted)

    def deleted_docs(self):
        if self._deleted is None:
            return ()
        else:
            return iter(self._deleted)

    def delete_document(self, docnum, delete=True):
        if delete:
            if self._deleted is None:
                self._deleted = set()
            self._deleted.add(docnum)
        elif self._deleted is not None and docnum in self._deleted:
            # set.clear() takes no arguments; discard() removes a single docnum
            self._deleted.discard(docnum)

    def is_deleted(self, docnum):
        if self._deleted is None:
            return False
        return docnum in self._deleted