- # Copyright 2012 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- """
- This module implements a "codec" for writing/reading Whoosh 3 indexes.
- """
- import struct
- from array import array
- from collections import defaultdict
- from whoosh import columns, formats
- from whoosh.compat import b, bytes_type, string_type, integer_types
- from whoosh.compat import dumps, loads, iteritems, xrange
- from whoosh.codec import base
- from whoosh.filedb import compound, filetables
- from whoosh.matching import ListMatcher, ReadTooFar, LeafMatcher
- from whoosh.reading import TermInfo, TermNotFound
- from whoosh.system import emptybytes
- from whoosh.system import _SHORT_SIZE, _INT_SIZE, _LONG_SIZE, _FLOAT_SIZE
- from whoosh.system import pack_ushort, unpack_ushort
- from whoosh.system import pack_int, unpack_int, pack_long, unpack_long
- from whoosh.util.numlists import delta_encode, delta_decode
- from whoosh.util.numeric import length_to_byte, byte_to_length
- try:
- import zlib
- except ImportError:
- zlib = None
- # This byte sequence is written at the start of a posting list to identify the
- # codec/version
- WHOOSH3_HEADER_MAGIC = b("W3Bl")
- # Column type to store field length info
- LENGTHS_COLUMN = columns.NumericColumn("B", default=0)
- # Column type to store pointers to vector posting lists
- VECTOR_COLUMN = columns.NumericColumn("I")
- # Column type to store vector posting list lengths
- VECTOR_LEN_COLUMN = columns.NumericColumn("i")
- # Column type to store values of stored fields
- STORED_COLUMN = columns.PickleColumn(columns.CompressedBytesColumn())
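- # The length columns above store one byte per document: field lengths are
- # squashed through length_to_byte()/byte_to_length() (imported above), trading
- # precision for space. A minimal sketch of the round trip; the decoded value
- # is only an approximation for large lengths, and the exact numbers are
- # illustrative, not guaranteed:
- #
- #     lb = length_to_byte(1000)    # single 0-255 code for a length of 1000
- #     approx = byte_to_length(lb)  # decodes back to roughly 1000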
- class W3Codec(base.Codec):
- # File extensions
- TERMS_EXT = ".trm" # Term index
- POSTS_EXT = ".pst" # Term postings
- VPOSTS_EXT = ".vps" # Vector postings
- COLUMN_EXT = ".col" # Per-document value columns
- def __init__(self, blocklimit=128, compression=3, inlinelimit=1):
- self._blocklimit = blocklimit
- self._compression = compression
- self._inlinelimit = inlinelimit
- # def automata(self):
- # Per-document value writer
- def per_document_writer(self, storage, segment):
- return W3PerDocWriter(self, storage, segment)
- # Inverted index writer
- def field_writer(self, storage, segment):
- return W3FieldWriter(self, storage, segment)
- # Postings
- def postings_writer(self, dbfile, byteids=False):
- return W3PostingsWriter(dbfile, blocklimit=self._blocklimit,
- byteids=byteids, compression=self._compression,
- inlinelimit=self._inlinelimit)
- def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None):
- if terminfo.is_inlined():
- # If the postings were inlined into the terminfo object, pull them
- # out and use a ListMatcher to wrap them in a Matcher interface
- ids, weights, values = terminfo.inlined_postings()
- m = ListMatcher(ids, weights, values, format_, scorer=scorer,
- term=term, terminfo=terminfo)
- else:
- offset, length = terminfo.extent()
- m = W3LeafMatcher(dbfile, offset, length, format_, term=term,
- scorer=scorer)
- return m
- # Readers
- def per_document_reader(self, storage, segment):
- return W3PerDocReader(storage, segment)
- def terms_reader(self, storage, segment):
- tiname = segment.make_filename(self.TERMS_EXT)
- tilen = storage.file_length(tiname)
- tifile = storage.open_file(tiname)
- postfile = segment.open_file(storage, self.POSTS_EXT)
- return W3TermsReader(self, tifile, tilen, postfile)
- # Graph methods provided by CodecWithGraph
- # Columns
- def supports_columns(self):
- return True
- @classmethod
- def column_filename(cls, segment, fieldname):
- ext = "".join((".", fieldname, cls.COLUMN_EXT))
- return segment.make_filename(ext)
- # Segments and generations
- def new_segment(self, storage, indexname):
- return W3Segment(self, indexname)
- # Common functions
- def _vecfield(fieldname):
- return "_%s_vec" % fieldname
- def _lenfield(fieldname):
- return "_%s_len" % fieldname
- # Per-doc information writer
- class W3PerDocWriter(base.PerDocWriterWithColumns):
- def __init__(self, codec, storage, segment):
- self._codec = codec
- self._storage = storage
- self._segment = segment
- tempst = storage.temp_storage("%s.tmp" % segment.indexname)
- self._cols = compound.CompoundWriter(tempst)
- self._colwriters = {}
- self._create_column("_stored", STORED_COLUMN)
- self._fieldlengths = defaultdict(int)
- self._doccount = 0
- self._docnum = None
- self._storedfields = None
- self._indoc = False
- self.is_closed = False
- # We'll wait to create the vector file until someone actually tries
- # to add a vector
- self._vpostfile = None
- def _create_file(self, ext):
- return self._segment.create_file(self._storage, ext)
- def _has_column(self, fieldname):
- return fieldname in self._colwriters
- def _create_column(self, fieldname, column):
- writers = self._colwriters
- if fieldname in writers:
- raise Exception("Already added column %r" % fieldname)
- f = self._cols.create_file(fieldname)
- writers[fieldname] = column.writer(f)
- def _get_column(self, fieldname):
- return self._colwriters[fieldname]
- def _prep_vectors(self):
- self._vpostfile = self._create_file(W3Codec.VPOSTS_EXT)
- # We'll use offset==0 as a marker for "no vectors", so postings can't start
- # at position 0; just write a few header bytes to pad the start of the file
- self._vpostfile.write(b("VPST"))
- def start_doc(self, docnum):
- if self._indoc:
- raise Exception("Called start_doc when already in a doc")
- if docnum != self._doccount:
- raise Exception("Called start_doc(%r) was expecting %r"
- % (docnum, self._doccount))
- self._docnum = docnum
- self._doccount += 1
- self._storedfields = {}
- self._indoc = True
- def add_field(self, fieldname, fieldobj, value, length):
- if value is not None:
- self._storedfields[fieldname] = value
- if length:
- # Add byte to length column
- lenfield = _lenfield(fieldname)
- lb = length_to_byte(length)
- self.add_column_value(lenfield, LENGTHS_COLUMN, lb)
- # Add length to total field length
- self._fieldlengths[fieldname] += length
- def add_vector_items(self, fieldname, fieldobj, items):
- if not items:
- # Don't do anything if the list of items is empty
- return
- if self._vpostfile is None:
- self._prep_vectors()
- # Write vector postings
- vpostwriter = self._codec.postings_writer(self._vpostfile, byteids=True)
- vpostwriter.start_postings(fieldobj.vector, W3TermInfo())
- for text, weight, vbytes in items:
- vpostwriter.add_posting(text, weight, vbytes)
- # finish_postings() returns terminfo object
- vinfo = vpostwriter.finish_postings()
- # Add row to vector lookup column
- vecfield = _vecfield(fieldname) # Compute vector column name
- offset, length = vinfo.extent()
- assert offset != 0
- self.add_column_value(vecfield, VECTOR_COLUMN, offset)
- self.add_column_value(vecfield + "L", VECTOR_LEN_COLUMN, length)
- def finish_doc(self):
- sf = self._storedfields
- if sf:
- self.add_column_value("_stored", STORED_COLUMN, sf)
- sf.clear()
- self._indoc = False
- def _column_filename(self, fieldname):
- return W3Codec.column_filename(self._segment, fieldname)
- def close(self):
- if self._indoc:
- # Called close without calling finish_doc
- self.finish_doc()
- self._segment._fieldlengths = self._fieldlengths
- # Finish open columns and close the columns writer
- for writer in self._colwriters.values():
- writer.finish(self._doccount)
- self._cols.save_as_files(self._storage, self._column_filename)
- # If vectors were written, close the vector writers
- if self._vpostfile:
- self._vpostfile.close()
- self.is_closed = True
- class W3FieldWriter(base.FieldWriter):
- def __init__(self, codec, storage, segment):
- self._codec = codec
- self._storage = storage
- self._segment = segment
- self._fieldname = None
- self._fieldid = None
- self._btext = None
- self._fieldobj = None
- self._format = None
- _tifile = self._create_file(W3Codec.TERMS_EXT)
- self._tindex = filetables.OrderedHashWriter(_tifile)
- self._fieldmap = self._tindex.extras["fieldmap"] = {}
- self._postfile = self._create_file(W3Codec.POSTS_EXT)
- self._postwriter = None
- self._infield = False
- self.is_closed = False
- def _create_file(self, ext):
- return self._segment.create_file(self._storage, ext)
- def start_field(self, fieldname, fieldobj):
- fmap = self._fieldmap
- if fieldname in fmap:
- self._fieldid = fmap[fieldname]
- else:
- self._fieldid = len(fmap)
- fmap[fieldname] = self._fieldid
- self._fieldname = fieldname
- self._fieldobj = fieldobj
- self._format = fieldobj.format
- self._infield = True
- # Start a new postwriter for this field
- self._postwriter = self._codec.postings_writer(self._postfile)
- def start_term(self, btext):
- if self._postwriter is None:
- raise Exception("Called start_term before start_field")
- self._btext = btext
- self._postwriter.start_postings(self._fieldobj.format, W3TermInfo())
- def add(self, docnum, weight, vbytes, length):
- self._postwriter.add_posting(docnum, weight, vbytes, length)
- def finish_term(self):
- terminfo = self._postwriter.finish_postings()
- # Add row to term info table
- keybytes = pack_ushort(self._fieldid) + self._btext
- valbytes = terminfo.to_bytes()
- self._tindex.add(keybytes, valbytes)
- # FieldWriterWithGraph.add_spell_word
- def finish_field(self):
- if not self._infield:
- raise Exception("Called finish_field before start_field")
- self._infield = False
- self._postwriter = None
- def close(self):
- self._tindex.close()
- self._postfile.close()
- self.is_closed = True
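- # The matching sketch for the inverted-index side, assuming the same codec,
- # storage and segment objects as the per-document sketch above; the term bytes
- # and encoded value bytes would normally come from the field's format object:
- #
- #     fw = codec.field_writer(storage, segment)
- #     fw.start_field("content", fieldobj)
- #     fw.start_term(b("cat"))
- #     fw.add(0, 2.0, b(""), 3)   # docnum, weight, value bytes, field length
- #     fw.finish_term()
- #     fw.finish_field()
- #     fw.close()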
- # Reader objects
- class W3PerDocReader(base.PerDocumentReader):
- def __init__(self, storage, segment):
- self._storage = storage
- self._segment = segment
- self._doccount = segment.doc_count_all()
- self._vpostfile = None
- self._colfiles = {}
- self._readers = {}
- self._minlengths = {}
- self._maxlengths = {}
- def close(self):
- for colfile, _, _ in self._colfiles.values():
- colfile.close()
- if self._vpostfile:
- self._vpostfile.close()
- def doc_count(self):
- return self._doccount - self._segment.deleted_count()
- def doc_count_all(self):
- return self._doccount
- # Deletions
- def has_deletions(self):
- return self._segment.has_deletions()
- def is_deleted(self, docnum):
- return self._segment.is_deleted(docnum)
- def deleted_docs(self):
- return self._segment.deleted_docs()
- # Columns
- def has_column(self, fieldname):
- filename = W3Codec.column_filename(self._segment, fieldname)
- return self._storage.file_exists(filename)
- def _get_column_file(self, fieldname):
- filename = W3Codec.column_filename(self._segment, fieldname)
- length = self._storage.file_length(filename)
- colfile = self._storage.open_file(filename)
- return colfile, 0, length
- def column_reader(self, fieldname, column):
- if fieldname not in self._colfiles:
- self._colfiles[fieldname] = self._get_column_file(fieldname)
- colfile, offset, length = self._colfiles[fieldname]
- return column.reader(colfile, offset, length, self._doccount)
- # Lengths
- def _cached_reader(self, fieldname, column):
- if fieldname in self._readers:
- return self._readers[fieldname]
- else:
- if not self.has_column(fieldname):
- return None
- reader = self.column_reader(fieldname, column)
- self._readers[fieldname] = reader
- return reader
- def doc_field_length(self, docnum, fieldname, default=0):
- if docnum >= self._doccount:
- raise IndexError("Asked for docnum %r of %d"
- % (docnum, self._doccount))
- lenfield = _lenfield(fieldname)
- reader = self._cached_reader(lenfield, LENGTHS_COLUMN)
- if reader is None:
- return default
- lbyte = reader[docnum]
- if lbyte:
- return byte_to_length(lbyte)
- else:
- return default
- def field_length(self, fieldname):
- return self._segment._fieldlengths.get(fieldname, 0)
- def _minmax_length(self, fieldname, op, cache):
- if fieldname in cache:
- return cache[fieldname]
- lenfield = _lenfield(fieldname)
- reader = self._cached_reader(lenfield, LENGTHS_COLUMN)
- length = byte_to_length(op(reader))
- cache[fieldname] = length
- return length
- def min_field_length(self, fieldname):
- return self._minmax_length(fieldname, min, self._minlengths)
- def max_field_length(self, fieldname):
- return self._minmax_length(fieldname, max, self._maxlengths)
- # Vectors
- def _prep_vectors(self):
- f = self._segment.open_file(self._storage, W3Codec.VPOSTS_EXT)
- self._vpostfile = f
- def _vector_extent(self, docnum, fieldname):
- if docnum >= self._doccount:
- raise IndexError("Asked for document %r of %d"
- % (docnum, self._doccount))
- vecfield = _vecfield(fieldname) # Compute vector column name
- # Get the offset from the vector offset column
- offset = self._cached_reader(vecfield, VECTOR_COLUMN)[docnum]
- # Get the length from the length column, if it exists, otherwise return
- # -1 for the length (backwards compatibility with old dev versions)
- lreader = self._cached_reader(vecfield + "L", VECTOR_LEN_COLUMN)
- if lreader:
- length = lreader[docnum]
- else:
- length = -1
- return offset, length
- def has_vector(self, docnum, fieldname):
- if self.has_column(_vecfield(fieldname)):
- offset, length = self._vector_extent(docnum, fieldname)
- return offset != 0
- return False
- def vector(self, docnum, fieldname, format_):
- if self._vpostfile is None:
- self._prep_vectors()
- offset, length = self._vector_extent(docnum, fieldname)
- if not offset:
- raise Exception("Field %r has no vector in docnum %s" %
- (fieldname, docnum))
- m = W3LeafMatcher(self._vpostfile, offset, length, format_,
- byteids=True)
- return m
- # Stored fields
- def stored_fields(self, docnum):
- reader = self._cached_reader("_stored", STORED_COLUMN)
- v = reader[docnum]
- if v is None:
- v = {}
- return v
- class W3FieldCursor(base.FieldCursor):
- def __init__(self, tindex, fieldname, keycoder, keydecoder, fieldobj):
- self._tindex = tindex
- self._fieldname = fieldname
- self._keycoder = keycoder
- self._keydecoder = keydecoder
- self._fieldobj = fieldobj
- prefixbytes = keycoder(fieldname, b'')
- self._startpos = self._tindex.closest_key_pos(prefixbytes)
- self._pos = self._startpos
- self._text = None
- self._datapos = None
- self._datalen = None
- self.next()
- def first(self):
- self._pos = self._startpos
- return self.next()
- def find(self, term):
- if not isinstance(term, bytes_type):
- term = self._fieldobj.to_bytes(term)
- key = self._keycoder(self._fieldname, term)
- self._pos = self._tindex.closest_key_pos(key)
- return self.next()
- def next(self):
- if self._pos is not None:
- keyrng = self._tindex.key_and_range_at(self._pos)
- if keyrng is not None:
- keybytes, datapos, datalen = keyrng
- fname, text = self._keydecoder(keybytes)
- if fname == self._fieldname:
- self._pos = datapos + datalen
- self._text = self._fieldobj.from_bytes(text)
- self._datapos = datapos
- self._datalen = datalen
- return self._text
- self._text = self._pos = self._datapos = self._datalen = None
- return None
- def text(self):
- return self._text
- def term_info(self):
- if self._pos is None:
- return None
- databytes = self._tindex.dbfile.get(self._datapos, self._datalen)
- return W3TermInfo.from_bytes(databytes)
- def is_valid(self):
- return self._pos is not None
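- # A cursor is normally obtained from W3TermsReader.cursor() (defined below).
- # A sketch of walking all terms in a field, assuming `tr` is an open
- # W3TermsReader and `fieldobj` the corresponding schema field:
- #
- #     cur = tr.cursor("content", fieldobj)
- #     while cur.is_valid():
- #         print(cur.text())
- #         cur.next()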
- class W3TermsReader(base.TermsReader):
- def __init__(self, codec, dbfile, length, postfile):
- self._codec = codec
- self._dbfile = dbfile
- self._tindex = filetables.OrderedHashReader(dbfile, length)
- self._fieldmap = self._tindex.extras["fieldmap"]
- self._postfile = postfile
- self._fieldunmap = [None] * len(self._fieldmap)
- for fieldname, num in iteritems(self._fieldmap):
- self._fieldunmap[num] = fieldname
- def _keycoder(self, fieldname, tbytes):
- assert isinstance(tbytes, bytes_type), "tbytes=%r" % tbytes
- fnum = self._fieldmap.get(fieldname, 65535)
- return pack_ushort(fnum) + tbytes
- def _keydecoder(self, keybytes):
- fieldid = unpack_ushort(keybytes[:_SHORT_SIZE])[0]
- return self._fieldunmap[fieldid], keybytes[_SHORT_SIZE:]
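- # Key layout sketch: each term key is the 2-byte field number followed by the
- # raw term bytes. Assuming field number 0 was assigned to "content":
- #
- #     _keycoder("content", b("cat"))    -> b"\x00\x00" + b"cat"
- #     _keydecoder(b"\x00\x00" + b"cat") -> ("content", b"cat")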
- def _range_for_key(self, fieldname, tbytes):
- return self._tindex.range_for_key(self._keycoder(fieldname, tbytes))
- def __contains__(self, term):
- return self._keycoder(*term) in self._tindex
- def indexed_field_names(self):
- return self._fieldmap.keys()
- def cursor(self, fieldname, fieldobj):
- tindex = self._tindex
- coder = self._keycoder
- decoder = self._keydecoder
- return W3FieldCursor(tindex, fieldname, coder, decoder, fieldobj)
- def terms(self):
- keydecoder = self._keydecoder
- return (keydecoder(keybytes) for keybytes in self._tindex.keys())
- def terms_from(self, fieldname, prefix):
- prefixbytes = self._keycoder(fieldname, prefix)
- keydecoder = self._keydecoder
- return (keydecoder(keybytes) for keybytes
- in self._tindex.keys_from(prefixbytes))
- def items(self):
- tidecoder = W3TermInfo.from_bytes
- keydecoder = self._keydecoder
- return ((keydecoder(keybytes), tidecoder(valbytes))
- for keybytes, valbytes in self._tindex.items())
- def items_from(self, fieldname, prefix):
- prefixbytes = self._keycoder(fieldname, prefix)
- tidecoder = W3TermInfo.from_bytes
- keydecoder = self._keydecoder
- return ((keydecoder(keybytes), tidecoder(valbytes))
- for keybytes, valbytes in self._tindex.items_from(prefixbytes))
- def term_info(self, fieldname, tbytes):
- key = self._keycoder(fieldname, tbytes)
- try:
- return W3TermInfo.from_bytes(self._tindex[key])
- except KeyError:
- raise TermNotFound("No term %s:%r" % (fieldname, tbytes))
- def frequency(self, fieldname, tbytes):
- datapos = self._range_for_key(fieldname, tbytes)[0]
- return W3TermInfo.read_weight(self._dbfile, datapos)
- def doc_frequency(self, fieldname, tbytes):
- datapos = self._range_for_key(fieldname, tbytes)[0]
- return W3TermInfo.read_doc_freq(self._dbfile, datapos)
- def matcher(self, fieldname, tbytes, format_, scorer=None):
- terminfo = self.term_info(fieldname, tbytes)
- m = self._codec.postings_reader(self._postfile, terminfo, format_,
- term=(fieldname, tbytes), scorer=scorer)
- return m
- def close(self):
- self._tindex.close()
- self._postfile.close()
- # Postings
- class W3PostingsWriter(base.PostingsWriter):
- """This object writes posting lists to the postings file. It groups postings
- into blocks and tracks block-level statistics to make it easier to skip
- through the postings.
- """
- def __init__(self, postfile, blocklimit, byteids=False, compression=3,
- inlinelimit=1):
- self._postfile = postfile
- self._blocklimit = blocklimit
- self._byteids = byteids
- self._compression = compression
- self._inlinelimit = inlinelimit
- self._blockcount = 0
- self._format = None
- self._terminfo = None
- def written(self):
- return self._blockcount > 0
- def start_postings(self, format_, terminfo):
- # Start a new term
- if self._terminfo:
- # If self._terminfo is not None, that means we are already in a term
- raise Exception("Called start in a term")
- assert isinstance(format_, formats.Format)
- self._format = format_
- # Reset block count
- self._blockcount = 0
- # Reset block buffer
- self._new_block()
- # Remember terminfo object passed to us
- self._terminfo = terminfo
- # Remember where we started in the posting file
- self._startoffset = self._postfile.tell()
- def add_posting(self, id_, weight, vbytes, length=None):
- # Add a posting to the buffered block
- # If the number of buffered postings == the block limit, write out the
- # buffered block and reset before adding this one
- if len(self._ids) >= self._blocklimit:
- self._write_block()
- # Check types
- if self._byteids:
- assert isinstance(id_, string_type), "id_=%r" % id_
- else:
- assert isinstance(id_, integer_types), "id_=%r" % id_
- assert isinstance(weight, (int, float)), "weight=%r" % weight
- assert isinstance(vbytes, bytes_type), "vbytes=%r" % vbytes
- assert length is None or isinstance(length, integer_types)
- self._ids.append(id_)
- self._weights.append(weight)
- if weight > self._maxweight:
- self._maxweight = weight
- if vbytes:
- self._values.append(vbytes)
- if length:
- minlength = self._minlength
- if minlength is None or length < minlength:
- self._minlength = length
- if length > self._maxlength:
- self._maxlength = length
- def finish_postings(self):
- terminfo = self._terminfo
- # If we have fewer than "inlinelimit" postings in this posting list,
- # "inline" the postings into the terminfo instead of writing them to
- # the posting file
- if not self.written() and len(self) < self._inlinelimit:
- terminfo.add_block(self)
- terminfo.set_inlined(self._ids, self._weights, self._values)
- else:
- # If there are leftover items in the current block, write them out
- if self._ids:
- self._write_block(last=True)
- startoffset = self._startoffset
- length = self._postfile.tell() - startoffset
- terminfo.set_extent(startoffset, length)
- # Clear self._terminfo to indicate we're between terms
- self._terminfo = None
- # Return the current terminfo object
- return terminfo
- def _new_block(self):
- # Reset block buffer
- # List of IDs (docnums for regular posting list, terms for vector PL)
- self._ids = [] if self._byteids else array("I")
- # List of weights
- self._weights = array("f")
- # List of encoded payloads
- self._values = []
- # Statistics
- self._minlength = None
- self._maxlength = 0
- self._maxweight = 0
- def _write_block(self, last=False):
- # Write the buffered block to the postings file
- # If this is the first block, write a small header first
- if not self._blockcount:
- self._postfile.write(WHOOSH3_HEADER_MAGIC)
- # Add this block's statistics to the terminfo object, which tracks the
- # overall statistics for all term postings
- self._terminfo.add_block(self)
- # Minify the IDs, weights, and values, and put them in a tuple
- data = (self._mini_ids(), self._mini_weights(), self._mini_values())
- # Pickle the tuple
- databytes = dumps(data, 2)
- # If the pickle is less than 20 bytes, don't bother compressing
- if len(databytes) < 20:
- comp = 0
- else:
- # Compress the pickle (if self._compression > 0)
- comp = self._compression
- if comp:
- databytes = zlib.compress(databytes, comp)
- # Make a tuple of block info. The posting reader can check this info
- # and decide whether to skip the block without having to decompress the
- # full block data
- #
- # - Number of postings in block
- # - Last ID in block
- # - Maximum weight in block
- # - Compression level
- # - Minimum length byte
- # - Maximum length byte
- ids = self._ids
- infobytes = dumps((len(ids), ids[-1], self._maxweight, comp,
- length_to_byte(self._minlength),
- length_to_byte(self._maxlength),
- ), 2)
- # Write block length
- postfile = self._postfile
- blocklength = len(infobytes) + len(databytes)
- if last:
- # If this is the last block, use a negative number
- blocklength *= -1
- postfile.write_int(blocklength)
- # Write block info
- postfile.write(infobytes)
- # Write block data
- postfile.write(databytes)
- self._blockcount += 1
- # Reset block buffer
- self._new_block()
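- # On disk, each block written above therefore looks like this (the reader
- # side of this layout is W3LeafMatcher._goto() below):
- #
- #     [int32 block length; negative marks the final block]
- #     [pickled info tuple: (count, last id, max weight, compression level,
- #      min length byte, max length byte)]
- #     [pickled, possibly zlib-compressed, (ids, weights, values) tuple]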
- # Methods to reduce the byte size of the various lists
- def _mini_ids(self):
- # Minify IDs
- ids = self._ids
- if not self._byteids:
- ids = delta_encode(ids)
- return tuple(ids)
- def _mini_weights(self):
- # Minify weights
- weights = self._weights
- if all(w == 1.0 for w in weights):
- return None
- elif all(w == weights[0] for w in weights):
- return weights[0]
- else:
- return tuple(weights)
- def _mini_values(self):
- # Minify values
- fixedsize = self._format.fixed_value_size()
- values = self._values
- if fixedsize is None or fixedsize < 0:
- vs = tuple(values)
- elif fixedsize == 0:
- vs = None
- else:
- vs = emptybytes.join(values)
- return vs
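- # For example, assuming delta_encode() yields each ID as its offset from the
- # previous one (which delta_decode() in W3LeafMatcher reverses):
- #
- #     ids (5, 9, 12)            -> stored as (5, 4, 3)
- #     weights (1.0, 1.0, 1.0)   -> stored as None
- #     weights (2.0, 2.0, 2.0)   -> stored as 2.0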
- # Block stats methods
- def __len__(self):
- # Returns the number of unwritten buffered postings
- return len(self._ids)
- def min_id(self):
- # First ID in the buffered block
- return self._ids[0]
- def max_id(self):
- # Last ID in the buffered block
- return self._ids[-1]
- def min_length(self):
- # Shortest field length in the buffered block
- return self._minlength
- def max_length(self):
- # Longest field length in the buffered block
- return self._maxlength
- def max_weight(self):
- # Highest weight in the buffered block
- return self._maxweight
- class W3LeafMatcher(LeafMatcher):
- """Reads on-disk postings from the postings file and presents the
- :class:`whoosh.matching.Matcher` interface.
- """
- def __init__(self, postfile, startoffset, length, format_, term=None,
- byteids=None, scorer=None):
- self._postfile = postfile
- self._startoffset = startoffset
- self._length = length
- self.format = format_
- self._term = term
- self._byteids = byteids
- self.scorer = scorer
- self._fixedsize = self.format.fixed_value_size()
- # Read the header tag at the start of the postings
- self._read_header()
- # "Reset" to read the first block
- self.reset()
- def _read_header(self):
- # Seek to the start of the postings and check the header tag
- postfile = self._postfile
- postfile.seek(self._startoffset)
- magic = postfile.read(4)
- if magic != WHOOSH3_HEADER_MAGIC:
- raise Exception("Block tag error %r" % magic)
- # Remember the base offset (start of postings, after the header)
- self._baseoffset = postfile.tell()
- def reset(self):
- # Reset block stats
- self._blocklength = None
- self._maxid = None
- self._maxweight = None
- self._compression = None
- self._minlength = None
- self._maxlength = None
- self._lastblock = False
- self._atend = False
- # Consume first block
- self._goto(self._baseoffset)
- def _goto(self, position):
- # Read the posting block at the given position
- postfile = self._postfile
- # Reset block data -- we'll lazy load the data from the new block as
- # needed
- self._data = None
- self._ids = None
- self._weights = None
- self._values = None
- # Reset pointer into the block
- self._i = 0
- # Seek to the start of the block
- postfile.seek(position)
- # Read the block length
- length = postfile.read_int()
- # If the block length is negative, that means this is the last block
- if length < 0:
- self._lastblock = True
- length *= -1
- # Remember the offset of the next block
- self._nextoffset = position + _INT_SIZE + length
- # Read the pickled block info tuple
- info = postfile.read_pickle()
- # Remember the offset of the block's data
- self._dataoffset = postfile.tell()
- # Decompose the info tuple to set the current block info
- (self._blocklength, self._maxid, self._maxweight, self._compression,
- mnlen, mxlen) = info
- self._minlength = byte_to_length(mnlen)
- self._maxlength = byte_to_length(mxlen)
- def _next_block(self):
- if self._atend:
- # We were already at the end, and yet somebody called _next_block()
- # again, so something is wrong somewhere
- raise Exception("No next block")
- elif self._lastblock:
- # Reached the end of the postings
- self._atend = True
- else:
- # Go to the next block
- self._goto(self._nextoffset)
- def _skip_to_block(self, skipwhile):
- # Skip blocks as long as the skipwhile() function returns True
- skipped = 0
- while self.is_active() and skipwhile():
- self._next_block()
- skipped += 1
- return skipped
- def is_active(self):
- return not self._atend and self._i < self._blocklength
- def id(self):
- # Get the current ID (docnum for regular postings, term for vector)
- # If we haven't loaded the block IDs yet, load them now
- if self._ids is None:
- self._read_ids()
- return self._ids[self._i]
- def weight(self):
- # Get the weight for the current posting
- # If we haven't loaded the block weights yet, load them now
- if self._weights is None:
- self._read_weights()
- return self._weights[self._i]
- def value(self):
- # Get the value for the current posting
- # If we haven't loaded the block values yet, load them now
- if self._values is None:
- self._read_values()
- return self._values[self._i]
- def next(self):
- # Move to the next posting
- # Increment the in-block pointer
- self._i += 1
- # If we reached the end of the block, move to the next block
- if self._i == self._blocklength:
- self._next_block()
- return True
- else:
- return False
- def skip_to(self, targetid):
- # Skip to the next ID equal to or greater than the given target ID
- if not self.is_active():
- raise ReadTooFar
- # If we're already at or past target ID, do nothing
- if targetid <= self.id():
- return
- # Skip to the block that would contain the target ID
- block_max_id = self.block_max_id
- if targetid > block_max_id():
- self._skip_to_block(lambda: targetid > block_max_id())
- # Iterate through the IDs in the block until we find or pass the
- # target
- while self.is_active() and self.id() < targetid:
- self.next()
- def skip_to_quality(self, minquality):
- # Skip blocks until we find one that might exceed the given minimum
- # quality
- block_quality = self.block_quality
- # If the quality of this block is already higher than the minimum,
- # do nothing
- if block_quality() > minquality:
- return 0
- # Skip blocks as long as the block quality is not greater than the
- # minimum
- return self._skip_to_block(lambda: block_quality() <= minquality)
- def block_min_id(self):
- if self._ids is None:
- self._read_ids()
- return self._ids[0]
- def block_max_id(self):
- return self._maxid
- def block_min_length(self):
- return self._minlength
- def block_max_length(self):
- return self._maxlength
- def block_max_weight(self):
- return self._maxweight
- def _read_data(self):
- # Load block data tuple from disk
- datalen = self._nextoffset - self._dataoffset
- b = self._postfile.get(self._dataoffset, datalen)
- # Decompress the pickled data if necessary
- if self._compression:
- b = zlib.decompress(b)
- # Unpickle the data tuple and save it in an attribute
- self._data = loads(b)
- def _read_ids(self):
- # If we haven't loaded the data from disk yet, load it now
- if self._data is None:
- self._read_data()
- ids = self._data[0]
- # De-minify the IDs
- if not self._byteids:
- ids = tuple(delta_decode(ids))
- self._ids = ids
- def _read_weights(self):
- # If we haven't loaded the data from disk yet, load it now
- if self._data is None:
- self._read_data()
- weights = self._data[1]
- # De-minify the weights
- postcount = self._blocklength
- if weights is None:
- self._weights = array("f", (1.0 for _ in xrange(postcount)))
- elif isinstance(weights, float):
- self._weights = array("f", (weights for _ in xrange(postcount)))
- else:
- self._weights = weights
- def _read_values(self):
- # If we haven't loaded the data from disk yet, load it now
- if self._data is None:
- self._read_data()
- # De-minify the values
- fixedsize = self._fixedsize
- vs = self._data[2]
- if fixedsize is None or fixedsize < 0:
- self._values = vs
- elif fixedsize == 0:
- self._values = (None,) * self._blocklength
- else:
- assert isinstance(vs, bytes_type)
- self._values = tuple(vs[i:i + fixedsize]
- for i in xrange(0, len(vs), fixedsize))
- # Term info implementation
- class W3TermInfo(TermInfo):
- # B | Flags
- # f | Total weight
- # I | Total doc freq
- # B | Min length (encoded as byte)
- # B | Max length (encoded as byte)
- # f | Max weight
- # I | Minimum (first) ID
- # I | Maximum (last) ID
- _struct = struct.Struct("!BfIBBfII")
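- # Packed with "!BfIBBfII" this fixed header is 1+4+4+1+1+4+4+4 = 23 bytes,
- # which is what the read_* classmethods below rely on: the total weight float
- # sits at offset 1 (after the flags byte), the doc frequency uint at
- # 1 + _FLOAT_SIZE, the two length bytes at 1 + _FLOAT_SIZE + _INT_SIZE, and
- # the max weight float two bytes after that.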
- def __init__(self, *args, **kwargs):
- TermInfo.__init__(self, *args, **kwargs)
- self._offset = None
- self._length = None
- self._inlined = None
- def add_block(self, block):
- self._weight += sum(block._weights)
- self._df += len(block)
- ml = block.min_length()
- if self._minlength is None:
- self._minlength = ml
- else:
- self._minlength = min(self._minlength, ml)
- self._maxlength = max(self._maxlength, block.max_length())
- self._maxweight = max(self._maxweight, block.max_weight())
- if self._minid is None:
- self._minid = block.min_id()
- self._maxid = block.max_id()
- def set_extent(self, offset, length):
- self._offset = offset
- self._length = length
- def extent(self):
- return self._offset, self._length
- def set_inlined(self, ids, weights, values):
- self._inlined = (tuple(ids), tuple(weights), tuple(values))
- def is_inlined(self):
- return self._inlined is not None
- def inlined_postings(self):
- return self._inlined
- def to_bytes(self):
- isinlined = self.is_inlined()
- # Encode the lengths as 0-255 values
- minlength = (0 if self._minlength is None
- else length_to_byte(self._minlength))
- maxlength = length_to_byte(self._maxlength)
- # Convert None values to the out-of-band NO_ID constant so they can be
- # stored as unsigned ints
- minid = 0xffffffff if self._minid is None else self._minid
- maxid = 0xffffffff if self._maxid is None else self._maxid
- # Pack the term info into bytes
- st = self._struct.pack(isinlined, self._weight, self._df,
- minlength, maxlength, self._maxweight,
- minid, maxid)
- if isinlined:
- # Postings are inlined - dump them using the pickle protocol
- postbytes = dumps(self._inlined, 2)
- else:
- postbytes = pack_long(self._offset) + pack_int(self._length)
- st += postbytes
- return st
- @classmethod
- def from_bytes(cls, s):
- st = cls._struct
- vals = st.unpack(s[:st.size])
- terminfo = cls()
- flags = vals[0]
- terminfo._weight = vals[1]
- terminfo._df = vals[2]
- terminfo._minlength = byte_to_length(vals[3])
- terminfo._maxlength = byte_to_length(vals[4])
- terminfo._maxweight = vals[5]
- terminfo._minid = None if vals[6] == 0xffffffff else vals[6]
- terminfo._maxid = None if vals[7] == 0xffffffff else vals[7]
- if flags:
- # Postings are stored inline
- terminfo._inlined = loads(s[st.size:])
- else:
- # Last bytes are pointer into posting file and length
- offpos = st.size
- lenpos = st.size + _LONG_SIZE
- terminfo._offset = unpack_long(s[offpos:lenpos])[0]
- terminfo._length = unpack_int(s[lenpos:lenpos + _INT_SIZE])[0]
- return terminfo
- @classmethod
- def read_weight(cls, dbfile, datapos):
- return dbfile.get_float(datapos + 1)
- @classmethod
- def read_doc_freq(cls, dbfile, datapos):
- return dbfile.get_uint(datapos + 1 + _FLOAT_SIZE)
- @classmethod
- def read_min_and_max_length(cls, dbfile, datapos):
- lenpos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE
- ml = byte_to_length(dbfile.get_byte(lenpos))
- xl = byte_to_length(dbfile.get_byte(lenpos + 1))
- return ml, xl
- @classmethod
- def read_max_weight(cls, dbfile, datapos):
- weightspos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE + 2
- return dbfile.get_float(weightspos)
- # Segment implementation
- class W3Segment(base.Segment):
- def __init__(self, codec, indexname, doccount=0, segid=None, deleted=None):
- self.indexname = indexname
- self.segid = self._random_id() if segid is None else segid
- self._codec = codec
- self._doccount = doccount
- self._deleted = deleted
- self.compound = False
- def codec(self, **kwargs):
- return self._codec
- def set_doc_count(self, dc):
- self._doccount = dc
- def doc_count_all(self):
- return self._doccount
- def deleted_count(self):
- if self._deleted is None:
- return 0
- return len(self._deleted)
- def deleted_docs(self):
- if self._deleted is None:
- return ()
- else:
- return iter(self._deleted)
- def delete_document(self, docnum, delete=True):
- if delete:
- if self._deleted is None:
- self._deleted = set()
- self._deleted.add(docnum)
- elif self._deleted is not None and docnum in self._deleted:
- self._deleted.remove(docnum)
- def is_deleted(self, docnum):
- if self._deleted is None:
- return False
- return docnum in self._deleted