# Copyright 2011 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
- """
- This module contains base classes/interfaces for "codec" objects.
- """
- from bisect import bisect_right
- from whoosh import columns
- from whoosh.automata import lev
- from whoosh.compat import abstractmethod, izip, unichr, xrange
- from whoosh.filedb.compound import CompoundStorage
- from whoosh.system import emptybytes
- from whoosh.util import random_name


# Exceptions

class OutOfOrderError(Exception):
    pass


# Base classes

class Codec(object):
    length_stats = True

    # Per document value writer

    @abstractmethod
    def per_document_writer(self, storage, segment):
        raise NotImplementedError

    # Inverted index writer

    @abstractmethod
    def field_writer(self, storage, segment):
        raise NotImplementedError

    # Postings

    @abstractmethod
    def postings_writer(self, dbfile, byteids=False):
        raise NotImplementedError

    @abstractmethod
    def postings_reader(self, dbfile, terminfo, format_, term=None,
                        scorer=None):
        raise NotImplementedError

    # Index readers

    def automata(self, storage, segment):
        return Automata()

    @abstractmethod
    def terms_reader(self, storage, segment):
        raise NotImplementedError

    @abstractmethod
    def per_document_reader(self, storage, segment):
        raise NotImplementedError

    # Segments and generations

    @abstractmethod
    def new_segment(self, storage, indexname):
        raise NotImplementedError
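
# A hedged usage sketch (not part of the Whoosh API): a Codec's writer methods
# produce a segment's on-disk files, and its reader methods open them again.
# The `codec`, `storage`, and `indexname` names below are assumed to be
# supplied by the surrounding indexing code.
#
#   segment = codec.new_segment(storage, indexname)
#   pdw = codec.per_document_writer(storage, segment)
#   fw = codec.field_writer(storage, segment)
#   # ... feed documents to pdw and postings to fw ...
#   pdw.close()
#   fw.close()
#   treader = codec.terms_reader(storage, segment)
#   dreader = codec.per_document_reader(storage, segment)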


class WrappingCodec(Codec):
    def __init__(self, child):
        self._child = child

    def per_document_writer(self, storage, segment):
        return self._child.per_document_writer(storage, segment)

    def field_writer(self, storage, segment):
        return self._child.field_writer(storage, segment)

    def postings_writer(self, dbfile, byteids=False):
        return self._child.postings_writer(dbfile, byteids=byteids)

    def postings_reader(self, dbfile, terminfo, format_, term=None,
                        scorer=None):
        return self._child.postings_reader(dbfile, terminfo, format_,
                                           term=term, scorer=scorer)

    def automata(self, storage, segment):
        return self._child.automata(storage, segment)

    def terms_reader(self, storage, segment):
        return self._child.terms_reader(storage, segment)

    def per_document_reader(self, storage, segment):
        return self._child.per_document_reader(storage, segment)

    def new_segment(self, storage, indexname):
        return self._child.new_segment(storage, indexname)


# Writer classes

class PerDocumentWriter(object):
    @abstractmethod
    def start_doc(self, docnum):
        raise NotImplementedError

    @abstractmethod
    def add_field(self, fieldname, fieldobj, value, length):
        raise NotImplementedError

    @abstractmethod
    def add_column_value(self, fieldname, columnobj, value):
        raise NotImplementedError("Codec does not implement writing columns")

    @abstractmethod
    def add_vector_items(self, fieldname, fieldobj, items):
        raise NotImplementedError

    def add_vector_matcher(self, fieldname, fieldobj, vmatcher):
        def readitems():
            while vmatcher.is_active():
                text = vmatcher.id()
                weight = vmatcher.weight()
                valuestring = vmatcher.value()
                yield (text, weight, valuestring)
                vmatcher.next()
        self.add_vector_items(fieldname, fieldobj, readitems())

    def finish_doc(self):
        pass

    def close(self):
        pass
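
# Illustrative call sequence (an assumption about typical usage, not a
# documented contract): a consumer drives a PerDocumentWriter one document
# at a time. The field and object names here are examples only.
#
#   pdw.start_doc(docnum)
#   pdw.add_field("title", fieldobj, value, length)
#   pdw.add_column_value("title", columnobj, sortable_value)
#   pdw.finish_doc()
#   # ...repeat for each document, then...
#   pdw.close()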


class FieldWriter(object):
    def add_postings(self, schema, lengths, items):
        # This method translates a generator of (fieldname, btext, docnum,
        # w, v) postings into calls to start_field(), start_term(), add(),
        # finish_term(), finish_field(), etc.

        start_field = self.start_field
        start_term = self.start_term
        add = self.add
        finish_term = self.finish_term
        finish_field = self.finish_field

        if lengths:
            dfl = lengths.doc_field_length
        else:
            dfl = lambda docnum, fieldname: 0

        # The fieldname of the previous posting
        lastfn = None
        # The bytes text of the previous posting
        lasttext = None
        # The (fieldname, btext) of the previous spelling posting
        lastspell = None
        # The field object for the current field
        fieldobj = None
        for fieldname, btext, docnum, weight, value in items:
            # Check for out-of-order postings. This is convoluted because
            # Python 3 removed the ability to compare a string to None
            if lastfn is not None and fieldname < lastfn:
                raise OutOfOrderError("Field %r .. %r" % (lastfn, fieldname))
            if fieldname == lastfn and lasttext and btext < lasttext:
                raise OutOfOrderError("Term %s:%r .. %s:%r"
                                      % (lastfn, lasttext, fieldname, btext))

            # If the fieldname of this posting is different from the last one,
            # tell the writer we're starting a new field
            if fieldname != lastfn:
                if lasttext is not None:
                    finish_term()
                if lastfn is not None and fieldname != lastfn:
                    finish_field()
                fieldobj = schema[fieldname]
                start_field(fieldname, fieldobj)
                lastfn = fieldname
                lasttext = None

            # HACK: items where docnum == -1 indicate words that should be
            # added to the spelling graph, not the postings
            if docnum == -1:
                # spellterm = (fieldname, btext)
                # # There can be duplicates of spelling terms, so only add a
                # # spell term if it's greater than the last one
                # if lastspell is None or spellterm > lastspell:
                #     spellword = fieldobj.from_bytes(btext)
                #     self.add_spell_word(fieldname, spellword)
                #     lastspell = spellterm
                continue

            # If this term is different from the term in the previous posting,
            # tell the writer to start a new term
            if btext != lasttext:
                if lasttext is not None:
                    finish_term()
                start_term(btext)
                lasttext = btext

            # Add this posting
            length = dfl(docnum, fieldname)
            if value is None:
                value = emptybytes
            add(docnum, weight, value, length)

        if lasttext is not None:
            finish_term()
        if lastfn is not None:
            finish_field()

    @abstractmethod
    def start_field(self, fieldname, fieldobj):
        raise NotImplementedError

    @abstractmethod
    def start_term(self, text):
        raise NotImplementedError

    @abstractmethod
    def add(self, docnum, weight, vbytes, length):
        raise NotImplementedError

    def add_spell_word(self, fieldname, text):
        raise NotImplementedError

    @abstractmethod
    def finish_term(self):
        raise NotImplementedError

    def finish_field(self):
        pass

    def close(self):
        pass
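
# A hedged sketch of what add_postings() does with a tiny posting stream; the
# field names, terms, and numbers are invented for illustration. Given:
#
#   items = [
#       ("body", b"cat", 0, 1.0, b""),
#       ("body", b"cat", 2, 2.0, b""),
#       ("body", b"dog", 1, 1.0, b""),
#       ("title", b"cat", 0, 1.0, b""),
#   ]
#   fwriter.add_postings(schema, lengths, iter(items))
#
# the writer receives: start_field("body"), start_term(b"cat"), add(0, ...),
# add(2, ...), finish_term(), start_term(b"dog"), add(1, ...), finish_term(),
# finish_field(), start_field("title"), start_term(b"cat"), add(0, ...),
# finish_term(), finish_field().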


# Postings

class PostingsWriter(object):
    @abstractmethod
    def start_postings(self, format_, terminfo):
        raise NotImplementedError

    @abstractmethod
    def add_posting(self, id_, weight, vbytes, length=None):
        raise NotImplementedError

    def finish_postings(self):
        pass

    @abstractmethod
    def written(self):
        """Returns True if this object has already written to disk.
        """

        raise NotImplementedError


# Reader classes

class FieldCursor(object):
    def first(self):
        raise NotImplementedError

    def find(self, string):
        raise NotImplementedError

    def next(self):
        raise NotImplementedError

    def term(self):
        raise NotImplementedError


class TermsReader(object):
    @abstractmethod
    def __contains__(self, term):
        raise NotImplementedError

    @abstractmethod
    def cursor(self, fieldname, fieldobj):
        raise NotImplementedError

    @abstractmethod
    def terms(self):
        raise NotImplementedError

    @abstractmethod
    def terms_from(self, fieldname, prefix):
        raise NotImplementedError

    @abstractmethod
    def items(self):
        raise NotImplementedError

    @abstractmethod
    def items_from(self, fieldname, prefix):
        raise NotImplementedError

    @abstractmethod
    def term_info(self, fieldname, text):
        raise NotImplementedError

    def frequency(self, fieldname, text):
        return self.term_info(fieldname, text).weight()

    def doc_frequency(self, fieldname, text):
        return self.term_info(fieldname, text).doc_frequency()

    @abstractmethod
    def matcher(self, fieldname, text, format_, scorer=None):
        raise NotImplementedError

    @abstractmethod
    def indexed_field_names(self):
        raise NotImplementedError

    def close(self):
        pass


class Automata(object):
    @staticmethod
    def levenshtein_dfa(uterm, maxdist, prefix=0):
        return lev.levenshtein_automaton(uterm, maxdist, prefix).to_dfa()

    @staticmethod
    def find_matches(dfa, cur):
        unull = unichr(0)

        term = cur.text()
        if term is None:
            return

        match = dfa.next_valid_string(term)
        while match:
            cur.find(match)
            term = cur.text()
            if term is None:
                return

            if match == term:
                yield match
                term += unull

            match = dfa.next_valid_string(term)

    def terms_within(self, fieldcur, uterm, maxdist, prefix=0):
        dfa = self.levenshtein_dfa(uterm, maxdist, prefix)
        return self.find_matches(dfa, fieldcur)
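
# Hedged usage sketch: terms_within() drives fuzzy term matching. Assuming
# `fieldcur` is a FieldCursor positioned at the start of a field's terms,
#
#   auto = Automata()
#   for term in auto.terms_within(fieldcur, u"hello", maxdist=1):
#       ...  # each `term` is an indexed term within edit distance 1
#
# find_matches() works by ping-ponging between the DFA (which proposes the
# next acceptable string) and the cursor (which seeks to the next indexed
# term at or after that string) until one of them is exhausted.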


# Per-doc value reader

class PerDocumentReader(object):
    def close(self):
        pass

    @abstractmethod
    def doc_count(self):
        raise NotImplementedError

    @abstractmethod
    def doc_count_all(self):
        raise NotImplementedError

    # Deletions

    @abstractmethod
    def has_deletions(self):
        raise NotImplementedError

    @abstractmethod
    def is_deleted(self, docnum):
        raise NotImplementedError

    @abstractmethod
    def deleted_docs(self):
        raise NotImplementedError

    def all_doc_ids(self):
        """
        Returns an iterator of all (undeleted) document IDs in the reader.
        """

        is_deleted = self.is_deleted
        return (docnum for docnum in xrange(self.doc_count_all())
                if not is_deleted(docnum))

    def iter_docs(self):
        for docnum in self.all_doc_ids():
            yield docnum, self.stored_fields(docnum)

    # Columns

    def supports_columns(self):
        return False

    def has_column(self, fieldname):
        return False

    def list_columns(self):
        raise NotImplementedError

    # Don't need to override this if supports_columns() returns False
    def column_reader(self, fieldname, column):
        raise NotImplementedError

    # Bitmaps

    def field_docs(self, fieldname):
        return None

    # Lengths

    @abstractmethod
    def doc_field_length(self, docnum, fieldname, default=0):
        raise NotImplementedError

    @abstractmethod
    def field_length(self, fieldname):
        raise NotImplementedError

    @abstractmethod
    def min_field_length(self, fieldname):
        raise NotImplementedError

    @abstractmethod
    def max_field_length(self, fieldname):
        raise NotImplementedError

    # Vectors

    def has_vector(self, docnum, fieldname):
        return False

    # Don't need to override this if has_vector() always returns False
    def vector(self, docnum, fieldname, format_):
        raise NotImplementedError

    # Stored

    @abstractmethod
    def stored_fields(self, docnum):
        raise NotImplementedError

    def all_stored_fields(self):
        for docnum in self.all_doc_ids():
            yield self.stored_fields(docnum)
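
# Illustrative semantics (values assumed for the example): if doc_count_all()
# is 4 and document 2 is deleted, all_doc_ids() yields 0, 1, 3, and
# iter_docs() yields (docnum, stored_fields) pairs for those same three
# documents, silently skipping the deletion.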


# Segment base class

class Segment(object):
    """Do not instantiate this object directly. It is used by the Index
    object to hold information about a segment. A list of objects of this
    class is pickled as part of the TOC file.

    The TOC file stores a minimal amount of information -- mostly a list of
    Segment objects. Segments are the real reverse indexes. Having multiple
    segments allows quick incremental indexing: just create a new segment for
    the new documents, and have the index overlay the new segment over
    previous ones for purposes of reading/search. "Optimizing" the index
    combines the contents of existing segments into one (removing any deleted
    documents along the way).
    """

    # Extension for compound segment files
    COMPOUND_EXT = ".seg"

    # self.indexname
    # self.segid

    def __init__(self, indexname):
        self.indexname = indexname
        self.segid = self._random_id()
        self.compound = False

    @classmethod
    def _random_id(cls, size=16):
        return random_name(size=size)

    def __repr__(self):
        return "<%s %s>" % (self.__class__.__name__, self.segment_id())

    def codec(self):
        raise NotImplementedError

    def index_name(self):
        return self.indexname

    def segment_id(self):
        if hasattr(self, "name"):
            # Old segment class
            return self.name
        else:
            return "%s_%s" % (self.index_name(), self.segid)

    def is_compound(self):
        if not hasattr(self, "compound"):
            return False
        return self.compound

    # File convenience methods

    def make_filename(self, ext):
        return "%s%s" % (self.segment_id(), ext)

    def list_files(self, storage):
        prefix = "%s." % self.segment_id()
        return [name for name in storage.list() if name.startswith(prefix)]

    def create_file(self, storage, ext, **kwargs):
        """Convenience method to create a new file in the given storage named
        with this segment's ID and the given extension. Any keyword arguments
        are passed to the storage's create_file method.
        """

        fname = self.make_filename(ext)
        return storage.create_file(fname, **kwargs)

    def open_file(self, storage, ext, **kwargs):
        """Convenience method to open a file in the given storage named with
        this segment's ID and the given extension. Any keyword arguments are
        passed to the storage's open_file method.
        """

        fname = self.make_filename(ext)
        return storage.open_file(fname, **kwargs)

    def create_compound_file(self, storage):
        segfiles = self.list_files(storage)
        assert not any(name.endswith(self.COMPOUND_EXT) for name in segfiles)
        cfile = self.create_file(storage, self.COMPOUND_EXT)
        CompoundStorage.assemble(cfile, storage, segfiles)
        for name in segfiles:
            storage.delete_file(name)
        self.compound = True

    def open_compound_file(self, storage):
        name = self.make_filename(self.COMPOUND_EXT)
        dbfile = storage.open_file(name)
        return CompoundStorage(dbfile, use_mmap=storage.supports_mmap)

    # Abstract methods

    @abstractmethod
    def doc_count_all(self):
        """
        Returns the total number of documents, DELETED OR UNDELETED, in this
        segment.
        """

        raise NotImplementedError

    def doc_count(self):
        """
        Returns the number of (undeleted) documents in this segment.
        """

        return self.doc_count_all() - self.deleted_count()

    def set_doc_count(self, doccount):
        raise NotImplementedError

    def has_deletions(self):
        """
        Returns True if any documents in this segment are deleted.
        """

        return self.deleted_count() > 0

    @abstractmethod
    def deleted_count(self):
        """
        Returns the total number of deleted documents in this segment.
        """

        raise NotImplementedError

    @abstractmethod
    def deleted_docs(self):
        raise NotImplementedError

    @abstractmethod
    def delete_document(self, docnum, delete=True):
        """Deletes the given document number. The document is not actually
        removed from the index until it is optimized.

        :param docnum: The document number to delete.
        :param delete: If False, this undeletes a deleted document.
        """

        raise NotImplementedError

    @abstractmethod
    def is_deleted(self, docnum):
        """
        Returns True if the given document number is deleted.
        """

        raise NotImplementedError

    def should_assemble(self):
        return True
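
# Naming sketch (the id value is invented for illustration): for an index
# named "MAIN" and a segid of "a1b2c3", segment_id() is "MAIN_a1b2c3",
# make_filename(".trm") is "MAIN_a1b2c3.trm", and create_compound_file()
# gathers every "MAIN_a1b2c3.*" file into a single "MAIN_a1b2c3.seg",
# deleting the originals afterwards.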


# Wrapping Segment

class WrappingSegment(Segment):
    def __init__(self, child):
        self._child = child

    def codec(self):
        return self._child.codec()

    def index_name(self):
        return self._child.index_name()

    def segment_id(self):
        return self._child.segment_id()

    def is_compound(self):
        return self._child.is_compound()

    def should_assemble(self):
        return self._child.should_assemble()

    def make_filename(self, ext):
        return self._child.make_filename(ext)

    def list_files(self, storage):
        return self._child.list_files(storage)

    def create_file(self, storage, ext, **kwargs):
        return self._child.create_file(storage, ext, **kwargs)

    def open_file(self, storage, ext, **kwargs):
        return self._child.open_file(storage, ext, **kwargs)

    def create_compound_file(self, storage):
        return self._child.create_compound_file(storage)

    def open_compound_file(self, storage):
        return self._child.open_compound_file(storage)

    def delete_document(self, docnum, delete=True):
        return self._child.delete_document(docnum, delete=delete)

    def has_deletions(self):
        return self._child.has_deletions()

    def deleted_count(self):
        return self._child.deleted_count()

    def deleted_docs(self):
        return self._child.deleted_docs()

    def is_deleted(self, docnum):
        return self._child.is_deleted(docnum)

    def set_doc_count(self, doccount):
        self._child.set_doc_count(doccount)

    def doc_count(self):
        return self._child.doc_count()

    def doc_count_all(self):
        return self._child.doc_count_all()


# Multi per doc reader

class MultiPerDocumentReader(PerDocumentReader):
    def __init__(self, readers, offset=0):
        self._readers = readers

        self._doc_offsets = []
        self._doccount = 0
        for pdr in readers:
            self._doc_offsets.append(self._doccount)
            self._doccount += pdr.doc_count_all()

        self.is_closed = False

    def close(self):
        for r in self._readers:
            r.close()
        self.is_closed = True

    def doc_count_all(self):
        return self._doccount

    def doc_count(self):
        total = 0
        for r in self._readers:
            total += r.doc_count()
        return total

    def _document_reader(self, docnum):
        return max(0, bisect_right(self._doc_offsets, docnum) - 1)

    def _reader_and_docnum(self, docnum):
        rnum = self._document_reader(docnum)
        offset = self._doc_offsets[rnum]
        return rnum, docnum - offset

    # Deletions

    def has_deletions(self):
        return any(r.has_deletions() for r in self._readers)

    def is_deleted(self, docnum):
        x, y = self._reader_and_docnum(docnum)
        return self._readers[x].is_deleted(y)

    def deleted_docs(self):
        for r, offset in izip(self._readers, self._doc_offsets):
            for docnum in r.deleted_docs():
                yield docnum + offset

    def all_doc_ids(self):
        for r, offset in izip(self._readers, self._doc_offsets):
            for docnum in r.all_doc_ids():
                yield docnum + offset

    # Columns

    def has_column(self, fieldname):
        return any(r.has_column(fieldname) for r in self._readers)

    def column_reader(self, fieldname, column):
        if not self.has_column(fieldname):
            raise ValueError("No column %r" % (fieldname,))

        default = column.default_value()
        colreaders = []
        for r in self._readers:
            if r.has_column(fieldname):
                cr = r.column_reader(fieldname, column)
            else:
                cr = columns.EmptyColumnReader(default, r.doc_count_all())
            colreaders.append(cr)

        if len(colreaders) == 1:
            return colreaders[0]
        else:
            return columns.MultiColumnReader(colreaders)

    # Lengths

    def doc_field_length(self, docnum, fieldname, default=0):
        x, y = self._reader_and_docnum(docnum)
        return self._readers[x].doc_field_length(y, fieldname, default)

    def field_length(self, fieldname):
        total = 0
        for r in self._readers:
            total += r.field_length(fieldname)
        return total

    # Note: the base class signatures take a fieldname argument, so accept it
    # here and pass it through to the sub-readers
    def min_field_length(self, fieldname):
        return min(r.min_field_length(fieldname) for r in self._readers)

    def max_field_length(self, fieldname):
        return max(r.max_field_length(fieldname) for r in self._readers)
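
# Worked example of the docnum mapping above (reader sizes assumed): with
# three sub-readers holding 5, 3, and 4 documents, _doc_offsets == [0, 5, 8].
# For global docnum 6, bisect_right([0, 5, 8], 6) - 1 == 1, so
# _reader_and_docnum(6) returns (1, 1): document 1 of the second reader.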


# Extended base classes

class PerDocWriterWithColumns(PerDocumentWriter):
    def __init__(self):
        PerDocumentWriter.__init__(self)
        # Implementations need to set these attributes
        self._storage = None
        self._segment = None
        self._docnum = None

    @abstractmethod
    def _has_column(self, fieldname):
        raise NotImplementedError

    @abstractmethod
    def _create_column(self, fieldname, column):
        raise NotImplementedError

    @abstractmethod
    def _get_column(self, fieldname):
        raise NotImplementedError

    def add_column_value(self, fieldname, column, value):
        if not self._has_column(fieldname):
            self._create_column(fieldname, column)
        self._get_column(fieldname).add(self._docnum, value)
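
# Sketch of the intent above: the first add_column_value() call for a field
# lazily creates its column writer via _create_column(); later calls reuse it
# through _get_column(). Subclasses only need to supply the three underscore
# primitives.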


# FieldCursor implementations

class EmptyCursor(FieldCursor):
    def first(self):
        return None

    def find(self, term):
        return None

    def next(self):
        return None

    def text(self):
        return None

    def term_info(self):
        return None

    def is_valid(self):
        return False