# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

"""This module contains classes that allow reading from an index.
"""
from math import log
from bisect import bisect_right
from heapq import heapify, heapreplace, heappop, nlargest

from whoosh import columns
from whoosh.compat import abstractmethod
from whoosh.compat import xrange, zip_, next, iteritems
from whoosh.filedb.filestore import OverlayStorage
from whoosh.matching import MultiMatcher
from whoosh.support.levenshtein import distance
from whoosh.system import emptybytes
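# Illustrative usage sketch (not part of the original module). Readers are
# normally obtained from an Index rather than instantiated directly; this
# assumes an existing index in a directory named "indexdir":
#
#     from whoosh import index
#     ix = index.open_dir("indexdir")
#     with ix.reader() as reader:
#         print(reader.doc_count(), "live documents")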
# Exceptions

class ReaderClosed(Exception):
    """Exception raised when you try to do some operation on a closed searcher
    (or a Results object derived from a searcher that has since been closed).
    """

    message = "Operation on a closed reader"


class TermNotFound(Exception):
    pass
# Term Info base class

class TermInfo(object):
    """Represents a set of statistics about a term. This object is returned by
    :meth:`IndexReader.term_info`. These statistics may be useful for
    optimizations and scoring algorithms.
    """

    def __init__(self, weight=0, df=0, minlength=None,
                 maxlength=0, maxweight=0, minid=None, maxid=0):
        self._weight = weight
        self._df = df
        self._minlength = minlength
        self._maxlength = maxlength
        self._maxweight = maxweight
        self._minid = minid
        self._maxid = maxid
    def add_posting(self, docnum, weight, length=None):
        # Assumes postings are added in ascending docnum order, so the first
        # docnum seen is the minimum and the most recent is the maximum
        if self._minid is None:
            self._minid = docnum
        self._maxid = docnum
        self._weight += weight
        self._df += 1
        self._maxweight = max(self._maxweight, weight)
        if length is not None:
            if self._minlength is None:
                self._minlength = length
            else:
                self._minlength = min(self._minlength, length)
            self._maxlength = max(self._maxlength, length)
    def weight(self):
        """Returns the total frequency of the term across all documents.
        """

        return self._weight

    def doc_frequency(self):
        """Returns the number of documents the term appears in.
        """

        return self._df

    def min_length(self):
        """Returns the length of the shortest field value the term appears
        in.
        """

        return self._minlength

    def max_length(self):
        """Returns the length of the longest field value the term appears
        in.
        """

        return self._maxlength

    def max_weight(self):
        """Returns the number of times the term appears in the document in
        which it appears the most.
        """

        return self._maxweight

    def min_id(self):
        """Returns the lowest document ID this term appears in.
        """

        return self._minid

    def max_id(self):
        """Returns the highest document ID this term appears in.
        """

        return self._maxid
# Reader base class

class IndexReader(object):
    """Do not instantiate this object directly. Instead use Index.reader().
    """

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    @abstractmethod
    def __contains__(self, term):
        """Returns True if the given term tuple (fieldname, text) is
        in this reader.
        """

        raise NotImplementedError
    def codec(self):
        """Returns the :class:`whoosh.codec.base.Codec` object used to read
        this reader's segment. If this reader is not atomic
        (``reader.is_atomic() == False``), returns None.
        """

        return None

    def segment(self):
        """Returns the :class:`whoosh.index.Segment` object used by this
        reader. If this reader is not atomic
        (``reader.is_atomic() == False``), returns None.
        """

        return None

    def storage(self):
        """Returns the :class:`whoosh.filedb.filestore.Storage` object used by
        this reader to read its files. If the reader is not atomic
        (``reader.is_atomic() == False``), returns None.
        """

        return None
    def is_atomic(self):
        return True

    def _text_to_bytes(self, fieldname, text):
        if fieldname not in self.schema:
            raise TermNotFound((fieldname, text))
        return self.schema[fieldname].to_bytes(text)

    def close(self):
        """Closes the open files associated with this reader.
        """

        pass

    def generation(self):
        """Returns the generation of the index being read, or ``None`` if the
        backend is not versioned.
        """

        return None
    @abstractmethod
    def indexed_field_names(self):
        """Returns an iterable of strings representing the names of the
        indexed fields. This may include additional names not explicitly
        listed in the Schema if you use "glob" fields.
        """

        raise NotImplementedError

    @abstractmethod
    def all_terms(self):
        """Yields (fieldname, text) tuples for every term in the index.
        """

        raise NotImplementedError

    def terms_from(self, fieldname, prefix):
        """Yields (fieldname, text) tuples for every term in the index
        starting at the given prefix.
        """

        # The default implementation just scans the whole list of terms
        for fname, text in self.all_terms():
            if fname < fieldname or text < prefix:
                continue
            yield (fname, text)
    @abstractmethod
    def term_info(self, fieldname, text):
        """Returns a :class:`TermInfo` object allowing access to various
        statistics about the given term.
        """

        raise NotImplementedError

    def expand_prefix(self, fieldname, prefix):
        """Yields terms in the given field that start with the given prefix.
        """

        prefix = self._text_to_bytes(fieldname, prefix)
        for fn, text in self.terms_from(fieldname, prefix):
            if fn != fieldname or not text.startswith(prefix):
                return
            yield text

    def lexicon(self, fieldname):
        """Yields all bytestrings in the given field.
        """

        for fn, btext in self.terms_from(fieldname, emptybytes):
            if fn != fieldname:
                return
            yield btext

    def field_terms(self, fieldname):
        """Yields all term values (converted from on-disk bytes) in the given
        field.
        """

        from_bytes = self.schema[fieldname].from_bytes
        for btext in self.lexicon(fieldname):
            yield from_bytes(btext)
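    # Illustrative sketch (not part of the original module): enumerating a
    # field's vocabulary. Assumes a reader over an index with a "content"
    # field:
    #
    #     for btext in reader.lexicon("content"):
    #         ...   # raw on-disk bytestrings, in lexical order
    #     for word in reader.field_terms("content"):
    #         ...   # the same terms, decoded by the field's from_bytes()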
    def __iter__(self):
        """Yields ((fieldname, text), terminfo) tuples for each term in the
        reader, in lexical order.
        """

        term_info = self.term_info
        for term in self.all_terms():
            yield (term, term_info(*term))

    def iter_from(self, fieldname, text):
        """Yields ((fieldname, text), terminfo) tuples for all terms in the
        reader, starting at the given term.
        """

        term_info = self.term_info
        text = self._text_to_bytes(fieldname, text)
        for term in self.terms_from(fieldname, text):
            yield (term, term_info(*term))

    def iter_field(self, fieldname, prefix=''):
        """Yields (text, terminfo) tuples for all terms in the given field.
        """

        prefix = self._text_to_bytes(fieldname, prefix)
        for (fn, text), terminfo in self.iter_from(fieldname, prefix):
            if fn != fieldname:
                return
            yield text, terminfo

    def iter_prefix(self, fieldname, prefix):
        """Yields (text, terminfo) tuples for all terms in the given field
        with a certain prefix.
        """

        prefix = self._text_to_bytes(fieldname, prefix)
        for text, terminfo in self.iter_field(fieldname, prefix):
            if not text.startswith(prefix):
                return
            yield (text, terminfo)
    @abstractmethod
    def has_deletions(self):
        """Returns True if the underlying index/segment has deleted
        documents.
        """

        raise NotImplementedError

    def all_doc_ids(self):
        """Returns an iterator of all (undeleted) document IDs in the reader.
        """

        is_deleted = self.is_deleted
        return (docnum for docnum in xrange(self.doc_count_all())
                if not is_deleted(docnum))

    def iter_docs(self):
        """Yields a series of ``(docnum, stored_fields_dict)``
        tuples for the undeleted documents in the reader.
        """

        for docnum in self.all_doc_ids():
            yield docnum, self.stored_fields(docnum)
    @abstractmethod
    def is_deleted(self, docnum):
        """Returns True if the given document number is marked deleted.
        """

        raise NotImplementedError

    @abstractmethod
    def stored_fields(self, docnum):
        """Returns the stored fields for the given document number.
        """

        raise NotImplementedError
    def all_stored_fields(self):
        """Yields the stored fields for all non-deleted documents.
        """

        is_deleted = self.is_deleted
        for docnum in xrange(self.doc_count_all()):
            if not is_deleted(docnum):
                yield self.stored_fields(docnum)

    @abstractmethod
    def doc_count_all(self):
        """Returns the total number of documents, DELETED OR UNDELETED,
        in this reader.
        """

        raise NotImplementedError

    @abstractmethod
    def doc_count(self):
        """Returns the total number of UNDELETED documents in this reader.
        """

        return self.doc_count_all() - self.deleted_count()

    @abstractmethod
    def frequency(self, fieldname, text):
        """Returns the total number of instances of the given term in the
        collection.
        """

        raise NotImplementedError

    @abstractmethod
    def doc_frequency(self, fieldname, text):
        """Returns how many documents the given term appears in.
        """

        raise NotImplementedError

    @abstractmethod
    def field_length(self, fieldname):
        """Returns the total number of terms in the given field. This is used
        by some scoring algorithms.
        """

        raise NotImplementedError

    @abstractmethod
    def min_field_length(self, fieldname):
        """Returns the minimum length of the field across all documents. This
        is used by some scoring algorithms.
        """

        raise NotImplementedError
    @abstractmethod
    def max_field_length(self, fieldname):
        """Returns the maximum length of the field across all documents. This
        is used by some scoring algorithms.
        """

        raise NotImplementedError

    @abstractmethod
    def doc_field_length(self, docnum, fieldname, default=0):
        """Returns the number of terms in the given field in the given
        document. This is used by some scoring algorithms.
        """

        raise NotImplementedError
    def first_id(self, fieldname, text):
        """Returns the first ID in the posting list for the given term. This
        may be optimized in certain backends.
        """

        text = self._text_to_bytes(fieldname, text)
        p = self.postings(fieldname, text)
        if p.is_active():
            return p.id()
        raise TermNotFound((fieldname, text))

    def iter_postings(self):
        """Low-level method, yields all postings in the reader as
        ``(fieldname, text, docnum, weight, valuestring)`` tuples.
        """

        for fieldname, btext in self.all_terms():
            m = self.postings(fieldname, btext)
            while m.is_active():
                yield (fieldname, btext, m.id(), m.weight(), m.value())
                m.next()
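    # Illustrative sketch (not part of the original module): dumping every
    # posting in a small index, e.g. to inspect what was indexed:
    #
    #     for fieldname, btext, docnum, weight, value in reader.iter_postings():
    #         print(fieldname, btext, docnum, weight)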
    @abstractmethod
    def postings(self, fieldname, text):
        """Returns a :class:`~whoosh.matching.Matcher` for the postings of the
        given term.

        >>> pr = reader.postings("content", "render")
        >>> pr.skip_to(10)
        >>> pr.id()
        12

        :param fieldname: the field name or field number of the term.
        :param text: the text of the term.
        :rtype: :class:`whoosh.matching.Matcher`
        """

        raise NotImplementedError
    @abstractmethod
    def has_vector(self, docnum, fieldname):
        """Returns True if the given document has a term vector for the given
        field.
        """

        raise NotImplementedError

    @abstractmethod
    def vector(self, docnum, fieldname, format_=None):
        """Returns a :class:`~whoosh.matching.Matcher` object for the
        given term vector.

        >>> docnum = searcher.document_number(path=u'/a/b/c')
        >>> v = searcher.vector(docnum, "content")
        >>> v.all_as("frequency")
        [(u"apple", 3), (u"bear", 2), (u"cab", 2)]

        :param docnum: the document number of the document for which you want
            the term vector.
        :param fieldname: the field name or field number of the field for
            which you want the term vector.
        :rtype: :class:`whoosh.matching.Matcher`
        """

        raise NotImplementedError
    def vector_as(self, astype, docnum, fieldname):
        """Returns an iterator of (termtext, value) pairs for the terms in the
        given term vector. This is a convenient shortcut to calling vector()
        and using the Matcher object when all you want are the terms and/or
        values.

        >>> docnum = searcher.document_number(path=u'/a/b/c')
        >>> searcher.vector_as("frequency", docnum, "content")
        [(u"apple", 3), (u"bear", 2), (u"cab", 2)]

        :param docnum: the document number of the document for which you want
            the term vector.
        :param fieldname: the field name or field number of the field for
            which you want the term vector.
        :param astype: a string containing the name of the format you want the
            term vector's data in, for example "weights".
        """

        vec = self.vector(docnum, fieldname)
        if astype == "weight":
            while vec.is_active():
                yield (vec.id(), vec.weight())
                vec.next()
        else:
            format_ = self.schema[fieldname].format
            decoder = format_.decoder(astype)
            while vec.is_active():
                yield (vec.id(), decoder(vec.value()))
                vec.next()
    def corrector(self, fieldname):
        """Returns a :class:`whoosh.spelling.Corrector` object that suggests
        corrections based on the terms in the given field.
        """

        from whoosh.spelling import ReaderCorrector

        fieldobj = self.schema[fieldname]
        return ReaderCorrector(self, fieldname, fieldobj)

    def terms_within(self, fieldname, text, maxdist, prefix=0):
        """
        Returns a generator of words in the given field within ``maxdist``
        Damerau-Levenshtein edit distance of the given text.

        Important: the terms are returned in **no particular order**. The only
        criterion is that they are within ``maxdist`` edits of ``text``. You
        may want to run this method multiple times with increasing ``maxdist``
        values to ensure you get the closest matches first. You may also have
        additional information (such as term frequency or an acoustic matching
        algorithm) you can use to rank terms with the same edit distance.

        :param maxdist: the maximum edit distance.
        :param prefix: require suggestions to share a prefix of this length
            with the given word. This is often justifiable since most
            misspellings do not involve the first letter of the word.
            Using a prefix dramatically decreases the time it takes to
            generate the list of words.
        """

        fieldobj = self.schema[fieldname]
        for btext in self.expand_prefix(fieldname, text[:prefix]):
            word = fieldobj.from_bytes(btext)
            k = distance(word, text, limit=maxdist)
            if k <= maxdist:
                yield word
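    # Illustrative sketch (not part of the original module): finding terms
    # close to a probable misspelling, using the first-letter prefix
    # optimization described above:
    #
    #     for word in reader.terms_within("content", "recieve", maxdist=2,
    #                                     prefix=1):
    #         print(word)   # e.g. "receive", in no particular order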
    def most_frequent_terms(self, fieldname, number=5, prefix=''):
        """Returns the top 'number' most frequent terms in the given field as
        a list of (frequency, text) tuples.
        """

        gen = ((terminfo.weight(), text) for text, terminfo
               in self.iter_prefix(fieldname, prefix))
        return nlargest(number, gen)

    def most_distinctive_terms(self, fieldname, number=5, prefix=''):
        """Returns the top 'number' terms with the highest `tf*idf` scores as
        a list of (score, text) tuples.
        """

        N = float(self.doc_count())
        gen = ((terminfo.weight() * log(N / terminfo.doc_frequency()), text)
               for text, terminfo in self.iter_prefix(fieldname, prefix))
        return nlargest(number, gen)
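    # Illustrative note (not part of the original module): the score above is
    # total term weight times log(N / doc_frequency), so a term that occurs
    # often overall but in few documents scores highest.
    #
    #     print(reader.most_frequent_terms("content", number=3))
    #     print(reader.most_distinctive_terms("content", number=3))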
    def leaf_readers(self):
        """Returns a list of (IndexReader, docbase) pairs for the child
        readers of this reader if it is a composite reader. If this is not a
        composite reader, it returns `[(self, 0)]`.
        """

        return [(self, 0)]

    def supports_caches(self):
        return False

    def has_column(self, fieldname):
        return False

    def column_reader(self, fieldname, column=None, reverse=False,
                      translate=False):
        """
        :param fieldname: the name of the field for which to get a reader.
        :param column: if passed, use this Column object instead of the one
            associated with the field in the Schema.
        :param reverse: if passed, reverses the order of keys returned by the
            reader's ``sort_key()`` method. If the column type is not
            reversible, this will raise a ``NotImplementedError``.
        :param translate: if True, wrap the reader to call the field's
            ``from_bytes()`` method on the returned values.
        :return: a :class:`whoosh.columns.ColumnReader` object.
        """

        raise NotImplementedError
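    # Illustrative sketch (not part of the original module): random access to
    # a per-document value through a column reader. Assumes a "title" field
    # declared with a column type (e.g. fields.TEXT(sortable=True)):
    #
    #     if reader.has_column("title"):
    #         cr = reader.column_reader("title", translate=True)
    #         print(cr[0])   # the decoded "title" value for document 0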
# Segment-based reader

class SegmentReader(IndexReader):
    def __init__(self, storage, schema, segment, generation=None, codec=None):
        self.schema = schema
        self.is_closed = False

        self._segment = segment
        self._segid = self._segment.segment_id()
        self._gen = generation

        # self.files is a storage object from which to load the segment files.
        # This is different from the general storage (which will be used for
        # caches) if the segment is in a compound file.
        if segment.is_compound():
            # Open the compound file as a storage object
            files = segment.open_compound_file(storage)
            # Use an overlay here instead of just the compound storage; in
            # rare circumstances a segment file may be added after the segment
            # is written
            self._storage = OverlayStorage(files, storage)
        else:
            self._storage = storage

        # Get subreaders from codec
        self._codec = codec if codec else segment.codec()
        self._terms = self._codec.terms_reader(self._storage, segment)
        self._perdoc = self._codec.per_document_reader(self._storage, segment)

    def codec(self):
        return self._codec

    def segment(self):
        return self._segment

    def storage(self):
        return self._storage
    def has_deletions(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.has_deletions()

    def doc_count(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.doc_count()

    def doc_count_all(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.doc_count_all()

    def is_deleted(self, docnum):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.is_deleted(docnum)

    def generation(self):
        return self._gen

    def __repr__(self):
        return "%s(%r, %r)" % (self.__class__.__name__, self._storage,
                               self._segment)

    def __contains__(self, term):
        if self.is_closed:
            raise ReaderClosed
        fieldname, text = term
        if fieldname not in self.schema:
            return False
        text = self._text_to_bytes(fieldname, text)
        return (fieldname, text) in self._terms

    def close(self):
        if self.is_closed:
            raise ReaderClosed("Reader already closed")
        self._terms.close()
        self._perdoc.close()

        # It's possible some weird codec that doesn't use storage might have
        # passed None instead of a storage object
        if self._storage:
            self._storage.close()

        self.is_closed = True

    def stored_fields(self, docnum):
        if self.is_closed:
            raise ReaderClosed
        assert docnum >= 0
        schema = self.schema
        sfs = self._perdoc.stored_fields(docnum)
        # Double-check with schema to filter out removed fields
        return dict(item for item in iteritems(sfs) if item[0] in schema)
    # Delegate doc methods to the per-doc reader

    def all_doc_ids(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.all_doc_ids()

    def iter_docs(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.iter_docs()

    def all_stored_fields(self):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.all_stored_fields()

    def field_length(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.field_length(fieldname)

    def min_field_length(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.min_field_length(fieldname)

    def max_field_length(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.max_field_length(fieldname)

    def doc_field_length(self, docnum, fieldname, default=0):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.doc_field_length(docnum, fieldname, default)

    def has_vector(self, docnum, fieldname):
        if self.is_closed:
            raise ReaderClosed
        return self._perdoc.has_vector(docnum, fieldname)

    def _test_field(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        if self.schema[fieldname].format is None:
            raise TermNotFound("Field %r is not indexed" % fieldname)
    def indexed_field_names(self):
        return self._terms.indexed_field_names()

    def all_terms(self):
        if self.is_closed:
            raise ReaderClosed
        schema = self.schema
        return ((fieldname, text) for fieldname, text in self._terms.terms()
                if fieldname in schema)

    def terms_from(self, fieldname, prefix):
        self._test_field(fieldname)
        prefix = self._text_to_bytes(fieldname, prefix)
        schema = self.schema
        return ((fname, text) for fname, text
                in self._terms.terms_from(fieldname, prefix)
                if fname in schema)

    def term_info(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.term_info(fieldname, text)
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

    def expand_prefix(self, fieldname, prefix):
        self._test_field(fieldname)
        prefix = self._text_to_bytes(fieldname, prefix)
        return IndexReader.expand_prefix(self, fieldname, prefix)

    def lexicon(self, fieldname):
        self._test_field(fieldname)
        return IndexReader.lexicon(self, fieldname)

    def __iter__(self):
        if self.is_closed:
            raise ReaderClosed
        schema = self.schema
        return ((term, terminfo) for term, terminfo in self._terms.items()
                if term[0] in schema)

    def iter_from(self, fieldname, text):
        self._test_field(fieldname)
        schema = self.schema
        text = self._text_to_bytes(fieldname, text)
        for term, terminfo in self._terms.items_from(fieldname, text):
            if term[0] not in schema:
                continue
            yield (term, terminfo)
    def frequency(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.frequency(fieldname, text)
        except KeyError:
            return 0

    def doc_frequency(self, fieldname, text):
        self._test_field(fieldname)
        text = self._text_to_bytes(fieldname, text)
        try:
            return self._terms.doc_frequency(fieldname, text)
        except KeyError:
            return 0

    def postings(self, fieldname, text, scorer=None):
        from whoosh.matching.wrappers import FilterMatcher

        if self.is_closed:
            raise ReaderClosed
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        text = self._text_to_bytes(fieldname, text)
        format_ = self.schema[fieldname].format
        matcher = self._terms.matcher(fieldname, text, format_, scorer=scorer)
        deleted = frozenset(self._perdoc.deleted_docs())
        if deleted:
            matcher = FilterMatcher(matcher, deleted, exclude=True)
        return matcher

    def vector(self, docnum, fieldname, format_=None):
        if self.is_closed:
            raise ReaderClosed
        if fieldname not in self.schema:
            raise TermNotFound("No field %r" % fieldname)
        vformat = format_ or self.schema[fieldname].vector
        if not vformat:
            raise Exception("No vectors are stored for field %r" % fieldname)
        return self._perdoc.vector(docnum, fieldname, vformat)

    def cursor(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        fieldobj = self.schema[fieldname]
        return self._terms.cursor(fieldname, fieldobj)

    def terms_within(self, fieldname, text, maxdist, prefix=0):
        # Replaces the horribly inefficient base implementation with one based
        # on skipping through the word list efficiently using a DFA
        fieldobj = self.schema[fieldname]
        spellfield = fieldobj.spelling_fieldname(fieldname)
        auto = self._codec.automata(self._storage, self._segment)
        fieldcur = self.cursor(spellfield)
        return auto.terms_within(fieldcur, text, maxdist, prefix)
    # Column methods

    def has_column(self, fieldname):
        if self.is_closed:
            raise ReaderClosed
        coltype = self.schema[fieldname].column_type
        return coltype and self._perdoc.has_column(fieldname)

    def column_reader(self, fieldname, column=None, reverse=False,
                      translate=True):
        if self.is_closed:
            raise ReaderClosed

        fieldobj = self.schema[fieldname]
        column = column or fieldobj.column_type
        if not column:
            raise Exception("No column for field %r in %r"
                            % (fieldname, self))

        if self._perdoc.has_column(fieldname):
            creader = self._perdoc.column_reader(fieldname, column)
            if reverse:
                creader.set_reverse()
        else:
            # This segment doesn't have a column file for this field, so
            # create a fake column reader that always returns the default
            # value
            default = column.default_value(reverse)
            creader = columns.EmptyColumnReader(default, self.doc_count_all())

        if translate:
            # Wrap the column in a Translator to give the caller
            # nice values instead of sortable representations
            fcv = fieldobj.from_column_value
            creader = columns.TranslatingColumnReader(creader, fcv)

        return creader
# Fake IndexReader class for empty indexes

class EmptyReader(IndexReader):
    def __init__(self, schema):
        self.schema = schema

    def __contains__(self, term):
        return False

    def __iter__(self):
        return iter([])

    def cursor(self, fieldname):
        from whoosh.codec.base import EmptyCursor

        return EmptyCursor()

    def indexed_field_names(self):
        return []

    def all_terms(self):
        return iter([])

    def term_info(self, fieldname, text):
        raise TermNotFound((fieldname, text))

    def iter_from(self, fieldname, text):
        return iter([])

    def iter_field(self, fieldname, prefix=''):
        return iter([])

    def iter_prefix(self, fieldname, prefix=''):
        return iter([])

    def lexicon(self, fieldname):
        return iter([])

    def has_deletions(self):
        return False

    def is_deleted(self, docnum):
        return False

    def stored_fields(self, docnum):
        raise KeyError("No document number %s" % docnum)

    def all_stored_fields(self):
        return iter([])

    def doc_count_all(self):
        return 0

    def doc_count(self):
        return 0

    def frequency(self, fieldname, text):
        return 0

    def doc_frequency(self, fieldname, text):
        return 0

    def field_length(self, fieldname):
        return 0

    def min_field_length(self, fieldname):
        return 0

    def max_field_length(self, fieldname):
        return 0

    def doc_field_length(self, docnum, fieldname, default=0):
        return default

    def postings(self, fieldname, text, scorer=None):
        raise TermNotFound("%s:%r" % (fieldname, text))

    def has_vector(self, docnum, fieldname):
        return False

    def vector(self, docnum, fieldname, format_=None):
        raise KeyError("No document number %s" % docnum)

    def most_frequent_terms(self, fieldname, number=5, prefix=''):
        return iter([])

    def most_distinctive_terms(self, fieldname, number=5, prefix=None):
        return iter([])
# Multisegment reader class

class MultiReader(IndexReader):
    """Do not instantiate this object directly. Instead use Index.reader().
    """

    def __init__(self, readers, generation=None):
        self.readers = readers
        self._gen = generation

        self.schema = None
        if readers:
            self.schema = readers[0].schema

        self.doc_offsets = []
        self.base = 0
        for r in self.readers:
            self.doc_offsets.append(self.base)
            self.base += r.doc_count_all()

        self.is_closed = False

    def _document_segment(self, docnum):
        return max(0, bisect_right(self.doc_offsets, docnum) - 1)

    def _segment_and_docnum(self, docnum):
        segmentnum = self._document_segment(docnum)
        offset = self.doc_offsets[segmentnum]
        return segmentnum, docnum - offset
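    # Illustrative worked example (not part of the original module): with
    # three sub-readers of 10, 5, and 20 documents, doc_offsets is
    # [0, 10, 15]. Global docnum 12 maps to segment 1, local docnum 2:
    #
    #     bisect_right([0, 10, 15], 12) - 1   # -> 1
    #     12 - doc_offsets[1]                 # -> 2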
    def cursor(self, fieldname):
        return MultiCursor([r.cursor(fieldname) for r in self.readers])

    def is_atomic(self):
        return False

    def leaf_readers(self):
        return zip_(self.readers, self.doc_offsets)

    def add_reader(self, reader):
        self.readers.append(reader)
        self.doc_offsets.append(self.base)
        self.base += reader.doc_count_all()

    def close(self):
        for d in self.readers:
            d.close()
        self.is_closed = True

    def generation(self):
        return self._gen

    def format(self, fieldname):
        for r in self.readers:
            fmt = r.format(fieldname)
            if fmt is not None:
                return fmt

    def vector_format(self, fieldname):
        for r in self.readers:
            vfmt = r.vector_format(fieldname)
            if vfmt is not None:
                return vfmt
    # Term methods

    def __contains__(self, term):
        return any(r.__contains__(term) for r in self.readers)

    def _merge_terms(self, iterlist):
        # Merge-sorts terms coming from a list of term iterators

        # Create a map so we can look up each iterator by its id() value
        itermap = {}
        for it in iterlist:
            itermap[id(it)] = it

        # Fill in the list with the head term from each iterator.
        current = []
        for it in iterlist:
            try:
                term = next(it)
            except StopIteration:
                continue
            current.append((term, id(it)))

        # Number of active iterators
        active = len(current)

        # If only one iterator is active, just yield from it and return
        if active == 1:
            term, itid = current[0]
            it = itermap[itid]
            yield term
            for term in it:
                yield term
            return

        # Otherwise, do a streaming heap sort of the terms from the iterators
        heapify(current)
        while active:
            # Peek at the first term in the sorted list
            term = current[0][0]

            # Re-iterate on all items in the list that have that term
            while active and current[0][0] == term:
                it = itermap[current[0][1]]
                try:
                    nextterm = next(it)
                    heapreplace(current, (nextterm, id(it)))
                except StopIteration:
                    heappop(current)
                    active -= 1

            # Yield the term
            yield term
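    # Illustrative note (not part of the original module): the heap-based
    # merge above behaves like heapq.merge over the already-sorted
    # per-segment term iterators, except that a term appearing in several
    # segments is yielded only once:
    #
    #     import heapq
    #     a = iter([("f", b"apple"), ("f", b"cherry")])
    #     b = iter([("f", b"apple"), ("f", b"banana")])
    #     list(heapq.merge(a, b))   # keeps the duplicate ("f", b"apple")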
    def indexed_field_names(self):
        names = set()
        for r in self.readers:
            names.update(r.indexed_field_names())
        return iter(names)

    def all_terms(self):
        return self._merge_terms([r.all_terms() for r in self.readers])

    def terms_from(self, fieldname, prefix):
        return self._merge_terms([r.terms_from(fieldname, prefix)
                                  for r in self.readers])

    def term_info(self, fieldname, text):
        term = (fieldname, text)

        # Get the term infos for the sub-readers containing the term
        tis = [(r.term_info(fieldname, text), offset) for r, offset
               in zip_(self.readers, self.doc_offsets) if term in r]
        if not tis:
            raise TermNotFound(term)

        # combine_terminfos() merges the statistics, adding each sub-reader's
        # doc offset to its document IDs
        return combine_terminfos(tis)
    def frequency(self, fieldname, text):
        return sum(r.frequency(fieldname, text) for r in self.readers)

    def doc_frequency(self, fieldname, text):
        return sum(r.doc_frequency(fieldname, text) for r in self.readers)

    def postings(self, fieldname, text):
        # This method does not add a scorer; for that, use Searcher.postings()

        postreaders = []
        docoffsets = []
        term = (fieldname, text)
        for i, r in enumerate(self.readers):
            if term in r:
                offset = self.doc_offsets[i]
                pr = r.postings(fieldname, text)
                postreaders.append(pr)
                docoffsets.append(offset)

        if not postreaders:
            raise TermNotFound((fieldname, text))

        return MultiMatcher(postreaders, docoffsets)
    def first_id(self, fieldname, text):
        for i, r in enumerate(self.readers):
            try:
                id = r.first_id(fieldname, text)
            except (KeyError, TermNotFound):
                pass
            else:
                if id is None:
                    raise TermNotFound((fieldname, text))
                else:
                    return self.doc_offsets[i] + id

        raise TermNotFound((fieldname, text))

    # Deletion methods

    def has_deletions(self):
        return any(r.has_deletions() for r in self.readers)

    def is_deleted(self, docnum):
        segmentnum, segmentdoc = self._segment_and_docnum(docnum)
        return self.readers[segmentnum].is_deleted(segmentdoc)

    def stored_fields(self, docnum):
        segmentnum, segmentdoc = self._segment_and_docnum(docnum)
        return self.readers[segmentnum].stored_fields(segmentdoc)

    # Columns

    def has_column(self, fieldname):
        return any(r.has_column(fieldname) for r in self.readers)

    def column_reader(self, fieldname, column=None, reverse=False,
                      translate=True):
        crs = []
        doc_offsets = []
        for i, r in enumerate(self.readers):
            if r.has_column(fieldname):
                cr = r.column_reader(fieldname, column=column,
                                     reverse=reverse, translate=translate)
                crs.append(cr)
                doc_offsets.append(self.doc_offsets[i])
        return columns.MultiColumnReader(crs, doc_offsets)
    # Per-doc methods

    def all_stored_fields(self):
        for reader in self.readers:
            for result in reader.all_stored_fields():
                yield result

    def doc_count_all(self):
        return sum(dr.doc_count_all() for dr in self.readers)

    def doc_count(self):
        return sum(dr.doc_count() for dr in self.readers)

    def field_length(self, fieldname):
        return sum(dr.field_length(fieldname) for dr in self.readers)

    def min_field_length(self, fieldname):
        return min(r.min_field_length(fieldname) for r in self.readers)

    def max_field_length(self, fieldname):
        return max(r.max_field_length(fieldname) for r in self.readers)

    def doc_field_length(self, docnum, fieldname, default=0):
        segmentnum, segmentdoc = self._segment_and_docnum(docnum)
        reader = self.readers[segmentnum]
        return reader.doc_field_length(segmentdoc, fieldname, default=default)

    def has_vector(self, docnum, fieldname):
        segmentnum, segmentdoc = self._segment_and_docnum(docnum)
        return self.readers[segmentnum].has_vector(segmentdoc, fieldname)

    def vector(self, docnum, fieldname, format_=None):
        segmentnum, segmentdoc = self._segment_and_docnum(docnum)
        # Pass the optional format override through to the sub-reader
        return self.readers[segmentnum].vector(segmentdoc, fieldname, format_)

    def vector_as(self, astype, docnum, fieldname):
        segmentnum, segmentdoc = self._segment_and_docnum(docnum)
        return self.readers[segmentnum].vector_as(astype, segmentdoc,
                                                  fieldname)
def combine_terminfos(tis):
    if len(tis) == 1:
        ti, offset = tis[0]
        ti._minid += offset
        ti._maxid += offset
        return ti

    # Combine the various statistics
    w = sum(ti.weight() for ti, _ in tis)
    df = sum(ti.doc_frequency() for ti, _ in tis)
    ml = min(ti.min_length() for ti, _ in tis)
    xl = max(ti.max_length() for ti, _ in tis)
    xw = max(ti.max_weight() for ti, _ in tis)

    # For min and max ID, we need to add the doc offsets
    mid = min(ti.min_id() + offset for ti, offset in tis)
    xid = max(ti.max_id() + offset for ti, offset in tis)

    return TermInfo(w, df, ml, xl, xw, mid, xid)
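# Illustrative sketch (not part of the original module): combining statistics
# for a term found in two segments whose documents start at global offsets 0
# and 10:
#
#     ti1 = TermInfo(weight=3, df=2, minlength=4, maxlength=9, maxweight=2,
#                    minid=0, maxid=7)
#     ti2 = TermInfo(weight=1, df=1, minlength=5, maxlength=5, maxweight=1,
#                    minid=2, maxid=2)
#     combined = combine_terminfos([(ti1, 0), (ti2, 10)])
#     assert (combined.min_id(), combined.max_id()) == (0, 12)
#     assert combined.weight() == 4 and combined.doc_frequency() == 3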
class MultiCursor(object):
    def __init__(self, cursors):
        self._cursors = [c for c in cursors if c.is_valid()]
        self._low = []
        self._text = None
        self.next()

    def _find_low(self):
        # Gather the cursors positioned on the lowest term among the
        # sub-cursors; that term becomes this cursor's current term
        low = []
        lowterm = None

        for c in self._cursors:
            if c.is_valid():
                cterm = c.term()
                if not low or cterm < lowterm:
                    low = [c]
                    lowterm = cterm
                elif cterm == lowterm:
                    low.append(c)

        self._low = low
        self._text = lowterm
        return lowterm

    def first(self):
        for c in self._cursors:
            c.first()
        return self._find_low()

    def find(self, term):
        for c in self._cursors:
            c.find(term)
        return self._find_low()

    def next(self):
        for c in self._cursors:
            c.next()
        return self._find_low()

    def term_info(self):
        # combine_terminfos() expects (terminfo, offset) pairs; the
        # sub-cursors don't know their document offsets, so use 0
        tis = [(c.term_info(), 0) for c in self._low]
        return combine_terminfos(tis) if tis else None

    def is_valid(self):
        return any(c.is_valid() for c in self._cursors)