| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334 |
- # Copyright 2012 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- from __future__ import with_statement
- from bisect import bisect_left
- from threading import Lock, RLock
- from whoosh.compat import xrange
- from whoosh.codec import base
- from whoosh.matching import ListMatcher
- from whoosh.reading import SegmentReader, TermInfo, TermNotFound
- from whoosh.writing import SegmentWriter
class MemWriter(SegmentWriter):
    """Segment writer whose ``commit`` only finalizes the current segment."""

    def commit(self):
        """Finalize the segment being written.

        This calls ``_finalize_segment()`` and does nothing else.
        """
        self._finalize_segment()
class MemoryCodec(base.Codec):
    """Codec backed by one ``RamStorage`` and one shared ``MemSegment``.

    Every factory method below ignores the ``storage`` and ``segment``
    arguments it receives and uses the codec's own ``self.storage`` and
    ``self.segment`` instead.
    """

    def __init__(self):
        # RamStorage is imported inside the constructor rather than at module
        # level.
        from whoosh.filedb.filestore import RamStorage

        self.storage = RamStorage()
        self.segment = MemSegment(self, "blah")

    def writer(self, schema):
        """Create an index for ``schema`` in the codec's storage and return a
        ``MemWriter`` on it.

        The segment's current ``_doccount`` is passed to the writer as
        ``docbase``.
        """
        index = self.storage.create_index(schema)
        docbase = self.segment._doccount
        return MemWriter(index, _lk=False, codec=self, docbase=docbase)

    def reader(self, schema):
        """Return a ``SegmentReader`` over the codec's storage and segment."""
        return SegmentReader(self.storage, schema, self.segment, codec=self)

    def per_document_writer(self, storage, segment):
        """Return a ``MemPerDocWriter`` (arguments are ignored)."""
        return MemPerDocWriter(self.storage, self.segment)

    def field_writer(self, storage, segment):
        """Return a ``MemFieldWriter`` (arguments are ignored)."""
        return MemFieldWriter(self.storage, self.segment)

    def per_document_reader(self, storage, segment):
        """Return a ``MemPerDocReader`` (arguments are ignored)."""
        return MemPerDocReader(self.storage, self.segment)

    def terms_reader(self, storage, segment):
        """Return a ``MemTermsReader`` (arguments are ignored)."""
        return MemTermsReader(self.storage, self.segment)

    def new_segment(self, storage, indexname):
        """Return the codec's single shared segment (arguments are ignored)."""
        return self.segment
class MemPerDocWriter(base.PerDocWriterWithColumns):
    """Per-document writer that stores each document's data in the
    dictionaries of a ``MemSegment``.

    Stored values, field lengths and vectors are buffered between
    ``start_doc`` and ``finish_doc`` and then copied into the segment under
    its lock. Column data is written to ``"<fieldname>.c"`` files created in
    the storage.
    """

    def __init__(self, storage, segment):
        self._storage = storage
        self._segment = segment
        self.is_closed = False
        # Maps fieldname -> (column file, column writer).
        self._colwriters = {}
        # Number of times start_doc() has been called on this writer.
        self._doccount = 0

    def _has_column(self, fieldname):
        """Return True if a column writer exists for ``fieldname``."""
        return fieldname in self._colwriters

    def _create_column(self, fieldname, column):
        """Create the column file for ``fieldname`` and remember it together
        with the writer that ``column`` builds on top of it.
        """
        colfile = self._storage.create_file("%s.c" % fieldname)
        colwriter = column.writer(colfile)
        self._colwriters[fieldname] = (colfile, colwriter)

    def _get_column(self, fieldname):
        """Return the column writer registered for ``fieldname``."""
        _, colwriter = self._colwriters[fieldname]
        return colwriter

    def start_doc(self, docnum):
        """Begin buffering data for document ``docnum``."""
        self._docnum = docnum
        self._doccount += 1
        # Fresh per-document buffers; handed to the segment in finish_doc().
        self._stored = {}
        self._lengths = {}
        self._vectors = {}

    def add_field(self, fieldname, fieldobj, value, length):
        """Record the stored ``value`` and/or ``length`` of a field for the
        current document; ``None`` entries are skipped.
        """
        if value is not None:
            self._stored[fieldname] = value
        if length is not None:
            self._lengths[fieldname] = length

    def add_vector_items(self, fieldname, fieldobj, items):
        """Record ``items`` (frozen into a tuple) as the vector of
        ``fieldname`` for the current document.
        """
        self._vectors[fieldname] = tuple(items)

    def finish_doc(self):
        """Publish the buffered data of the current document to the segment
        while holding the segment's lock.
        """
        segment = self._segment
        with segment._lock:
            docnum = self._docnum
            segment._stored[docnum] = self._stored
            segment._lengths[docnum] = self._lengths
            segment._vectors[docnum] = self._vectors

    def close(self):
        """Finish every column writer with this writer's document count,
        close the column files and mark the writer as closed.
        """
        for colfile, colwriter in self._colwriters.values():
            colwriter.finish(self._doccount)
            colfile.close()
        self.is_closed = True
class MemPerDocReader(base.PerDocumentReader):
    """Per-document reader over the dictionaries held by a ``MemSegment``.

    Document counts and deletion queries are delegated to the segment.
    Stored fields, field lengths and vectors are read from the segment's
    ``_stored``, ``_lengths`` and ``_vectors`` dictionaries. Columns are read
    back from ``"<fieldname>.c"`` files in the storage.
    """

    def __init__(self, storage, segment):
        self._storage = storage
        self._segment = segment

    def doc_count(self):
        """Return the segment's ``doc_count()``."""
        return self._segment.doc_count()

    def doc_count_all(self):
        """Return the segment's ``doc_count_all()``."""
        return self._segment.doc_count_all()

    def has_deletions(self):
        """Return the segment's ``has_deletions()``."""
        return self._segment.has_deletions()

    def is_deleted(self, docnum):
        """Return the segment's ``is_deleted(docnum)``."""
        return self._segment.is_deleted(docnum)

    def deleted_docs(self):
        """Return the segment's ``deleted_docs()``."""
        return self._segment.deleted_docs()

    def supports_columns(self):
        """This reader always supports columns."""
        return True

    def has_column(self, fieldname):
        """Return True if the column file for ``fieldname`` exists in the
        storage.
        """
        filename = "%s.c" % fieldname
        return self._storage.file_exists(filename)

    def column_reader(self, fieldname, column):
        """Open the column file of ``fieldname`` and return the reader that
        ``column`` builds over the whole file.
        """
        filename = "%s.c" % fieldname
        colfile = self._storage.open_file(filename)
        length = self._storage.file_length(filename)
        return column.reader(colfile, 0, length, self._segment.doc_count_all())

    def doc_field_length(self, docnum, fieldname, default=0):
        """Return the recorded length of ``fieldname`` in document ``docnum``,
        or ``default`` if that document has no length for the field.
        """
        return self._segment._lengths[docnum].get(fieldname, default)

    def _field_lengths(self, fieldname):
        """Return a list with the recorded length of ``fieldname`` for every
        document that has one.
        """
        return [lens[fieldname]
                for lens in self._segment._lengths.values()
                if fieldname in lens]

    def field_length(self, fieldname):
        """Return the sum of the lengths of ``fieldname`` over all documents
        (0 if no document has a length for it).
        """
        return sum(lens.get(fieldname, 0) for lens
                   in self._segment._lengths.values())

    def min_field_length(self, fieldname):
        """Return the smallest recorded length of ``fieldname``.

        Returns 0 when no document has a length for the field, matching what
        ``field_length`` returns in that case.
        """
        lengths = self._field_lengths(fieldname)
        # min() raises ValueError on an empty sequence, so guard it.
        return min(lengths) if lengths else 0

    def max_field_length(self, fieldname):
        """Return the largest recorded length of ``fieldname``.

        Returns 0 when no document has a length for the field, matching what
        ``field_length`` returns in that case.
        """
        lengths = self._field_lengths(fieldname)
        # max() raises ValueError on an empty sequence, so guard it.
        return max(lengths) if lengths else 0

    def has_vector(self, docnum, fieldname):
        """Return True if a vector was stored for ``fieldname`` in document
        ``docnum``.
        """
        vectors = self._segment._vectors
        return docnum in vectors and fieldname in vectors[docnum]

    def vector(self, docnum, fieldname, format_):
        """Return a ``ListMatcher`` over the stored vector items of
        ``fieldname`` in document ``docnum``.

        Each stored item is unpacked as a 3-tuple whose parts are passed to
        ``ListMatcher`` as ids, weights and values respectively.
        """
        items = self._segment._vectors[docnum][fieldname]
        ids, weights, values = zip(*items)
        return ListMatcher(ids, weights, values, format_)

    def stored_fields(self, docnum):
        """Return the stored-fields dictionary of document ``docnum``."""
        return self._segment._stored[docnum]

    def close(self):
        """Nothing to release."""
        pass
class MemFieldWriter(base.FieldWriter):
    """Field writer that appends postings to a ``MemSegment``'s inverted
    index.

    Calls are expected in the order ``start_field`` -> ``start_term`` ->
    ``add``... -> ``finish_term`` -> ``finish_field``. Starting a field or
    term while one is already open, or finishing one that is not open,
    raises ``Exception``.
    """

    def __init__(self, storage, segment):
        self._storage = storage
        self._segment = segment
        # None means "not currently inside a field / term".
        self._fieldname = None
        self._btext = None
        self.is_closed = False

    def start_field(self, fieldname, fieldobj):
        """Open ``fieldname``, creating its term dictionary in the segment's
        inverted index if it does not exist yet.
        """
        if self._fieldname is not None:
            raise Exception("Called start_field in a field")

        segment = self._segment
        with segment._lock:
            segment._invindex.setdefault(fieldname, {})

        self._fieldname = fieldname
        self._fieldobj = fieldobj

    def start_term(self, btext):
        """Open the term ``btext`` in the current field, creating its posting
        list and ``TermInfo`` in the segment if they do not exist yet.
        """
        if self._btext is not None:
            raise Exception("Called start_term in a term")

        segment = self._segment
        fieldname = self._fieldname
        fielddict = segment._invindex[fieldname]
        terminfos = segment._terminfos
        key = (fieldname, btext)

        with segment._lock:
            postings = fielddict.setdefault(btext, [])
            # Checked explicitly so a TermInfo is only built when needed.
            if key not in terminfos:
                terminfos[key] = TermInfo()
            terminfo = terminfos[key]

        self._postings = postings
        self._terminfo = terminfo
        self._btext = btext

    def add(self, docnum, weight, vbytes, length):
        """Append a posting to the current term and update its ``TermInfo``."""
        posting = (docnum, weight, vbytes)
        self._postings.append(posting)
        self._terminfo.add_posting(docnum, weight, length)

    def finish_term(self):
        """Close the current term."""
        if self._btext is None:
            raise Exception("Called finish_term outside a term")
        self._btext = None
        self._postings = None
        self._terminfo = None

    def finish_field(self):
        """Close the current field."""
        if self._fieldname is None:
            raise Exception("Called finish_field outside a field")
        self._fieldname = None
        self._fieldobj = None

    def close(self):
        """Mark the writer as closed."""
        self.is_closed = True
class MemTermsReader(base.TermsReader):
    """Terms reader over a ``MemSegment``'s inverted index.

    ``_invindex`` maps fieldname -> {term bytes -> list of
    ``(docnum, weight, value)`` postings}. The segment's ``_terminfos``
    dictionary is keyed by ``(fieldname, term bytes)`` tuples.
    """

    def __init__(self, storage, segment):
        self._storage = storage
        self._segment = segment
        self._invindex = segment._invindex

    def __contains__(self, term):
        """Return True if ``term`` is a key of the segment's ``_terminfos``."""
        return term in self._segment._terminfos

    def terms(self):
        """Yield ``(fieldname, btext)`` for every term, ordered by field name
        and then by term.

        Both levels are sorted so the order is deterministic and agrees with
        the sorted order produced by ``terms_from``; plain dictionary
        iteration gives no such guarantee.
        """
        invindex = self._invindex
        for fieldname in sorted(invindex):
            for btext in sorted(invindex[fieldname]):
                yield (fieldname, btext)

    def terms_from(self, fieldname, prefix):
        """Yield ``(fieldname, btext)`` in sorted order for every term of
        ``fieldname`` that sorts at or after ``prefix``.

        :raises TermNotFound: if ``fieldname`` is not in the index. Because
            this is a generator, the error is raised when iteration starts.
        """
        if fieldname not in self._invindex:
            raise TermNotFound("Unknown field %r" % (fieldname,))

        terms = sorted(self._invindex[fieldname])
        # bisect_left gives the first position whose term is >= prefix; an
        # empty term list simply yields nothing.
        start = bisect_left(terms, prefix)
        for btext in terms[start:]:
            yield (fieldname, btext)

    def term_info(self, fieldname, text):
        """Return the ``TermInfo`` stored for ``(fieldname, text)``.

        :raises KeyError: if the term has no entry.
        """
        return self._segment._terminfos[fieldname, text]

    def matcher(self, fieldname, btext, format_, scorer=None):
        """Return a ``ListMatcher`` over the postings of the given term.

        Each posting is unpacked as a 3-tuple whose parts are passed to
        ``ListMatcher`` as ids, weights and values respectively.
        """
        items = self._invindex[fieldname][btext]
        ids, weights, values = zip(*items)
        return ListMatcher(ids, weights, values, format_, scorer=scorer)

    def indexed_field_names(self):
        """Return the field names present in the inverted index."""
        return self._invindex.keys()

    def close(self):
        """Nothing to release."""
        pass
class MemSegment(base.Segment):
    """Segment whose contents live entirely in Python dictionaries.

    A document counts as deleted when its number is no longer a key of
    ``_stored``; ``_doccount`` is the total set through ``set_doc_count``.
    """

    def __init__(self, codec, indexname):
        base.Segment.__init__(self, indexname)
        self._codec = codec
        self._doccount = 0
        # Per-document data, keyed by document number.
        self._stored = {}
        self._lengths = {}
        self._vectors = {}
        # fieldname -> {term bytes -> list of postings}.
        self._invindex = {}
        # (fieldname, term bytes) -> TermInfo.
        self._terminfos = {}
        # Taken by the writers while they modify the dictionaries above.
        self._lock = Lock()

    def codec(self):
        """Return the codec this segment was created with."""
        return self._codec

    def set_doc_count(self, doccount):
        """Set the total number of documents, deleted ones included."""
        self._doccount = doccount

    def doc_count(self):
        """Return the number of documents that still have stored data."""
        return len(self._stored)

    def doc_count_all(self):
        """Return the total set through ``set_doc_count``."""
        return self._doccount

    def delete_document(self, docnum, delete=True):
        """Remove the stored fields, lengths and vectors of ``docnum``.

        :raises Exception: if ``delete`` is false; undeleting is not
            supported.
        :raises KeyError: if ``docnum`` has no entry to delete.
        """
        if not delete:
            raise Exception("MemoryCodec can't undelete")

        with self._lock:
            # Same order as the three separate ``del`` statements it replaces.
            for table in (self._stored, self._lengths, self._vectors):
                del table[docnum]

    def has_deletions(self):
        """Return the number of missing documents (``_doccount`` minus the
        number of stored documents), which is truthy when any were deleted.
        """
        with self._lock:
            missing = self._doccount - len(self._stored)
            return missing

    def is_deleted(self, docnum):
        """Return True if ``docnum`` has no stored data."""
        return docnum not in self._stored

    def deleted_docs(self):
        """Yield every document number below ``doc_count_all()`` that has no
        stored data.
        """
        stored = self._stored
        for docnum in xrange(self.doc_count_all()):
            if docnum in stored:
                continue
            yield docnum

    def should_assemble(self):
        """Always False for this segment."""
        return False
|