12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710 |
- # Copyright 2007 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- """Contains the main functions/classes for creating, maintaining, and using
- an index.
- """
- from __future__ import division
- import os.path, re, sys
- from time import time, sleep
- from whoosh import __version__
- from whoosh.legacy import toc_loaders
- from whoosh.compat import pickle, string_type
- from whoosh.fields import ensure_schema
- from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE
- _DEF_INDEX_NAME = "MAIN"
- _CURRENT_TOC_VERSION = -111
- # Exceptions
class LockError(Exception):
    """Exception type for lock-related failures.

    NOTE(review): nothing in the visible part of this module raises this
    class; presumably it is raised by the locking code elsewhere -- confirm.
    """
class IndexError(Exception):
    """Generic index error.

    Note that, inside this module, this name shadows the built-in
    ``IndexError``; this class derives directly from ``Exception``.
    """
class IndexVersionError(IndexError):
    """Raised when an index is stored in a format this version of Whoosh
    cannot read, i.e. the index being opened is neither backward nor forward
    compatible with the running code.

    :param msg: the human-readable error message.
    :param version: the format version number found in the index.
    :param release: the release version that wrote the index, if known.
    """

    def __init__(self, msg, version, release=None):
        # Only the message becomes part of ``args``; the version details are
        # exposed as plain attributes for callers that want to inspect them.
        super(IndexVersionError, self).__init__(msg)
        self.release = release
        self.version = version
class OutOfDateError(IndexError):
    """Raised on an attempt to commit changes to an index object that does
    not represent the latest generation of the index.
    """
class EmptyIndexError(IndexError):
    """Raised on an attempt to work with an index that has no indexed terms.
    """
- # Convenience functions
def create_in(dirname, schema, indexname=None):
    """Creates an index in a directory, building the FileStorage object for
    the directory on the caller's behalf.

    :param dirname: the path string of the directory in which to create the
        index.
    :param schema: a :class:`whoosh.fields.Schema` object describing the
        index's fields.
    :param indexname: the name of the index to create; only needed when
        several indexes live in the same storage object. Any false value
        (``None`` or an empty string) selects the default index name.
    :returns: :class:`Index`
    """

    from whoosh.filedb.filestore import FileStorage

    name = indexname or _DEF_INDEX_NAME
    return FileIndex.create(FileStorage(dirname), schema, name)
def open_dir(dirname, indexname=None, readonly=False, schema=None):
    """Opens an existing index in a directory, building the FileStorage
    object for the directory on the caller's behalf.

    :param dirname: the path string of the directory containing the index.
    :param indexname: the name of the index to open; only needed when
        several indexes live in the same storage object. ``None`` selects
        the default index name.
    :param readonly: passed through to the FileStorage constructor.
    :param schema: optional schema passed through to the FileIndex
        constructor.
    :returns: :class:`FileIndex`
    """

    from whoosh.filedb.filestore import FileStorage

    name = _DEF_INDEX_NAME if indexname is None else indexname
    store = FileStorage(dirname, readonly=readonly)
    return FileIndex(store, schema=schema, indexname=name)
def exists_in(dirname, indexname=None):
    """Returns True if dirname contains a Whoosh index.

    :param dirname: the file path of a directory.
    :param indexname: the name of the index. If None, the default index name
        is used.
    """

    # A path that does not exist cannot hold an index; skip opening it.
    if not os.path.exists(dirname):
        return False

    try:
        return open_dir(dirname, indexname=indexname).latest_generation() > -1
    except EmptyIndexError:
        # The directory exists but holds no index under this name.
        return False
def exists(storage, indexname=None):
    """Deprecated; use ``storage.index_exists()``.

    Simply forwards to ``storage.index_exists(indexname)`` and returns its
    result.

    :param storage: a store.Storage object.
    :param indexname: the name of the index, passed to the storage object
        unchanged (including ``None``).
    """

    found = storage.index_exists(indexname)
    return found
def version_in(dirname, indexname=None):
    """Returns a tuple of (release_version, format_version) for the index in
    the given directory.

    ``release_version`` is the release version number of the Whoosh code
    that created the index -- e.g. (0, 1, 24) -- and ``format_version`` is
    the version number of the on-disk format used for the index -- e.g.
    -102.

    Avoid attaching significance to the format version: it is only a version
    number for the TOC file and probably should not have been exposed in a
    public interface. The best way to find out whether the current version
    of Whoosh can open an index is to try to open it and see whether a
    ``whoosh.index.IndexVersionError`` is raised.

    The release and format version are also available as the ``release`` and
    ``version`` attributes of the Index object.

    :param dirname: the file path of a directory containing an index.
    :param indexname: the name of the index. If None, the default index name
        is used.
    :returns: ((major_ver, minor_ver, build_ver), format_ver)
    """

    from whoosh.filedb.filestore import FileStorage

    return version(FileStorage(dirname), indexname=indexname)
def version(storage, indexname=None):
    """Returns a tuple of (release_version, format_version) for an index in
    the given storage.

    ``release_version`` is the release version number of the Whoosh code
    that created the index -- e.g. (0, 1, 24) -- and ``format_version`` is
    the version number of the on-disk format used for the index -- e.g.
    -102. If opening the index raises ``IndexVersionError``, the release is
    reported as ``None`` and the format version is taken from the exception.

    Avoid attaching significance to the format version: it is only a version
    number for the TOC file and probably should not have been exposed in a
    public interface. The best way to find out whether the current version
    of Whoosh can open an index is to try to open it and see whether a
    ``whoosh.index.IndexVersionError`` is raised.

    The release and format version are also available as the ``release`` and
    ``version`` attributes of the Index object.

    :param storage: a store.Storage object.
    :param indexname: the name of the index. If None, the default index name
        is used.
    :returns: ((major_ver, minor_ver, build_ver), format_ver)
    """

    name = _DEF_INDEX_NAME if indexname is None else indexname
    try:
        ix = storage.open_index(name)
        return (ix.release, ix.version)
    except IndexVersionError:
        # The exception carries the unreadable format's version number.
        return (None, sys.exc_info()[1].version)
- # Index base class
class Index(object):
    """Represents an indexed collection of documents.

    This base class supplies default behaviour built on top of ``reader()``
    and ``writer()``, which (together with ``is_empty()``) raise
    ``NotImplementedError`` here and must be provided by subclasses.
    """

    def close(self):
        """Closes any open resources held by the Index object itself. This
        may not close all resources being used everywhere, for example by a
        Searcher object. The base implementation does nothing.
        """

        return None

    def add_field(self, fieldname, fieldspec):
        """Adds a field to the index's schema by opening a writer, adding
        the field and committing.

        :param fieldname: the name of the field to add.
        :param fieldspec: an instantiated :class:`whoosh.fields.FieldType`
            object.
        """

        ixwriter = self.writer()
        ixwriter.add_field(fieldname, fieldspec)
        ixwriter.commit()

    def remove_field(self, fieldname):
        """Removes the named field from the index's schema by opening a
        writer, removing the field and committing. Depending on the backend
        implementation, this may or may not actually remove existing data for
        the field from the index. Optimizing the index should always clear
        out existing data for a removed field.

        :param fieldname: the name of the field to remove.
        """

        ixwriter = self.writer()
        ixwriter.remove_field(fieldname)
        ixwriter.commit()

    def latest_generation(self):
        """Returns the generation number of the latest generation of this
        index, or -1 if the backend doesn't support versioning. The base
        implementation always returns -1.
        """

        return -1

    def refresh(self):
        """Returns a new Index object representing the latest generation
        of this index (if this object is the latest generation, or the
        backend doesn't support versioning, returns self). The base
        implementation always returns self.

        :returns: :class:`Index`
        """

        return self

    def up_to_date(self):
        """Returns True if this object represents the latest generation of
        this index. Returns False if this object is not the latest generation
        (that is, someone else has updated the index since you opened this
        object). The base implementation always returns True.
        """

        return True

    def last_modified(self):
        """Returns the last modified time of the index, or -1 if the backend
        doesn't support last-modified times. The base implementation always
        returns -1.
        """

        return -1

    def is_empty(self):
        """Returns True if this index is empty (that is, it has never had
        any documents successfully written to it). Must be implemented by
        subclasses.
        """

        raise NotImplementedError

    def optimize(self):
        """Optimizes this index, if necessary. The base implementation does
        nothing.
        """

        return None

    def doc_count_all(self):
        """Returns the total number of documents, DELETED OR UNDELETED,
        in this index. Opens a reader for the query and closes it afterward.
        """

        ixreader = self.reader()
        try:
            total = ixreader.doc_count_all()
        finally:
            ixreader.close()
        return total

    def doc_count(self):
        """Returns the total number of UNDELETED documents in this index.
        Opens a reader for the query and closes it afterward.
        """

        ixreader = self.reader()
        try:
            total = ixreader.doc_count()
        finally:
            ixreader.close()
        return total

    def searcher(self, **kwargs):
        """Returns a Searcher object for this index, wrapping a newly opened
        reader. Keyword arguments are passed to the Searcher object's
        constructor.

        :rtype: :class:`whoosh.searching.Searcher`
        """

        from whoosh.searching import Searcher

        return Searcher(self.reader(), fromindex=self, **kwargs)

    def field_length(self, fieldname):
        """Returns the total length of the field across all documents.
        Opens a reader for the query and closes it afterward.

        :param fieldname: the name of the field to measure.
        """

        ixreader = self.reader()
        try:
            length = ixreader.field_length(fieldname)
        finally:
            ixreader.close()
        return length

    def max_field_length(self, fieldname):
        """Returns the maximum length of the field across all documents.
        Opens a reader for the query and closes it afterward.

        :param fieldname: the name of the field to measure.
        """

        ixreader = self.reader()
        try:
            length = ixreader.max_field_length(fieldname)
        finally:
            ixreader.close()
        return length

    def reader(self, reuse=None):
        """Returns an IndexReader object for this index. Must be implemented
        by subclasses.

        :param reuse: an existing reader. Some implementations may recycle
            resources from this existing reader to create the new reader.
            Note that any resources in the "recycled" reader that are not
            used by the new reader will be CLOSED, so you CANNOT use it
            afterward.
        :rtype: :class:`whoosh.reading.IndexReader`
        """

        raise NotImplementedError

    def writer(self, **kwargs):
        """Returns an IndexWriter object for this index. Must be implemented
        by subclasses.

        :rtype: :class:`whoosh.writing.IndexWriter`
        """

        raise NotImplementedError

    def delete_by_term(self, fieldname, text, searcher=None):
        """Opens a writer, forwards the arguments to its ``delete_by_term``
        method and commits.
        """

        ixwriter = self.writer()
        ixwriter.delete_by_term(fieldname, text, searcher=searcher)
        ixwriter.commit()

    def delete_by_query(self, q, searcher=None):
        """Opens a writer, forwards the arguments to its ``delete_by_query``
        method and commits.
        """

        ixwriter = self.writer()
        ixwriter.delete_by_query(q, searcher=searcher)
        ixwriter.commit()
- # Codec-based index implementation
def clean_files(storage, indexname, gen, segments):
    """Attempts to remove unused index files (called when a new generation
    is created).

    A TOC file is unused when its generation number differs from ``gen``; a
    segment file is unused when its segment name is not the ID of one of the
    given segments. Files whose names start with "." are never touched. If
    existing Index and/or reader objects have the files open, they may not
    be deleted immediately (i.e. on Windows) but will probably be deleted
    eventually by a later call to clean_files.

    :param storage: the storage object to scan and delete from.
    :param indexname: the name of the index whose files are examined.
    :param gen: the generation number of the TOC file to keep.
    :param segments: the segment objects that are still in use.
    """

    live_ids = set(seg.segment_id() for seg in segments)
    toc_re = TOC._pattern(indexname)
    seg_re = TOC._segment_pattern(indexname)

    def is_stale(fname):
        # Decides whether a single file in the storage should be removed.
        if fname.startswith("."):
            return False
        toc_match = toc_re.match(fname)
        if toc_match:
            return int(toc_match.group(1)) != gen
        seg_match = seg_re.match(fname)
        if seg_match:
            return seg_match.group(1) not in live_ids
        return False

    # Decide everything first, then delete, so the storage is not modified
    # while it is being scanned.
    stale = set(fname for fname in storage if is_stale(fname))
    for fname in stale:
        try:
            storage.delete_file(fname)
        except OSError:
            # Another process still has this file open, I guess
            pass
class FileIndex(Index):
    """Index implementation whose state is described by TOC files kept in a
    Storage object.

    The object does not cache the TOC: the schema, release, version and
    segment list are re-read from storage on every access.
    """

    def __init__(self, storage, schema=None, indexname=_DEF_INDEX_NAME):
        """
        :param storage: the Storage object holding the index files.
        :param schema: optional schema; when given it is normalized with
            ``ensure_schema`` and passed along whenever the TOC is read.
        :param indexname: the name of the index inside the storage.
        :raises ValueError: if ``storage`` is not a Storage instance or
            ``indexname`` is not a string.
        """

        from whoosh.filedb.filestore import Storage

        if not isinstance(storage, Storage):
            raise ValueError("%r is not a Storage object" % storage)
        if not isinstance(indexname, string_type):
            raise ValueError("indexname %r is not a string" % indexname)

        self.storage = storage
        self._schema = ensure_schema(schema) if schema else schema
        self.indexname = indexname

        # Try reading the TOC to see if it's possible
        TOC.read(self.storage, self.indexname, schema=self._schema)

    @classmethod
    def create(cls, storage, schema, indexname=_DEF_INDEX_NAME):
        """Writes a fresh TOC for a new index into ``storage`` and returns an
        index object opened on it.
        """

        TOC.create(storage, schema, indexname)
        return cls(storage, schema, indexname)

    def __repr__(self):
        details = (self.__class__.__name__, self.storage, self.indexname)
        return "%s(%r, %r)" % details

    def close(self):
        """Does nothing; the index object itself holds no open resources."""

        return None

    # add_field
    # remove_field

    def latest_generation(self):
        """Returns the highest generation number among the TOC files in the
        storage, or -1 if there are none.
        """

        return TOC._latest_generation(self.storage, self.indexname)

    # refresh
    # up_to_date

    def last_modified(self):
        """Returns the storage's modification time for the TOC file of the
        latest generation.
        """

        latest_toc = TOC._filename(self.indexname, self.latest_generation())
        return self.storage.file_modified(latest_toc)

    def is_empty(self):
        """Returns True if the current TOC lists no segments."""

        return len(self._read_toc().segments) == 0

    def optimize(self, **kwargs):
        """Opens a writer (keyword arguments are passed to ``writer()``) and
        commits it with ``optimize=True``.
        """

        self.writer(**kwargs).commit(optimize=True)

    # searcher

    def writer(self, procs=1, **kwargs):
        """Returns a writer for this index.

        :param procs: when greater than 1, a multiprocessing ``MpWriter`` is
            returned; otherwise a ``SegmentWriter``.
        :param kwargs: passed to the writer's constructor.
        """

        if procs > 1:
            from whoosh.multiproc import MpWriter
            return MpWriter(self, procs=procs, **kwargs)

        from whoosh.writing import SegmentWriter
        return SegmentWriter(self, **kwargs)

    def lock(self, name):
        """Returns a lock object that you can try to call acquire() on to
        lock the index. The lock is obtained from the storage under the name
        ``<indexname>_<name>``.
        """

        lockname = self.indexname + "_" + name
        return self.storage.lock(lockname)

    def _read_toc(self):
        # Loads the latest TOC from storage on every call.
        return TOC.read(self.storage, self.indexname, schema=self._schema)

    def _segments(self):
        return self._read_toc().segments

    def _current_schema(self):
        return self._read_toc().schema

    @property
    def schema(self):
        return self._current_schema()

    @property
    def release(self):
        return self._read_toc().release

    @property
    def version(self):
        return self._read_toc().version

    @classmethod
    def _reader(cls, storage, schema, segments, generation, reuse=None):
        """Returns a reader for the given segments, possibly reusing already
        opened readers taken from ``reuse``. Any leaf reader of ``reuse``
        that is not recycled is closed before this method returns.
        """

        from whoosh.reading import SegmentReader, MultiReader, EmptyReader

        leftover = {}
        try:
            if len(segments) == 0:
                # This index has no segments! Return an EmptyReader object,
                # which simply returns empty or zero to every method
                return EmptyReader(schema)

            if reuse:
                # Collect all atomic readers of the old reader in a
                # dictionary so they can be recycled where possible.
                # NOTE(review): the dictionary is keyed by each reader's
                # generation(), yet the lookup below uses segment_id(); these
                # look like different key spaces, so recycling may never
                # happen -- confirm against the reader API.
                leaves = [leaf for leaf, _ in reuse.leaf_readers()]
                leftover = dict((leaf.generation(), leaf) for leaf in leaves)

            def open_segment(segment):
                # Hands out a recycled reader when one is registered under
                # the segment's ID (removing it from "leftover" so it is not
                # closed below); otherwise opens a new SegmentReader.
                segid = segment.segment_id()
                if segid in leftover:
                    return leftover.pop(segid)
                return SegmentReader(storage, schema, segment,
                                     generation=generation)

            if len(segments) == 1:
                # A single segment needs no compositing
                return open_segment(segments[0])

            # Multiple segments: one reader per segment, composited with a
            # MultiReader
            subreaders = [open_segment(segment) for segment in segments]
            return MultiReader(subreaders, generation=generation)
        finally:
            # Whatever is still in the dictionary was not recycled
            for unused in leftover.values():
                unused.close()

    def reader(self, reuse=None):
        """Returns a reader for the latest generation of the index.

        An ``IOError`` while reading the TOC or opening the segments is
        presumed to be a "file not found" caused by a writer deleting a file
        just as it was being opened, so the operation is attempted up to ten
        times (sleeping 0.05 seconds between attempts) before the error is
        allowed to propagate.

        :param reuse: an existing reader whose resources may be recycled.
        """

        for attempts_left in range(9, -1, -1):
            try:
                # Read the information from the TOC file
                info = self._read_toc()
                return self._reader(self.storage, info.schema, info.segments,
                                    info.generation, reuse=reuse)
            except IOError:
                if attempts_left == 0:
                    raise
                sleep(0.05)
- # TOC class
class TOC(object):
    """Object representing the state of the index after a commit. Essentially
    a container for the index's schema and the list of segment objects.
    """

    def __init__(self, schema, segments, generation,
                 version=_CURRENT_TOC_VERSION, release=__version__):
        """
        :param schema: the index's schema.
        :param segments: the list of segment objects.
        :param generation: the generation number of this TOC.
        :param version: the TOC format version number.
        :param release: the release version of the code that wrote the TOC.
        """

        self.schema = schema
        self.segments = segments
        self.generation = generation
        self.version = version
        self.release = release

    @classmethod
    def _filename(cls, indexname, gen):
        # Name of the TOC file for the given index name and generation.
        return "_%s_%s.toc" % (indexname, gen)

    @classmethod
    def _pattern(cls, indexname):
        # Regex matching this index's TOC filenames; group 1 is the
        # generation number. The index name is escaped and the dot before
        # "toc" is literal, so only real TOC filenames match.
        return re.compile("^_%s_([0-9]+)[.]toc$"
                          % re.escape("%s" % indexname))

    @classmethod
    def _segment_pattern(cls, indexname):
        # Regex matching this index's segment filenames; group 1 is the
        # segment name (index name, underscore, segment suffix). The index
        # name is escaped so it is always matched literally.
        return re.compile("(%s_[0-9a-z]+)[.][A-Za-z0-9_.]+"
                          % re.escape("%s" % indexname))

    @classmethod
    def _latest_generation(cls, storage, indexname):
        """Returns the highest generation number found among the TOC files
        in ``storage``, or -1 if there is no TOC file for the index.
        """

        pattern = cls._pattern(indexname)
        mx = -1
        for filename in storage:
            m = pattern.match(filename)
            if m:
                mx = max(int(m.group(1)), mx)
        return mx

    @classmethod
    def create(cls, storage, schema, indexname=_DEF_INDEX_NAME):
        """Deletes every file in ``storage`` whose name starts with
        ``_<indexname>_`` and writes a generation 0 TOC with an empty list of
        segments.
        """

        schema = ensure_schema(schema)

        # Clear existing files. Iterate over a snapshot of the names so that
        # deleting never disturbs the iteration.
        prefix = "_%s_" % indexname
        for filename in list(storage):
            if filename.startswith(prefix):
                storage.delete_file(filename)

        # Write a TOC file with an empty list of segments
        toc = cls(schema, [], 0)
        toc.write(storage, indexname)

    @classmethod
    def read(cls, storage, indexname, gen=None, schema=None):
        """Reads a TOC file from ``storage`` and returns a TOC object.

        :param storage: the storage object to read from.
        :param indexname: the name of the index.
        :param gen: the generation to read; if None, the latest generation
            found in the storage is used.
        :param schema: if given (and the file is in the current format), the
            schema saved in the file is skipped and this one is used instead.
        :raises EmptyIndexError: if ``gen`` is None and the storage contains
            no TOC file for the index.
        :raises IndexError: if the saved type sizes or byte order do not
            match this machine.
        :raises IndexVersionError: if the file's format version is not the
            current one and no legacy loader is registered for it.
        """

        if gen is None:
            gen = cls._latest_generation(storage, indexname)
            if gen < 0:
                raise EmptyIndexError("Index %r does not exist in %r"
                                      % (indexname, storage))

        # Read the content of this index from the .toc file.
        tocfilename = cls._filename(indexname, gen)
        stream = storage.open_file(tocfilename)

        # Every exit path, including the validation errors raised below,
        # must release the open file.
        try:
            def check_size(name, target):
                sz = stream.read_varint()
                if sz != target:
                    raise IndexError("Index was created on different architecture:"
                                     " saved %s = %s, this computer = %s"
                                     % (name, sz, target))

            check_size("int", _INT_SIZE)
            check_size("long", _LONG_SIZE)
            check_size("float", _FLOAT_SIZE)

            if not stream.read_int() == -12345:
                raise IndexError("Number misread: byte order problem")

            version = stream.read_int()
            release = (stream.read_varint(), stream.read_varint(),
                       stream.read_varint())

            if version != _CURRENT_TOC_VERSION:
                if version in toc_loaders:
                    loader = toc_loaders[version]
                    schema, segments = loader(stream, gen, schema, version)
                else:
                    raise IndexVersionError("Can't read format %s" % version,
                                            version)
            else:
                # If the user supplied a schema object with the constructor,
                # don't load the pickled schema from the saved index.
                if schema:
                    stream.skip_string()
                else:
                    # NOTE(review): unpickling executes code embedded in the
                    # data; only open index files from trusted sources.
                    schema = pickle.loads(stream.read_string())
                schema = ensure_schema(schema)

                # Generation
                index_gen = stream.read_int()
                assert gen == index_gen

                stream.read_int()  # Unused
                segments = stream.read_pickle()
        finally:
            stream.close()

        return cls(schema, segments, gen, version=version, release=release)

    def write(self, storage, indexname):
        """Writes this TOC into ``storage`` under the filename for its
        generation.

        The data is written to a temporary file which is renamed to the
        final name once it has been written and closed. If writing fails,
        the stream is closed and the rename is not attempted.

        :param storage: the storage object to write to.
        :param indexname: the name of the index.
        :raises pickle.PicklingError: if the schema cannot be pickled; when
            a single offending field can be identified, the message names it.
        """

        schema = ensure_schema(self.schema)
        schema.clean()

        # Use a temporary file for atomic write.
        tocfilename = self._filename(indexname, self.generation)
        tempfilename = '%s.%s' % (tocfilename, time())
        stream = storage.create_file(tempfilename)

        # Close the stream even when serializing fails, so a failed commit
        # does not leave an open handle on the temporary file.
        try:
            stream.write_varint(_INT_SIZE)
            stream.write_varint(_LONG_SIZE)
            stream.write_varint(_FLOAT_SIZE)
            stream.write_int(-12345)

            stream.write_int(_CURRENT_TOC_VERSION)
            for num in __version__[:3]:
                stream.write_varint(num)

            try:
                stream.write_string(pickle.dumps(schema, 2))
            except pickle.PicklingError:
                # Try to narrow down the error to a single field
                for fieldname, field in schema.items():
                    try:
                        pickle.dumps(field)
                    except pickle.PicklingError:
                        e = sys.exc_info()[1]
                        raise pickle.PicklingError("%s %s=%r" % (e, fieldname, field))
                    except TypeError:
                        e = sys.exc_info()[1]
                        raise TypeError("%s %s=%r" % (e, fieldname, field))
                # Otherwise, re-raise the original exception
                raise

            stream.write_int(self.generation)
            stream.write_int(0)  # Unused
            stream.write_pickle(self.segments)
        finally:
            stream.close()

        # Rename temporary file to the proper filename
        storage.rename_file(tempfilename, tocfilename, safe=True)
|