index.py 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710
  1. # Copyright 2007 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. """Contains the main functions/classes for creating, maintaining, and using
  28. an index.
  29. """
  30. from __future__ import division
  31. import os.path, re, sys
  32. from time import time, sleep
  33. from whoosh import __version__
  34. from whoosh.legacy import toc_loaders
  35. from whoosh.compat import pickle, string_type
  36. from whoosh.fields import ensure_schema
  37. from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE
  # Index name used by the helpers in this module when the caller does not
  # supply one.
  _DEF_INDEX_NAME = "MAIN"
  # Format version number that TOC.write() stores in every TOC file and that
  # TOC.read() compares against to decide whether a legacy loader is needed.
  _CURRENT_TOC_VERSION = -111
  40. # Exceptions
  class LockError(Exception):
      """Exception for lock-related failures."""
  class IndexError(Exception):
      """Base class for the index errors defined in this module.

      Note that inside this module the name shadows the builtin
      ``IndexError``.
      """
  class IndexVersionError(IndexError):
      """Signals that an index uses a format the running version of Whoosh
      cannot read, i.e. the index is neither backward nor forward compatible
      with this code.

      The format number is available as ``version`` and the release that
      produced the index, when supplied, as ``release``.
      """

      def __init__(self, msg, version, release=None):
          # Only the message becomes part of the exception args; the version
          # details are exposed as plain attributes.
          self.release = release
          self.version = version
          Exception.__init__(self, msg)
  class OutOfDateError(IndexError):
      """Signals an attempt to commit changes to an index object that is no
      longer the latest generation.
      """
  class EmptyIndexError(IndexError):
      """Signals an attempt to work with an index that has no indexed terms.

      Within this module it is raised by ``TOC.read()`` when no TOC file can
      be found for the requested index name.
      """
  62. # Convenience functions
  def create_in(dirname, schema, indexname=None):
      """Create a new index inside a directory on disk. A FileStorage object
      for the directory is built on your behalf.

      :param dirname: path string of the directory that will hold the index.
      :param schema: a :class:`whoosh.fields.Schema` object describing the
          index's fields.
      :param indexname: name of the index to create; only needed when several
          indexes live in the same storage object. Any false value selects
          the default index name.
      :returns: :class:`Index`
      """

      from whoosh.filedb.filestore import FileStorage

      name = indexname or _DEF_INDEX_NAME
      return FileIndex.create(FileStorage(dirname), schema, name)
  def open_dir(dirname, indexname=None, readonly=False, schema=None):
      """Open the index stored in a directory on disk. A FileStorage object
      for the directory is built on your behalf.

      :param dirname: path string of the directory containing the index.
      :param indexname: name of the index to open; only needed when several
          indexes live in the same storage object. ``None`` selects the
          default index name.
      :param readonly: passed through to the FileStorage constructor.
      :param schema: passed through to the FileIndex constructor.
      :returns: :class:`FileIndex`
      """

      from whoosh.filedb.filestore import FileStorage

      name = _DEF_INDEX_NAME if indexname is None else indexname
      store = FileStorage(dirname, readonly=readonly)
      return FileIndex(store, schema=schema, indexname=name)
  def exists_in(dirname, indexname=None):
      """Tell whether a directory holds a Whoosh index.

      :param dirname: the file path of a directory.
      :param indexname: the name of the index. If None, the default index
          name is used.
      :returns: True when the index can be opened and reports a generation
          greater than -1, otherwise False.
      """

      # A path that does not exist cannot contain an index
      if not os.path.exists(dirname):
          return False

      try:
          opened = open_dir(dirname, indexname=indexname)
          return opened.latest_generation() > -1
      except EmptyIndexError:
          return False
  def exists(storage, indexname=None):
      """Deprecated; call ``storage.index_exists()`` directly instead.

      :param storage: a store.Storage object.
      :param indexname: the name of the index, handed to
          ``storage.index_exists()`` unchanged (including None).
      :returns: whatever ``storage.index_exists(indexname)`` returns.
      """

      found = storage.index_exists(indexname)
      return found
  def version_in(dirname, indexname=None):
      """Report the versions recorded for the index in a directory, as a
      tuple of (release_version, format_version). release_version is the
      release number of the Whoosh code that created the index -- e.g.
      (0, 1, 24) -- and format_version is the number of the on-disk format
      -- e.g. -102.

      Avoid attaching significance to the format number: it is merely the
      version of the TOC file and probably should not have been part of a
      public interface. The best way to find out whether this version of
      Whoosh can open an index is to try opening it and see whether a
      ``whoosh.index.IndexVersionError`` is raised.

      The same values are available on an Index object as Index.release and
      Index.version.

      :param dirname: the file path of a directory containing an index.
      :param indexname: the name of the index. If None, the default index
          name is used.
      :returns: ((major_ver, minor_ver, build_ver), format_ver)
      """

      from whoosh.filedb.filestore import FileStorage

      return version(FileStorage(dirname), indexname=indexname)
  def version(storage, indexname=None):
      """Report the versions recorded for an index in a storage object, as a
      tuple of (release_version, format_version). release_version is the
      release number of the Whoosh code that created the index -- e.g.
      (0, 1, 24) -- and format_version is the number of the on-disk format
      -- e.g. -102.

      Avoid attaching significance to the format number: it is merely the
      version of the TOC file and probably should not have been part of a
      public interface. The best way to find out whether this version of
      Whoosh can open an index is to try opening it and see whether a
      ``whoosh.index.IndexVersionError`` is raised.

      The same values are available on an Index object as Index.release and
      Index.version.

      :param storage: a store.Storage object.
      :param indexname: the name of the index. If None, the default index
          name is used.
      :returns: ((major_ver, minor_ver, build_ver), format_ver); the release
          part is None when the index format could not be read.
      """

      name = _DEF_INDEX_NAME if indexname is None else indexname
      try:
          ix = storage.open_index(name)
          found = (ix.release, ix.version)
      except IndexVersionError:
          # The exception still carries the format number of the index
          err = sys.exc_info()[1]
          found = (None, err.version)
      return found
  161. # Index base class
  class Index(object):
      """Abstract interface to an indexed collection of documents.

      Backends are expected to supply ``reader()``, ``writer()`` and
      ``is_empty()``; the other methods are defaults built on top of those.
      """

      def close(self):
          """Releases resources owned by the Index object itself. Resources in
          use elsewhere, for example by a Searcher object, may stay open. The
          base implementation does nothing.
          """

          return None

      def add_field(self, fieldname, fieldspec):
          """Adds a field to the index's schema, using a writer that is
          committed right away.

          :param fieldname: the name of the field to add.
          :param fieldspec: an instantiated :class:`whoosh.fields.FieldType`
              object.
          """

          writer = self.writer()
          writer.add_field(fieldname, fieldspec)
          writer.commit()

      def remove_field(self, fieldname):
          """Removes the named field from the index's schema, using a writer
          that is committed right away. Whether the data already stored for
          the field disappears as well depends on the backend; optimizing the
          index should always clear it out.
          """

          writer = self.writer()
          writer.remove_field(fieldname)
          writer.commit()

      def latest_generation(self):
          """Returns the number of the newest generation of this index, or -1
          if the backend doesn't support versioning (as in this base class).
          """

          return -1

      def refresh(self):
          """Returns an Index object for the latest generation of this index.
          When this object already is the latest generation, or the backend
          doesn't support versioning, that is ``self``.

          :returns: :class:`Index`
          """

          return self

      def up_to_date(self):
          """Returns True if this object represents the latest generation of
          the index, and False if someone else has updated the index since
          this object was opened. The base implementation always answers
          True.
          """

          return True

      def last_modified(self):
          """Returns the time the index was last modified, or -1 if the
          backend doesn't support last-modified times (as in this base
          class).
          """

          return -1

      def is_empty(self):
          """Returns True if the index is empty, that is, no documents have
          ever been successfully written to it. Must be provided by the
          backend.
          """

          raise NotImplementedError

      def optimize(self):
          """Optimizes this index, if necessary. The base implementation does
          nothing.
          """

          return None

      def doc_count_all(self):
          """Returns the number of documents in this index, counting DELETED
          as well as UNDELETED ones.
          """

          reader = self.reader()
          try:
              total = reader.doc_count_all()
          finally:
              reader.close()
          return total

      def doc_count(self):
          """Returns the number of UNDELETED documents in this index.
          """

          reader = self.reader()
          try:
              total = reader.doc_count()
          finally:
              reader.close()
          return total

      def searcher(self, **kwargs):
          """Returns a Searcher object over a fresh reader for this index.
          Keyword arguments go to the Searcher constructor.

          :rtype: :class:`whoosh.searching.Searcher`
          """

          from whoosh.searching import Searcher

          reader = self.reader()
          return Searcher(reader, fromindex=self, **kwargs)

      def field_length(self, fieldname):
          """Returns the total length of the field across all documents.
          """

          reader = self.reader()
          try:
              length = reader.field_length(fieldname)
          finally:
              reader.close()
          return length

      def max_field_length(self, fieldname):
          """Returns the maximum length of the field across all documents.
          """

          reader = self.reader()
          try:
              length = reader.max_field_length(fieldname)
          finally:
              reader.close()
          return length

      def reader(self, reuse=None):
          """Returns an IndexReader object for this index. Must be provided by
          the backend.

          :param reuse: an existing reader. Some implementations may recycle
              resources from it to build the new reader. Any resources of the
              "recycled" reader that the new reader does not take over will
              be CLOSED, so you CANNOT use the old reader afterward.
          :rtype: :class:`whoosh.reading.IndexReader`
          """

          raise NotImplementedError

      def writer(self, **kwargs):
          """Returns an IndexWriter object for this index. Must be provided by
          the backend.

          :rtype: :class:`whoosh.writing.IndexWriter`
          """

          raise NotImplementedError

      def delete_by_term(self, fieldname, text, searcher=None):
          """Forwards to ``delete_by_term`` on a new writer and commits it
          right away.
          """

          writer = self.writer()
          writer.delete_by_term(fieldname, text, searcher=searcher)
          writer.commit()

      def delete_by_query(self, q, searcher=None):
          """Forwards to ``delete_by_query`` on a new writer and commits it
          right away.
          """

          writer = self.writer()
          writer.delete_by_query(q, searcher=searcher)
          writer.commit()
  284. # Codec-based index implementation
  def clean_files(storage, indexname, gen, segments):
      # Best-effort removal of index files that are no longer in use (called
      # when a new generation is created): TOC files of any generation other
      # than ``gen`` and segment files whose segment is not in ``segments``.
      # Files that existing Index and/or reader objects still hold open may
      # survive this pass (i.e. on Windows) but will probably be removed by a
      # later call.
      live_ids = set(seg.segment_id() for seg in segments)
      toc_re = TOC._pattern(indexname)
      seg_re = TOC._segment_pattern(indexname)

      def is_stale(fname):
          # A file is judged as a TOC file first, then as a segment file;
          # anything matching neither pattern is left alone.
          toc_match = toc_re.match(fname)
          if toc_match:
              return int(toc_match.group(1)) != gen
          seg_match = seg_re.match(fname)
          if seg_match:
              return seg_match.group(1) not in live_ids
          return False

      # Dot-files are never touched
      stale = set(fname for fname in storage
                  if not fname.startswith(".") and is_stale(fname))

      for fname in stale:
          try:
              storage.delete_file(fname)
          except OSError:
              # Another process still has this file open, I guess
              pass
  class FileIndex(Index):
      """Index implementation that keeps its files in a Storage object and
      records its state in generation-numbered TOC files. The TOC is re-read
      from storage each time the schema, segments, release or version are
      requested.
      """

      def __init__(self, storage, schema=None, indexname=_DEF_INDEX_NAME):
          """
          :param storage: the Storage object holding the index files.
          :param schema: optional schema; when given, it is passed to
              ``TOC.read()`` on every TOC read.
          :param indexname: the name of the index inside the storage.
          :raises ValueError: if ``storage`` is not a Storage object or
              ``indexname`` is not a string.
          """

          from whoosh.filedb.filestore import Storage

          # The offending value is wrapped in a 1-tuple so that formatting the
          # message cannot itself fail when the value happens to be a tuple
          if not isinstance(storage, Storage):
              raise ValueError("%r is not a Storage object" % (storage,))
          if not isinstance(indexname, string_type):
              raise ValueError("indexname %r is not a string" % (indexname,))

          if schema:
              schema = ensure_schema(schema)

          self.storage = storage
          self._schema = schema
          self.indexname = indexname

          # Try reading the TOC to see if it's possible
          TOC.read(self.storage, self.indexname, schema=self._schema)

      @classmethod
      def create(cls, storage, schema, indexname=_DEF_INDEX_NAME):
          """Writes a fresh TOC for ``indexname`` into ``storage`` (see
          ``TOC.create()``) and returns an index object opened on it.
          """

          TOC.create(storage, schema, indexname)
          return cls(storage, schema, indexname)

      def __repr__(self):
          return "%s(%r, %r)" % (self.__class__.__name__,
                                 self.storage, self.indexname)

      def close(self):
          """Does nothing; this object holds no open resources of its own."""

          pass

      # add_field
      # remove_field

      def latest_generation(self):
          """Returns the highest generation number among the TOC files found
          in the storage, or -1 if there are none.
          """

          return TOC._latest_generation(self.storage, self.indexname)

      # refresh
      # up_to_date

      def last_modified(self):
          """Returns the storage's modification time for the TOC file of the
          latest generation.
          """

          gen = self.latest_generation()
          filename = TOC._filename(self.indexname, gen)
          return self.storage.file_modified(filename)

      def is_empty(self):
          """Returns True if the current TOC lists no segments."""

          return len(self._read_toc().segments) == 0

      def optimize(self, **kwargs):
          """Opens a writer (keyword arguments go to ``writer()``) and commits
          it with ``optimize=True``.
          """

          w = self.writer(**kwargs)
          w.commit(optimize=True)

      # searcher

      def writer(self, procs=1, **kwargs):
          """Returns a writer for this index.

          :param procs: when greater than 1, a
              :class:`whoosh.multiproc.MpWriter` using that many processes is
              returned; otherwise a :class:`whoosh.writing.SegmentWriter`.
          """

          if procs > 1:
              from whoosh.multiproc import MpWriter
              return MpWriter(self, procs=procs, **kwargs)
          else:
              from whoosh.writing import SegmentWriter
              return SegmentWriter(self, **kwargs)

      def lock(self, name):
          """Returns a lock object that you can try to call acquire() on to
          lock the index.
          """

          return self.storage.lock(self.indexname + "_" + name)

      def _read_toc(self):
          # Loads the latest TOC from storage
          return TOC.read(self.storage, self.indexname, schema=self._schema)

      def _segments(self):
          return self._read_toc().segments

      def _current_schema(self):
          return self._read_toc().schema

      @property
      def schema(self):
          return self._current_schema()

      @property
      def release(self):
          return self._read_toc().release

      @property
      def version(self):
          return self._read_toc().version

      @classmethod
      def _reader(cls, storage, schema, segments, generation, reuse=None):
          # Returns a reader for the given segments, possibly reusing already
          # opened readers

          from whoosh.reading import SegmentReader, MultiReader, EmptyReader

          reusable = {}
          try:
              if len(segments) == 0:
                  # This index has no segments! Return an EmptyReader object,
                  # which simply returns empty or zero to every method
                  return EmptyReader(schema)

              if reuse:
                  # Put all atomic readers in a dictionary so we can re-use
                  # them if possible.
                  # NOTE(review): the dictionary is keyed by r.generation()
                  # but looked up by segment.segment_id() below -- confirm
                  # the two are comparable; if not, nothing is ever reused
                  # and every leaf reader of ``reuse`` is closed on exit.
                  readers = [r for r, _ in reuse.leaf_readers()]
                  reusable = dict((r.generation(), r) for r in readers)

              # Make a function to open readers, which reuses reusable
              # readers. It removes any readers it reuses from the "reusable"
              # dictionary, so later we can close any readers left in the
              # dictionary.
              def segreader(segment):
                  segid = segment.segment_id()
                  if segid in reusable:
                      r = reusable[segid]
                      del reusable[segid]
                      return r
                  else:
                      return SegmentReader(storage, schema, segment,
                                           generation=generation)

              if len(segments) == 1:
                  # This index has one segment, so return a SegmentReader
                  # object for the segment
                  return segreader(segments[0])
              else:
                  # This index has multiple segments, so create a list of
                  # SegmentReaders for the segments, then composite them with
                  # a MultiReader
                  readers = [segreader(segment) for segment in segments]
                  return MultiReader(readers, generation=generation)
          finally:
              # Whatever was not handed over to the new reader gets closed
              for r in reusable.values():
                  r.close()

      def reader(self, reuse=None):
          """Returns a reader for the segments listed in the current TOC.

          An IOError while reading is presumed to mean a writer deleted a file
          just as it was being opened, so the read is attempted up to 10
          times, 0.05 seconds apart, before the error is let through.

          :param reuse: an existing reader whose leaf readers may be recycled;
              those that are not taken over are closed.
          """

          retries = 10
          while True:
              # Read the information from the TOC file
              try:
                  info = self._read_toc()
                  return self._reader(self.storage, info.schema, info.segments,
                                      info.generation, reuse=reuse)
              except IOError:
                  retries -= 1
                  if retries <= 0:
                      # Bare raise keeps the original traceback intact
                      raise
                  sleep(0.05)
  437. # TOC class
  class TOC(object):
      """Object representing the state of the index after a commit.
      Essentially a container for the index's schema and the list of segment
      objects.

      As written by ``write()``, a TOC file holds, in order: three varints
      with the int, long and float sizes of the writing machine, the int
      -12345 as a byte-order check, the TOC format version, three varints
      with the release number, the pickled schema as a string, the
      generation number, an unused int, and the pickled segment list.
      """

      def __init__(self, schema, segments, generation,
                   version=_CURRENT_TOC_VERSION, release=__version__):
          self.schema = schema
          self.segments = segments
          self.generation = generation
          self.version = version
          self.release = release

      @classmethod
      def _filename(cls, indexname, gen):
          # Name of the TOC file for the given index name and generation
          return "_%s_%s.toc" % (indexname, gen)

      @classmethod
      def _pattern(cls, indexname):
          # Matches TOC file names; group 1 is the generation number. The
          # index name is escaped and the dot is literal, so only names that
          # _filename() can produce are matched.
          return re.compile(r"^_%s_([0-9]+)[.]toc$" % re.escape(indexname))

      @classmethod
      def _segment_pattern(cls, indexname):
          # Matches segment file names; group 1 is the segment id
          return re.compile(r"(%s_[0-9a-z]+)[.][A-Za-z0-9_.]+"
                            % re.escape(indexname))

      @classmethod
      def _latest_generation(cls, storage, indexname):
          # Highest generation number among the TOC files in the storage, or
          # -1 when there is no TOC file at all
          pattern = cls._pattern(indexname)

          mx = -1
          for filename in storage:
              m = pattern.match(filename)
              if m:
                  mx = max(int(m.group(1)), mx)
          return mx

      @classmethod
      def create(cls, storage, schema, indexname=_DEF_INDEX_NAME):
          """Deletes every file in ``storage`` whose name starts with
          ``_<indexname>_`` and writes a generation-0 TOC with no segments.
          """

          schema = ensure_schema(schema)

          # Clear existing files. The listing is snapshotted first so that
          # deleting cannot disturb the iteration.
          prefix = "_%s_" % indexname
          for filename in list(storage):
              if filename.startswith(prefix):
                  storage.delete_file(filename)

          # Write a TOC file with an empty list of segments
          toc = cls(schema, [], 0)
          toc.write(storage, indexname)

      @classmethod
      def read(cls, storage, indexname, gen=None, schema=None):
          """Loads a TOC from storage.

          :param storage: the Storage object to read from.
          :param indexname: the name of the index.
          :param gen: the generation to load; None means the latest one found
              in the storage.
          :param schema: when given, the schema saved in the TOC is skipped
              and this one is used instead.
          :raises EmptyIndexError: if ``gen`` is None and the storage has no
              TOC file for the index.
          :raises IndexError: if the file was written with different number
              sizes or byte order.
          :raises IndexVersionError: if the TOC format is neither the current
              one nor handled by a legacy loader.
          """

          if gen is None:
              gen = cls._latest_generation(storage, indexname)
              if gen < 0:
                  raise EmptyIndexError("Index %r does not exist in %r"
                                        % (indexname, storage))

          # Read the content of this index from the .toc file.
          tocfilename = cls._filename(indexname, gen)
          stream = storage.open_file(tocfilename)
          # The stream is closed on every path, including the error ones
          try:
              def check_size(name, target):
                  sz = stream.read_varint()
                  if sz != target:
                      raise IndexError("Index was created on different architecture:"
                                       " saved %s = %s, this computer = %s"
                                       % (name, sz, target))

              check_size("int", _INT_SIZE)
              check_size("long", _LONG_SIZE)
              check_size("float", _FLOAT_SIZE)

              if not stream.read_int() == -12345:
                  raise IndexError("Number misread: byte order problem")

              version = stream.read_int()
              release = (stream.read_varint(), stream.read_varint(),
                         stream.read_varint())

              if version != _CURRENT_TOC_VERSION:
                  if version in toc_loaders:
                      loader = toc_loaders[version]
                      schema, segments = loader(stream, gen, schema, version)
                  else:
                      raise IndexVersionError("Can't read format %s" % version,
                                              version)
              else:
                  # If the user supplied a schema object with the
                  # constructor, don't load the pickled schema from the saved
                  # index.
                  if schema:
                      stream.skip_string()
                  else:
                      # NOTE(security): unpickling runs code chosen by
                      # whoever wrote the file; only open indexes from
                      # trusted sources.
                      schema = pickle.loads(stream.read_string())
                  schema = ensure_schema(schema)

                  # Generation
                  index_gen = stream.read_int()
                  assert gen == index_gen

                  _ = stream.read_int()  # Unused
                  segments = stream.read_pickle()
          finally:
              stream.close()

          return cls(schema, segments, gen, version=version, release=release)

      def write(self, storage, indexname):
          """Saves this TOC under the name for its generation. The data goes
          to a temporary file first, which is renamed over the real name once
          it has been written completely.

          :raises pickle.PicklingError, TypeError: if the schema cannot be
              pickled; where possible the message names the offending field.
          """

          schema = ensure_schema(self.schema)
          schema.clean()

          # Use a temporary file for atomic write.
          tocfilename = self._filename(indexname, self.generation)
          tempfilename = '%s.%s' % (tocfilename, time())
          stream = storage.create_file(tempfilename)

          # The stream is closed even when writing fails part-way
          try:
              stream.write_varint(_INT_SIZE)
              stream.write_varint(_LONG_SIZE)
              stream.write_varint(_FLOAT_SIZE)
              stream.write_int(-12345)

              stream.write_int(_CURRENT_TOC_VERSION)
              for num in __version__[:3]:
                  stream.write_varint(num)

              try:
                  stream.write_string(pickle.dumps(schema, 2))
              except pickle.PicklingError:
                  # Try to narrow down the error to a single field, pickling
                  # each one with the same protocol that just failed
                  for fieldname, field in schema.items():
                      try:
                          pickle.dumps(field, 2)
                      except pickle.PicklingError:
                          e = sys.exc_info()[1]
                          raise pickle.PicklingError("%s %s=%r"
                                                     % (e, fieldname, field))
                      except TypeError:
                          e = sys.exc_info()[1]
                          raise TypeError("%s %s=%r" % (e, fieldname, field))
                  # Otherwise, re-raise the original exception
                  raise

              stream.write_int(self.generation)
              stream.write_int(0)  # Unused
              stream.write_pickle(self.segments)
          finally:
              stream.close()

          # Rename temporary file to the proper filename
          storage.rename_file(tempfilename, tocfilename, safe=True)