filestore.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662
  1. # Copyright 2009 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. from __future__ import with_statement
  28. import errno, os, sys, tempfile
  29. from threading import Lock
  30. from whoosh.compat import BytesIO, memoryview_
  31. from whoosh.filedb.structfile import BufferFile, StructFile
  32. from whoosh.index import _DEF_INDEX_NAME, EmptyIndexError
  33. from whoosh.util import random_name
  34. from whoosh.util.filelock import FileLock
  35. # Exceptions
  class StorageError(Exception):
      """Base class for exceptions raised by storage objects."""
  class ReadOnlyError(StorageError):
      """Raised when a modifying operation is attempted on a storage object
      whose ``readonly`` flag is set.
      """
  40. # Base class
  class Storage(object):
      """Abstract base class for storage objects.

      A storage object is a virtual flat filesystem, allowing the creation and
      retrieval of file-like objects
      (:class:`~whoosh.filedb.structfile.StructFile` objects). The default
      implementation (:class:`FileStorage`) uses actual files in a directory.

      All access to files in Whoosh goes through this object. This allows
      different forms of storage (for example, in RAM, in a database, in a
      single file) to be used transparently.

      For example, to create a :class:`FileStorage` object::

          # Create a storage object
          st = FileStorage("indexdir")
          # Create the directory if it doesn't already exist
          st.create()

      The :meth:`Storage.create` method makes it slightly easier to swap
      storage implementations. The ``create()`` method handles set-up of the
      storage object. For example, ``FileStorage.create()`` creates the
      directory. A database implementation might create tables. This is
      designed to let you avoid putting implementation-specific setup code in
      your application.

      A storage object can also be used as a context manager: entering the
      ``with`` block calls :meth:`create` and leaving it calls :meth:`close`.
      """

      # Subclasses (or instances) set this to True to refuse modifications.
      readonly = False
      # Whether files opened from this storage may be memory-mapped.
      supports_mmap = False

      def __iter__(self):
          """Iterates over the file names returned by :meth:`list`."""

          return iter(self.list())

      def __enter__(self):
          """Calls :meth:`create` and returns this storage object."""

          self.create()
          return self

      def __exit__(self, exc_type, exc_val, exc_tb):
          """Calls :meth:`close`. Exceptions from the ``with`` block are not
          suppressed.
          """

          self.close()

      def create(self):
          """Creates any required implementation-specific resources. For
          example, a filesystem-based implementation might create a directory,
          while a database implementation might create tables. For example::

              from whoosh.filedb.filestore import FileStorage
              # Create a storage object
              st = FileStorage("indexdir")
              # Create any necessary resources
              st.create()

          This method returns ``self`` so you can also say::

              st = FileStorage("indexdir").create()

          Storage implementations should be written so that calling create() a
          second time on the same storage is harmless (for example,
          ``FileStorage.create()`` ignores the error raised when the directory
          already exists).

          :return: a :class:`Storage` instance.
          """

          return self

      def destroy(self, *args, **kwargs):
          """Removes any implementation-specific resources related to this
          storage object. For example, a filesystem-based implementation might
          delete a directory, and a database implementation might drop tables.

          The arguments are implementation-specific.
          """

          pass

      def create_index(self, schema, indexname=_DEF_INDEX_NAME,
                       indexclass=None):
          """Creates a new index in this storage.

          >>> from whoosh import fields
          >>> from whoosh.filedb.filestore import FileStorage
          >>> schema = fields.Schema(content=fields.TEXT)
          >>> # Create the storage directory
          >>> st = FileStorage("indexdir").create()
          >>> # Create an index in the storage
          >>> ix = st.create_index(schema)

          :param schema: the :class:`whoosh.fields.Schema` object to use for
              the new index.
          :param indexname: the name of the index within the storage object.
              You can use this option to store multiple indexes in the same
              storage.
          :param indexclass: an optional custom ``Index`` sub-class to use to
              create the index files. The default is
              :class:`whoosh.index.FileIndex`. This method will call the
              ``create`` class method on the given class to create the index.
          :raises ReadOnlyError: if this storage is read-only.
          :return: a :class:`whoosh.index.Index` instance.
          """

          if self.readonly:
              raise ReadOnlyError
          if indexclass is None:
              # Imported lazily inside the method rather than at module level
              import whoosh.index
              indexclass = whoosh.index.FileIndex
          return indexclass.create(self, schema, indexname)

      def open_index(self, indexname=_DEF_INDEX_NAME, schema=None,
                     indexclass=None):
          """Opens an existing index (created using :meth:`create_index`) in
          this storage.

          >>> from whoosh.filedb.filestore import FileStorage
          >>> st = FileStorage("indexdir")
          >>> # Open an index in the storage
          >>> ix = st.open_index()

          :param indexname: the name of the index within the storage object.
              You can use this option to store multiple indexes in the same
              storage.
          :param schema: if you pass in a :class:`whoosh.fields.Schema` object
              using this argument, it will override the schema that was stored
              with the index.
          :param indexclass: an optional custom ``Index`` sub-class to use to
              open the index files. The default is
              :class:`whoosh.index.FileIndex`. This method will instantiate
              the class with this storage object.
          :return: a :class:`whoosh.index.Index` instance.
          """

          if indexclass is None:
              import whoosh.index
              indexclass = whoosh.index.FileIndex
          return indexclass(self, schema=schema, indexname=indexname)

      def index_exists(self, indexname=None):
          """Returns True if a non-empty index exists in this storage.

          :param indexname: the name of the index within the storage object.
              You can use this option to store multiple indexes in the same
              storage. If None, the default index name is used.
          :rtype: bool
          """

          if indexname is None:
              indexname = _DEF_INDEX_NAME
          try:
              ix = self.open_index(indexname)
              try:
                  gen = ix.latest_generation()
              finally:
                  # Always release the index, even if asking for the latest
                  # generation fails; otherwise the opened index is leaked.
                  ix.close()
              return gen > -1
          except EmptyIndexError:
              return False

      def create_file(self, name):
          """Creates a file with the given name in this storage.

          :param name: the name for the new file.
          :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
          """

          raise NotImplementedError

      def open_file(self, name, *args, **kwargs):
          """Opens a file with the given name in this storage.

          :param name: the name of the file to open.
          :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
          """

          raise NotImplementedError

      def list(self):
          """Returns a list of file names in this storage.

          :return: a list of strings
          """

          raise NotImplementedError

      def file_exists(self, name):
          """Returns True if the given file exists in this storage.

          :param name: the name to check.
          :rtype: bool
          """

          raise NotImplementedError

      def file_modified(self, name):
          """Returns the last-modified time of the given file in this storage
          (as a "ctime" UNIX timestamp).

          :param name: the name to check.
          :return: a "ctime" number.
          """

          raise NotImplementedError

      def file_length(self, name):
          """Returns the size (in bytes) of the given file in this storage.

          :param name: the name to check.
          :rtype: int
          """

          raise NotImplementedError

      def delete_file(self, name):
          """Removes the given file from this storage.

          :param name: the name to delete.
          """

          raise NotImplementedError

      def rename_file(self, frm, to, safe=False):
          """Renames a file in this storage.

          :param frm: The current name of the file.
          :param to: The new name for the file.
          :param safe: if True, raise an exception if a file with the new name
              already exists.
          """

          raise NotImplementedError

      def lock(self, name):
          """Return a named lock object (implementing ``.acquire()`` and
          ``.release()`` methods). Different storage implementations may use
          different lock types with different guarantees. For example, the
          RamStorage object uses Python thread locks, while the FileStorage
          object uses filesystem-based locks that are valid across different
          processes.

          :param name: a name for the lock.
          :return: a lock-like object.
          """

          raise NotImplementedError

      def close(self):
          """Closes any resources opened by this storage object. For some
          storage implementations this will be a no-op, but for others it is
          necessary to release locks and/or prevent leaks, so it's a good idea
          to call it when you're done with a storage object.
          """

          pass

      def optimize(self):
          """Optimizes the storage object. The meaning and cost of "optimizing"
          will vary by implementation. For example, a database implementation
          might run a garbage collection procedure on the underlying database.
          """

          pass

      def temp_storage(self, name=None):
          """Creates a new storage object for temporary files. You can call
          :meth:`Storage.destroy` on the new storage when you're finished with
          it.

          :param name: a name for the new storage. This may be optional or
              required depending on the storage implementation.
          :rtype: :class:`Storage`
          """

          raise NotImplementedError
  class OverlayStorage(Storage):
      """Overlays two storage objects. Reads are processed from the first if it
      has the named file, otherwise the second. Writes always go to the second.
      """

      def __init__(self, a, b):
          """
          :param a: the storage consulted first for reads.
          :param b: the fallback storage for reads; also the target of file
              creation, deletion, locking and temporary storage.
          """

          self.a = a
          self.b = b

      def create_index(self, *args, **kwargs):
          """Creates an index in the second storage and returns it.

          The arguments are passed through to ``b.create_index()``.
          """

          # Return the new index; without the return callers always got None
          return self.b.create_index(*args, **kwargs)

      def open_index(self, *args, **kwargs):
          """Opens an index from the first storage and returns it.

          The arguments are passed through to ``a.open_index()``.
          """

          # Return the opened index; without the return callers always got
          # None
          return self.a.open_index(*args, **kwargs)

      def create_file(self, *args, **kwargs):
          """Creates a file in the second storage and returns it."""

          return self.b.create_file(*args, **kwargs)

      def open_file(self, name, *args, **kwargs):
          """Opens the named file from the first storage if it exists there,
          otherwise from the second storage.
          """

          if self.a.file_exists(name):
              return self.a.open_file(name, *args, **kwargs)
          else:
              return self.b.open_file(name, *args, **kwargs)

      def list(self):
          """Returns the union of the file names in both storages, without
          duplicates. The order of the names is not defined.
          """

          return list(set(self.a.list()) | set(self.b.list()))

      def file_exists(self, name):
          """Returns True if either storage has the named file."""

          return self.a.file_exists(name) or self.b.file_exists(name)

      def file_modified(self, name):
          """Returns the modification time of the named file, taken from the
          first storage if it has the file, otherwise from the second.
          """

          if self.a.file_exists(name):
              return self.a.file_modified(name)
          else:
              return self.b.file_modified(name)

      def file_length(self, name):
          """Returns the length of the named file, taken from the first
          storage if it has the file, otherwise from the second.
          """

          if self.a.file_exists(name):
              return self.a.file_length(name)
          else:
              return self.b.file_length(name)

      def delete_file(self, name):
          """Deletes the named file from the second storage only."""

          return self.b.delete_file(name)

      def rename_file(self, *args, **kwargs):
          """Renaming is not supported on an overlay.

          :raises NotImplementedError: always.
          """

          raise NotImplementedError

      def lock(self, name):
          """Returns a named lock obtained from the second storage."""

          return self.b.lock(name)

      def close(self):
          """Closes both underlying storages (first, then second)."""

          self.a.close()
          self.b.close()

      def optimize(self):
          """Optimizes both underlying storages (first, then second)."""

          self.a.optimize()
          self.b.optimize()

      def temp_storage(self, name=None):
          """Returns a temporary storage created by the second storage."""

          return self.b.temp_storage(name=name)
  class FileStorage(Storage):
      """Storage object that stores the index as files in a directory on disk.

      Prior to version 3, the initializer would raise an IOError if the
      directory did not exist. As of version 3, the object does not check if
      the directory exists at initialization. This change is to support using
      the :meth:`FileStorage.create` method.
      """

      supports_mmap = True

      def __init__(self, path, supports_mmap=True, readonly=False,
                   debug=False):
          """
          :param path: a path to a directory.
          :param supports_mmap: if True (the default), use the ``mmap`` module
              to open memory mapped files. You can open the storage object
              with ``supports_mmap=False`` to force Whoosh to open files
              normally instead of with ``mmap``.
          :param readonly: If ``True``, the object will raise
              :class:`ReadOnlyError` if you attempt to create, delete, or
              rename a file.
          :param debug: stored on the object as ``_debug``; no method of this
              class reads it.
          """

          self.folder = path
          self.supports_mmap = supports_mmap
          self.readonly = readonly
          self._debug = debug
          self.locks = {}

      def __repr__(self):
          return "%s(%r)" % (self.__class__.__name__, self.folder)

      def create(self):
          """Creates this storage object's directory path using ``os.makedirs``
          if it doesn't already exist.

          >>> from whoosh.filedb.filestore import FileStorage
          >>> st = FileStorage("indexdir")
          >>> st.create()

          This method returns ``self``, so you can also say::

              st = FileStorage("indexdir").create()

          Note that you can simply handle the creation of the directory
          yourself and open the storage object using the initializer::

              dirname = "indexdir"
              os.mkdir(dirname)
              st = FileStorage(dirname)

          However, using the ``create()`` method allows you to potentially swap
          in other storage implementations more easily.

          :raises IOError: (with ``errno.ENOTDIR``) if the path exists but is
              not a directory.
          :return: a :class:`Storage` instance.
          """

          dirpath = os.path.abspath(self.folder)
          # If the given directory does not already exist, try to create it
          try:
              os.makedirs(dirpath)
          except OSError:
              # This is necessary for compatibility between Py2 and Py3
              e = sys.exc_info()[1]
              # If we get an error because the path already exists, ignore it
              if e.errno != errno.EEXIST:
                  raise

          # Raise an exception if the given path is not a directory
          if not os.path.isdir(dirpath):
              e = IOError("%r is not a directory" % dirpath)
              e.errno = errno.ENOTDIR
              raise e
          return self

      def destroy(self):
          """Removes any files in this storage object and then removes the
          storage object's directory. What happens if any of the files or the
          directory are in use depends on the underlying platform.

          A directory that no longer exists is not treated as an error.

          :raises ReadOnlyError: if this storage is read-only.
          """

          # Remove all files
          self.clean()
          try:
              # Try to remove the directory
              os.rmdir(self.folder)
          except OSError:
              # os.rmdir() raises OSError. On Python 2 that is not an IOError,
              # so catching IOError here let ENOENT escape.
              e = sys.exc_info()[1]
              if e.errno != errno.ENOENT:
                  raise

      def create_file(self, name, excl=False, mode="wb", **kwargs):
          """Creates a file with the given name in this storage.

          :param name: the name for the new file.
          :param excl: if True, try to open the file in "exclusive" mode, so
              the call fails if the file already exists.
          :param mode: the mode flags with which to open the file. The default
              is ``"wb"``.
          :param kwargs: additional keyword arguments are passed through to
              the :class:`~whoosh.filedb.structfile.StructFile` initializer.
          :raises ReadOnlyError: if this storage is read-only.
          :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
          """

          if self.readonly:
              raise ReadOnlyError

          path = self._fpath(name)
          if excl:
              flags = os.O_CREAT | os.O_EXCL | os.O_RDWR
              # O_BINARY only exists on some platforms (e.g. Windows)
              if hasattr(os, "O_BINARY"):
                  flags |= os.O_BINARY
              fd = os.open(path, flags)
              fileobj = os.fdopen(fd, mode)
          else:
              fileobj = open(path, mode)

          f = StructFile(fileobj, name=name, **kwargs)
          return f

      def open_file(self, name, **kwargs):
          """Opens an existing file in this storage.

          :param name: the name of the file to open.
          :param kwargs: additional keyword arguments are passed through to
              the :class:`~whoosh.filedb.structfile.StructFile` initializer.
          :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
          """

          f = StructFile(open(self._fpath(name), "rb"), name=name, **kwargs)
          return f

      def _fpath(self, fname):
          """Returns the absolute path of the named file inside this
          storage's directory.
          """

          return os.path.abspath(os.path.join(self.folder, fname))

      def clean(self, ignore=False):
          """Removes every file listed in this storage's directory.

          :param ignore: if True, errors raised while removing individual
              files are silently ignored.
          :raises ReadOnlyError: if this storage is read-only.
          """

          if self.readonly:
              raise ReadOnlyError

          path = self.folder
          files = self.list()
          for fname in files:
              try:
                  os.remove(os.path.join(path, fname))
              except OSError:
                  if not ignore:
                      raise

      def list(self):
          """Returns the names of the entries in this storage's directory, or
          an empty list if the directory cannot be listed (for example because
          it does not exist).
          """

          try:
              files = os.listdir(self.folder)
          except OSError:
              # os.listdir() raises OSError. On Python 2 that is not an
              # IOError, so catching IOError did not cover a missing folder.
              files = []

          return files

      def file_exists(self, name):
          return os.path.exists(self._fpath(name))

      def file_modified(self, name):
          return os.path.getmtime(self._fpath(name))

      def file_length(self, name):
          return os.path.getsize(self._fpath(name))

      def delete_file(self, name):
          """Removes the named file from the directory.

          :raises ReadOnlyError: if this storage is read-only.
          """

          if self.readonly:
              raise ReadOnlyError

          os.remove(self._fpath(name))

      def rename_file(self, oldname, newname, safe=False):
          """Renames a file in this storage.

          :param oldname: the current name of the file.
          :param newname: the new name for the file.
          :param safe: if True, raise ``NameError`` if a file with the new
              name already exists; otherwise the existing file is removed
              before renaming.
          :raises ReadOnlyError: if this storage is read-only.
          """

          if self.readonly:
              raise ReadOnlyError

          if os.path.exists(self._fpath(newname)):
              if safe:
                  raise NameError("File %r exists" % newname)
              else:
                  os.remove(self._fpath(newname))
          os.rename(self._fpath(oldname), self._fpath(newname))

      def lock(self, name):
          """Returns a :class:`FileLock` on the named file in this storage's
          directory.
          """

          return FileLock(self._fpath(name))

      def temp_storage(self, name=None):
          """Creates and returns a :class:`FileStorage` for a sub-directory of
          this storage's directory.

          :param name: the name of the sub-directory. If not given, a random
              name ending in ``.tmp`` is used.
          """

          name = name or "%s.tmp" % random_name()
          path = os.path.join(self.folder, name)
          tempstore = FileStorage(path)
          return tempstore.create()
  class RamStorage(Storage):
      """Storage object that keeps the index in memory.

      File contents are held in the ``files`` dictionary, keyed by file name,
      and named locks are held in the ``locks`` dictionary.
      """

      supports_mmap = False

      def __init__(self):
          self.files = {}
          self.locks = {}
          self.folder = ''

      def destroy(self):
          """Drops the ``files`` and ``locks`` attributes of this object."""

          del self.files, self.locks

      def list(self):
          """Returns a list of the names of the stored files."""

          return list(self.files)

      def clean(self):
          """Forgets all stored files."""

          self.files = {}

      def total_size(self):
          """Returns the sum of the lengths of all stored files."""

          total = 0
          for fname in self.list():
              total += self.file_length(fname)
          return total

      def file_exists(self, name):
          return name in self.files

      def file_length(self, name):
          """Returns the length of the named file's content.

          :raises NameError: if no file with that name is stored.
          """

          if name in self.files:
              return len(self.files[name])
          raise NameError(name)

      def file_modified(self, name):
          """Always returns -1; no modification time is recorded."""

          return -1

      def delete_file(self, name):
          """Removes the named file.

          :raises NameError: if no file with that name is stored.
          """

          if name in self.files:
              del self.files[name]
          else:
              raise NameError(name)

      def rename_file(self, name, newname, safe=False):
          """Moves the content stored under ``name`` to ``newname``.

          :param safe: if True, raise ``NameError`` when ``newname`` is
              already in use.
          :raises NameError: if no file called ``name`` is stored.
          """

          stored = self.files
          if name not in stored:
              raise NameError(name)
          if safe and newname in stored:
              raise NameError("File %r exists" % newname)

          stored[newname] = stored.pop(name)

      def create_file(self, name, **kwargs):
          """Returns a ``StructFile`` wrapping a fresh ``BytesIO``. The bytes
          written to it are stored under ``name`` when the file is closed.

          Extra keyword arguments are accepted but not used.
          """

          def _store_on_close(sfile):
              # Capture whatever was written into the in-memory buffer
              self.files[name] = sfile.file.getvalue()

          return StructFile(BytesIO(), name=name, onclose=_store_on_close)

      def open_file(self, name, **kwargs):
          """Returns a ``BufferFile`` over the stored content of the named
          file.

          :raises NameError: if no file with that name is stored.
          """

          if name in self.files:
              view = memoryview_(self.files[name])
              return BufferFile(view, name=name, **kwargs)
          raise NameError(name)

      def lock(self, name):
          """Returns the thread lock registered under ``name``, creating it on
          first use.
          """

          found = self.locks.get(name)
          if found is None:
              found = self.locks[name] = Lock()
          return found

      def temp_storage(self, name=None):
          """Creates and returns a :class:`FileStorage` in a sub-directory of
          the system temporary directory.

          :param name: the name of the sub-directory. If not given, a random
              name ending in ``.tmp`` is used.
          """

          if not name:
              name = "%s.tmp" % random_name()
          target = os.path.join(tempfile.gettempdir(), name)
          return FileStorage(target).create()
  def copy_storage(sourcestore, deststore):
      """Copies the files from the source storage object to the destination
      storage object using ``shutil.copyfileobj``.

      :param sourcestore: the storage to read from; every name returned by
          its ``list()`` method is copied.
      :param deststore: the storage in which a file of the same name is
          created for each source file.
      """

      import shutil

      for fname in sourcestore.list():
          # Both files are closed on leaving their ``with`` block
          with sourcestore.open_file(fname) as infile:
              with deststore.create_file(fname) as outfile:
                  shutil.copyfileobj(infile, outfile)
  def copy_to_ram(storage):
      """Copies the given FileStorage object into a new RamStorage object.

      :param storage: the storage whose files are copied.
      :rtype: :class:`RamStorage`
      """

      ramstore = RamStorage()
      copy_storage(sourcestore=storage, deststore=ramstore)
      return ramstore