memory.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. # Copyright 2012 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. from __future__ import with_statement
  28. from bisect import bisect_left
  29. from threading import Lock, RLock
  30. from whoosh.compat import xrange
  31. from whoosh.codec import base
  32. from whoosh.matching import ListMatcher
  33. from whoosh.reading import SegmentReader, TermInfo, TermNotFound
  34. from whoosh.writing import SegmentWriter
  35. class MemWriter(SegmentWriter):
  36. def commit(self):
  37. self._finalize_segment()
  38. class MemoryCodec(base.Codec):
  39. def __init__(self):
  40. from whoosh.filedb.filestore import RamStorage
  41. self.storage = RamStorage()
  42. self.segment = MemSegment(self, "blah")
  43. def writer(self, schema):
  44. ix = self.storage.create_index(schema)
  45. return MemWriter(ix, _lk=False, codec=self,
  46. docbase=self.segment._doccount)
  47. def reader(self, schema):
  48. return SegmentReader(self.storage, schema, self.segment, codec=self)
  49. def per_document_writer(self, storage, segment):
  50. return MemPerDocWriter(self.storage, self.segment)
  51. def field_writer(self, storage, segment):
  52. return MemFieldWriter(self.storage, self.segment)
  53. def per_document_reader(self, storage, segment):
  54. return MemPerDocReader(self.storage, self.segment)
  55. def terms_reader(self, storage, segment):
  56. return MemTermsReader(self.storage, self.segment)
  57. def new_segment(self, storage, indexname):
  58. return self.segment
  59. class MemPerDocWriter(base.PerDocWriterWithColumns):
  60. def __init__(self, storage, segment):
  61. self._storage = storage
  62. self._segment = segment
  63. self.is_closed = False
  64. self._colwriters = {}
  65. self._doccount = 0
  66. def _has_column(self, fieldname):
  67. return fieldname in self._colwriters
  68. def _create_column(self, fieldname, column):
  69. colfile = self._storage.create_file("%s.c" % fieldname)
  70. self._colwriters[fieldname] = (colfile, column.writer(colfile))
  71. def _get_column(self, fieldname):
  72. return self._colwriters[fieldname][1]
  73. def start_doc(self, docnum):
  74. self._doccount += 1
  75. self._docnum = docnum
  76. self._stored = {}
  77. self._lengths = {}
  78. self._vectors = {}
  79. def add_field(self, fieldname, fieldobj, value, length):
  80. if value is not None:
  81. self._stored[fieldname] = value
  82. if length is not None:
  83. self._lengths[fieldname] = length
  84. def add_vector_items(self, fieldname, fieldobj, items):
  85. self._vectors[fieldname] = tuple(items)
  86. def finish_doc(self):
  87. with self._segment._lock:
  88. docnum = self._docnum
  89. self._segment._stored[docnum] = self._stored
  90. self._segment._lengths[docnum] = self._lengths
  91. self._segment._vectors[docnum] = self._vectors
  92. def close(self):
  93. colwriters = self._colwriters
  94. for fieldname in colwriters:
  95. colfile, colwriter = colwriters[fieldname]
  96. colwriter.finish(self._doccount)
  97. colfile.close()
  98. self.is_closed = True
  99. class MemPerDocReader(base.PerDocumentReader):
  100. def __init__(self, storage, segment):
  101. self._storage = storage
  102. self._segment = segment
  103. def doc_count(self):
  104. return self._segment.doc_count()
  105. def doc_count_all(self):
  106. return self._segment.doc_count_all()
  107. def has_deletions(self):
  108. return self._segment.has_deletions()
  109. def is_deleted(self, docnum):
  110. return self._segment.is_deleted(docnum)
  111. def deleted_docs(self):
  112. return self._segment.deleted_docs()
  113. def supports_columns(self):
  114. return True
  115. def has_column(self, fieldname):
  116. filename = "%s.c" % fieldname
  117. return self._storage.file_exists(filename)
  118. def column_reader(self, fieldname, column):
  119. filename = "%s.c" % fieldname
  120. colfile = self._storage.open_file(filename)
  121. length = self._storage.file_length(filename)
  122. return column.reader(colfile, 0, length, self._segment.doc_count_all())
  123. def doc_field_length(self, docnum, fieldname, default=0):
  124. return self._segment._lengths[docnum].get(fieldname, default)
  125. def field_length(self, fieldname):
  126. return sum(lens.get(fieldname, 0) for lens
  127. in self._segment._lengths.values())
  128. def min_field_length(self, fieldname):
  129. return min(lens[fieldname] for lens in self._segment._lengths.values()
  130. if fieldname in lens)
  131. def max_field_length(self, fieldname):
  132. return max(lens[fieldname] for lens in self._segment._lengths.values()
  133. if fieldname in lens)
  134. def has_vector(self, docnum, fieldname):
  135. return (docnum in self._segment._vectors
  136. and fieldname in self._segment._vectors[docnum])
  137. def vector(self, docnum, fieldname, format_):
  138. items = self._segment._vectors[docnum][fieldname]
  139. ids, weights, values = zip(*items)
  140. return ListMatcher(ids, weights, values, format_)
  141. def stored_fields(self, docnum):
  142. return self._segment._stored[docnum]
  143. def close(self):
  144. pass
  145. class MemFieldWriter(base.FieldWriter):
  146. def __init__(self, storage, segment):
  147. self._storage = storage
  148. self._segment = segment
  149. self._fieldname = None
  150. self._btext = None
  151. self.is_closed = False
  152. def start_field(self, fieldname, fieldobj):
  153. if self._fieldname is not None:
  154. raise Exception("Called start_field in a field")
  155. with self._segment._lock:
  156. invindex = self._segment._invindex
  157. if fieldname not in invindex:
  158. invindex[fieldname] = {}
  159. self._fieldname = fieldname
  160. self._fieldobj = fieldobj
  161. def start_term(self, btext):
  162. if self._btext is not None:
  163. raise Exception("Called start_term in a term")
  164. fieldname = self._fieldname
  165. fielddict = self._segment._invindex[fieldname]
  166. terminfos = self._segment._terminfos
  167. with self._segment._lock:
  168. if btext not in fielddict:
  169. fielddict[btext] = []
  170. if (fieldname, btext) not in terminfos:
  171. terminfos[fieldname, btext] = TermInfo()
  172. self._postings = fielddict[btext]
  173. self._terminfo = terminfos[fieldname, btext]
  174. self._btext = btext
  175. def add(self, docnum, weight, vbytes, length):
  176. self._postings.append((docnum, weight, vbytes))
  177. self._terminfo.add_posting(docnum, weight, length)
  178. def finish_term(self):
  179. if self._btext is None:
  180. raise Exception("Called finish_term outside a term")
  181. self._postings = None
  182. self._btext = None
  183. self._terminfo = None
  184. def finish_field(self):
  185. if self._fieldname is None:
  186. raise Exception("Called finish_field outside a field")
  187. self._fieldname = None
  188. self._fieldobj = None
  189. def close(self):
  190. self.is_closed = True
  191. class MemTermsReader(base.TermsReader):
  192. def __init__(self, storage, segment):
  193. self._storage = storage
  194. self._segment = segment
  195. self._invindex = segment._invindex
  196. def __contains__(self, term):
  197. return term in self._segment._terminfos
  198. def terms(self):
  199. for fieldname in self._invindex:
  200. for btext in self._invindex[fieldname]:
  201. yield (fieldname, btext)
  202. def terms_from(self, fieldname, prefix):
  203. if fieldname not in self._invindex:
  204. raise TermNotFound("Unknown field %r" % (fieldname,))
  205. terms = sorted(self._invindex[fieldname])
  206. if not terms:
  207. return
  208. start = bisect_left(terms, prefix)
  209. for i in xrange(start, len(terms)):
  210. yield (fieldname, terms[i])
  211. def term_info(self, fieldname, text):
  212. return self._segment._terminfos[fieldname, text]
  213. def matcher(self, fieldname, btext, format_, scorer=None):
  214. items = self._invindex[fieldname][btext]
  215. ids, weights, values = zip(*items)
  216. return ListMatcher(ids, weights, values, format_, scorer=scorer)
  217. def indexed_field_names(self):
  218. return self._invindex.keys()
  219. def close(self):
  220. pass
  221. class MemSegment(base.Segment):
  222. def __init__(self, codec, indexname):
  223. base.Segment.__init__(self, indexname)
  224. self._codec = codec
  225. self._doccount = 0
  226. self._stored = {}
  227. self._lengths = {}
  228. self._vectors = {}
  229. self._invindex = {}
  230. self._terminfos = {}
  231. self._lock = Lock()
  232. def codec(self):
  233. return self._codec
  234. def set_doc_count(self, doccount):
  235. self._doccount = doccount
  236. def doc_count(self):
  237. return len(self._stored)
  238. def doc_count_all(self):
  239. return self._doccount
  240. def delete_document(self, docnum, delete=True):
  241. if not delete:
  242. raise Exception("MemoryCodec can't undelete")
  243. with self._lock:
  244. del self._stored[docnum]
  245. del self._lengths[docnum]
  246. del self._vectors[docnum]
  247. def has_deletions(self):
  248. with self._lock:
  249. return self._doccount - len(self._stored)
  250. def is_deleted(self, docnum):
  251. return docnum not in self._stored
  252. def deleted_docs(self):
  253. stored = self._stored
  254. for docnum in xrange(self.doc_count_all()):
  255. if docnum not in stored:
  256. yield docnum
  257. def should_assemble(self):
  258. return False