reading.py 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296
  1. # Copyright 2007 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. """This module contains classes that allow reading from an index.
  28. """
  29. from math import log
  30. from bisect import bisect_right
  31. from heapq import heapify, heapreplace, heappop, nlargest
  32. from whoosh import columns
  33. from whoosh.compat import abstractmethod
  34. from whoosh.compat import xrange, zip_, next, iteritems
  35. from whoosh.filedb.filestore import OverlayStorage
  36. from whoosh.matching import MultiMatcher
  37. from whoosh.support.levenshtein import distance
  38. from whoosh.system import emptybytes
  39. # Exceptions
  class ReaderClosed(Exception):
      """Signals that an operation was attempted on a closed searcher, or on
      a Results object derived from a searcher that has since been closed.
      """

      # Default human-readable description carried by the exception class.
      message = "Operation on a closed reader"
  class TermNotFound(Exception):
      """Signals that a requested field or term is not available."""
  47. # Term Info base class
  class TermInfo(object):
      """Holds a set of statistics about one term.

      Objects of this class are returned by :meth:`IndexReader.term_info`.
      The numbers may be useful for optimizations and scoring algorithms and
      are read through the accessor methods.
      """

      def __init__(self, weight=0, df=0, minlength=None,
                   maxlength=0, maxweight=0, minid=None, maxid=0):
          # Running totals
          self._weight, self._df = weight, df
          # Field length range; the minimum stays None until a length is seen
          self._minlength, self._maxlength = minlength, maxlength
          self._maxweight = maxweight
          # Document number range; the minimum stays None until a posting is
          # added
          self._minid, self._maxid = minid, maxid

      def add_posting(self, docnum, weight, length=None):
          """Folds a single posting into the statistics.

          :param docnum: the document number of the posting.
          :param weight: the weight of the term in that document.
          :param length: the length of the field in that document, or None
              to leave the length statistics untouched.
          """

          # Only the first posting sets the minimum ID, but every posting
          # overwrites the maximum (presumably postings arrive in ascending
          # document order)
          if self._minid is None:
              self._minid = docnum
          self._maxid = docnum

          self._df += 1
          self._weight += weight
          if weight > self._maxweight:
              self._maxweight = weight

          if length is None:
              return
          if self._minlength is None or length < self._minlength:
              self._minlength = length
          if length > self._maxlength:
              self._maxlength = length

      def weight(self):
          """Returns the total frequency of the term across all documents.
          """

          return self._weight

      def doc_frequency(self):
          """Returns the number of documents the term appears in.
          """

          return self._df

      def min_length(self):
          """Returns the length of the shortest field value the term appears
          in (None if no length has been recorded).
          """

          return self._minlength

      def max_length(self):
          """Returns the length of the longest field value the term appears
          in.
          """

          return self._maxlength

      def max_weight(self):
          """Returns the number of times the term appears in the document in
          which it appears the most.
          """

          return self._maxweight

      def min_id(self):
          """Returns the lowest document ID this term appears in (None if no
          posting has been recorded).
          """

          return self._minid

      def max_id(self):
          """Returns the highest document ID this term appears in.
          """

          return self._maxid
  106. # Reader base class
  class IndexReader(object):
      """Base class defining the interface for reading from an index.

      Do not instantiate this object directly. Instead use Index.reader().

      Methods decorated with ``abstractmethod`` raise ``NotImplementedError``
      here and are meant to be overridden. The other methods are generic
      defaults written in terms of them. Several of the defaults read
      ``self.schema``, which this base class does not assign itself.
      """

      def __enter__(self):
          return self

      def __exit__(self, *args):
          # Leaving a ``with`` block always closes the reader
          self.close()

      @abstractmethod
      def __contains__(self, term):
          """Returns True if the given term tuple (fieldname, text) is
          in this reader.
          """

          raise NotImplementedError

      def codec(self):
          """Returns the :class:`whoosh.codec.base.Codec` object used to read
          this reader's segment. If this reader is not atomic
          (``reader.is_atomic() == False``), returns None. This base
          implementation always returns None.
          """

          return None

      def segment(self):
          """Returns the :class:`whoosh.index.Segment` object used by this
          reader. If this reader is not atomic
          (``reader.is_atomic() == False``), returns None. This base
          implementation always returns None.
          """

          return None

      def storage(self):
          """Returns the :class:`whoosh.filedb.filestore.Storage` object used
          by this reader to read its files. If the reader is not atomic
          (``reader.is_atomic() == False``), returns None. This base
          implementation always returns None.
          """

          return None

      def is_atomic(self):
          """Returns True. Readers that are not atomic are expected to
          override this method.
          """

          return True

      def _text_to_bytes(self, fieldname, text):
          """Converts ``text`` to its bytes form using the field's
          ``to_bytes()`` method.

          :raises TermNotFound: if ``fieldname`` is not in the schema.
          """

          if fieldname not in self.schema:
              raise TermNotFound((fieldname, text))
          return self.schema[fieldname].to_bytes(text)

      def close(self):
          """Closes the open files associated with this reader. The base
          implementation does nothing.
          """

          pass

      def generation(self):
          """Returns the generation of the index being read. This base
          implementation returns None.
          """

          return None

      @abstractmethod
      def indexed_field_names(self):
          """Returns an iterable of strings representing the names of the
          indexed fields. This may include additional names not explicitly
          listed in the Schema if you use "glob" fields.
          """

          raise NotImplementedError

      @abstractmethod
      def all_terms(self):
          """Yields (fieldname, text) tuples for every term in the index.
          """

          raise NotImplementedError

      def terms_from(self, fieldname, prefix):
          """Yields (fieldname, text) tuples for every term in the index
          starting at the given prefix, i.e. every term that does not sort
          before ``(fieldname, prefix)``.
          """

          # The default implementation just scans the whole list of terms.
          # The comparison has to be made on the whole (fieldname, text) pair:
          # testing the two parts separately would also drop terms of later
          # fields whose text happens to sort before the prefix.
          start = (fieldname, prefix)
          for fname, text in self.all_terms():
              if (fname, text) < start:
                  continue
              yield (fname, text)

      @abstractmethod
      def term_info(self, fieldname, text):
          """Returns a :class:`TermInfo` object allowing access to various
          statistics about the given term.
          """

          raise NotImplementedError

      def expand_prefix(self, fieldname, prefix):
          """Yields terms in the given field that start with the given prefix.
          """

          prefix = self._text_to_bytes(fieldname, prefix)
          for fn, text in self.terms_from(fieldname, prefix):
              # Stop at the first term outside the field or the prefix
              if fn != fieldname or not text.startswith(prefix):
                  return
              yield text

      def lexicon(self, fieldname):
          """Yields all bytestrings in the given field.
          """

          for fn, btext in self.terms_from(fieldname, emptybytes):
              # Stop as soon as the terms of another field begin
              if fn != fieldname:
                  return
              yield btext

      def field_terms(self, fieldname):
          """Yields all term values (converted from on-disk bytes) in the
          given field.
          """

          from_bytes = self.schema[fieldname].from_bytes
          for btext in self.lexicon(fieldname):
              yield from_bytes(btext)

      def __iter__(self):
          """Yields ((fieldname, text), terminfo) tuples for each term in the
          reader, in lexical order.
          """

          term_info = self.term_info
          for term in self.all_terms():
              yield (term, term_info(*term))

      def iter_from(self, fieldname, text):
          """Yields ((fieldname, text), terminfo) tuples for all terms in the
          reader, starting at the given term.
          """

          term_info = self.term_info
          text = self._text_to_bytes(fieldname, text)
          for term in self.terms_from(fieldname, text):
              yield (term, term_info(*term))

      def iter_field(self, fieldname, prefix=''):
          """Yields (text, terminfo) tuples for all terms in the given field,
          starting at ``prefix``.
          """

          prefix = self._text_to_bytes(fieldname, prefix)
          for (fn, text), terminfo in self.iter_from(fieldname, prefix):
              if fn != fieldname:
                  return
              yield text, terminfo

      def iter_prefix(self, fieldname, prefix):
          """Yields (text, terminfo) tuples for all terms in the given field
          with a certain prefix.
          """

          prefix = self._text_to_bytes(fieldname, prefix)
          for text, terminfo in self.iter_field(fieldname, prefix):
              if not text.startswith(prefix):
                  return
              yield (text, terminfo)

      @abstractmethod
      def has_deletions(self):
          """Returns True if the underlying index/segment has deleted
          documents.
          """

          raise NotImplementedError

      def all_doc_ids(self):
          """Returns an iterator of all (undeleted) document IDs in the
          reader.
          """

          is_deleted = self.is_deleted
          return (docnum for docnum in xrange(self.doc_count_all())
                  if not is_deleted(docnum))

      def iter_docs(self):
          """Yields a series of ``(docnum, stored_fields_dict)``
          tuples for the undeleted documents in the reader.
          """

          for docnum in self.all_doc_ids():
              yield docnum, self.stored_fields(docnum)

      @abstractmethod
      def is_deleted(self, docnum):
          """Returns True if the given document number is marked deleted.
          """

          raise NotImplementedError

      @abstractmethod
      def stored_fields(self, docnum):
          """Returns the stored fields for the given document number.
          """

          raise NotImplementedError

      def all_stored_fields(self):
          """Yields the stored fields for all non-deleted documents.
          """

          is_deleted = self.is_deleted
          for docnum in xrange(self.doc_count_all()):
              if not is_deleted(docnum):
                  yield self.stored_fields(docnum)

      @abstractmethod
      def doc_count_all(self):
          """Returns the total number of documents, DELETED OR UNDELETED,
          in this reader.
          """

          raise NotImplementedError

      @abstractmethod
      def doc_count(self):
          """Returns the total number of UNDELETED documents in this reader.
          """

          # NOTE(review): this fallback calls ``self.deleted_count()``, which
          # this class does not define; subclasses are expected to override
          # this method or provide ``deleted_count()``.
          return self.doc_count_all() - self.deleted_count()

      @abstractmethod
      def frequency(self, fieldname, text):
          """Returns the total number of instances of the given term in the
          collection.
          """

          raise NotImplementedError

      @abstractmethod
      def doc_frequency(self, fieldname, text):
          """Returns how many documents the given term appears in.
          """

          raise NotImplementedError

      @abstractmethod
      def field_length(self, fieldname):
          """Returns the total number of terms in the given field. This is
          used by some scoring algorithms.
          """

          raise NotImplementedError

      @abstractmethod
      def min_field_length(self, fieldname):
          """Returns the minimum length of the field across all documents.
          This is used by some scoring algorithms.
          """

          raise NotImplementedError

      @abstractmethod
      def max_field_length(self, fieldname):
          """Returns the maximum length of the field across all documents.
          This is used by some scoring algorithms.
          """

          raise NotImplementedError

      @abstractmethod
      def doc_field_length(self, docnum, fieldname, default=0):
          """Returns the number of terms in the given field in the given
          document. This is used by some scoring algorithms.
          """

          raise NotImplementedError

      def first_id(self, fieldname, text):
          """Returns the first ID in the posting list for the given term. This
          may be optimized in certain backends.

          :raises TermNotFound: if the term's posting matcher is not active.
          """

          text = self._text_to_bytes(fieldname, text)
          p = self.postings(fieldname, text)
          if p.is_active():
              return p.id()
          raise TermNotFound((fieldname, text))

      def iter_postings(self):
          """Low-level method, yields all postings in the reader as
          ``(fieldname, text, docnum, weight, valuestring)`` tuples.
          """

          for fieldname, btext in self.all_terms():
              m = self.postings(fieldname, btext)
              while m.is_active():
                  yield (fieldname, btext, m.id(), m.weight(), m.value())
                  m.next()

      @abstractmethod
      def postings(self, fieldname, text):
          """Returns a :class:`~whoosh.matching.Matcher` for the postings of
          the given term.

          >>> pr = reader.postings("content", "render")
          >>> pr.skip_to(10)
          >>> pr.id()
          12

          :param fieldname: the field name or field number of the term.
          :param text: the text of the term.
          :rtype: :class:`whoosh.matching.Matcher`
          """

          raise NotImplementedError

      @abstractmethod
      def has_vector(self, docnum, fieldname):
          """Returns True if the given document has a term vector for the
          given field.
          """

          raise NotImplementedError

      @abstractmethod
      def vector(self, docnum, fieldname, format_=None):
          """Returns a :class:`~whoosh.matching.Matcher` object for the
          given term vector.

          >>> docnum = searcher.document_number(path=u'/a/b/c')
          >>> v = searcher.vector(docnum, "content")
          >>> v.all_as("frequency")
          [(u"apple", 3), (u"bear", 2), (u"cab", 2)]

          :param docnum: the document number of the document for which you
              want the term vector.
          :param fieldname: the field name or field number of the field for
              which you want the term vector.
          :rtype: :class:`whoosh.matching.Matcher`
          """

          raise NotImplementedError

      def vector_as(self, astype, docnum, fieldname):
          """Returns an iterator of (termtext, value) pairs for the terms in
          the given term vector. This is a convenient shortcut to calling
          vector() and using the Matcher object when all you want are the
          terms and/or values.

          >>> docnum = searcher.document_number(path=u'/a/b/c')
          >>> list(searcher.vector_as("frequency", docnum, "content"))
          [(u"apple", 3), (u"bear", 2), (u"cab", 2)]

          :param docnum: the document number of the document for which you
              want the term vector.
          :param fieldname: the field name or field number of the field for
              which you want the term vector.
          :param astype: a string containing the name of the format you want
              the term vector's data in, for example "weight". The value
              "weight" is read directly from the matcher; any other name is
              looked up with the field format's ``decoder()`` method.
          """

          vec = self.vector(docnum, fieldname)
          if astype == "weight":
              while vec.is_active():
                  yield (vec.id(), vec.weight())
                  vec.next()
          else:
              format_ = self.schema[fieldname].format
              decoder = format_.decoder(astype)
              while vec.is_active():
                  yield (vec.id(), decoder(vec.value()))
                  vec.next()

      def corrector(self, fieldname):
          """Returns a :class:`whoosh.spelling.Corrector` object that suggests
          corrections based on the terms in the given field.
          """

          # Imported here rather than at module level, as in the original
          from whoosh.spelling import ReaderCorrector

          fieldobj = self.schema[fieldname]
          return ReaderCorrector(self, fieldname, fieldobj)

      def terms_within(self, fieldname, text, maxdist, prefix=0):
          """
          Returns a generator of words in the given field within ``maxdist``
          Damerau-Levenshtein edit distance of the given text.

          Important: the terms are returned in **no particular order**. The
          only criterion is that they are within ``maxdist`` edits of
          ``text``. You may want to run this method multiple times with
          increasing ``maxdist`` values to ensure you get the closest matches
          first. You may also have additional information (such as term
          frequency or an acoustic matching algorithm) you can use to rank
          terms with the same edit distance.

          :param maxdist: the maximum edit distance.
          :param prefix: require suggestions to share a prefix of this length
              with the given word. This is often justifiable since most
              misspellings do not involve the first letter of the word.
              Using a prefix dramatically decreases the time it takes to
              generate the list of words.
          """

          fieldobj = self.schema[fieldname]
          # Only terms sharing the first ``prefix`` characters are examined
          for btext in self.expand_prefix(fieldname, text[:prefix]):
              word = fieldobj.from_bytes(btext)
              k = distance(word, text, limit=maxdist)
              if k <= maxdist:
                  yield word

      def most_frequent_terms(self, fieldname, number=5, prefix=''):
          """Returns the top 'number' most frequent terms in the given field
          as a list of (frequency, text) tuples.
          """

          gen = ((terminfo.weight(), text) for text, terminfo
                 in self.iter_prefix(fieldname, prefix))
          return nlargest(number, gen)

      def most_distinctive_terms(self, fieldname, number=5, prefix=''):
          """Returns the top 'number' terms with the highest `tf*idf` scores
          as a list of (score, text) tuples.
          """

          N = float(self.doc_count())
          gen = ((terminfo.weight() * log(N / terminfo.doc_frequency()), text)
                 for text, terminfo in self.iter_prefix(fieldname, prefix))
          return nlargest(number, gen)

      def leaf_readers(self):
          """Returns a list of (IndexReader, docbase) pairs for the child
          readers of this reader if it is a composite reader. If this is not
          a composite reader, it returns `[(self, 0)]`.
          """

          return [(self, 0)]

      def supports_caches(self):
          """Returns False in this base implementation."""

          return False

      def has_column(self, fieldname):
          """Returns False in this base implementation."""

          return False

      def column_reader(self, fieldname, column=None, reverse=False,
                        translate=False):
          """
          :param fieldname: the name of the field for which to get a reader.
          :param column: if passed, use this Column object instead of the one
              associated with the field in the Schema.
          :param reverse: if passed, reverses the order of keys returned by
              the reader's ``sort_key()`` method. If the column type is not
              reversible, this will raise a ``NotImplementedError``.
          :param translate: if True, wrap the reader to call the field's
              ``from_bytes()`` method on the returned values.
          :return: a :class:`whoosh.columns.ColumnReader` object.
          """

          raise NotImplementedError
  466. # Segment-based reader
  class SegmentReader(IndexReader):
      """Reader for a single segment. Term-level calls are delegated to the
      codec's terms reader and document-level calls to the codec's
      per-document reader. Most methods raise :class:`ReaderClosed` once the
      reader has been closed.
      """

      def __init__(self, storage, schema, segment, generation=None,
                   codec=None):
          """
          :param storage: the storage object to load the segment's files
              from.
          :param schema: the schema; terms and stored fields of fields that
              are not in it are filtered out of results.
          :param segment: the segment object to read.
          :param generation: the value to return from ``generation()``.
          :param codec: if given, used instead of ``segment.codec()``.
          """

          self.schema = schema
          self.is_closed = False

          self._segment = segment
          self._segid = self._segment.segment_id()
          self._gen = generation

          # self._storage is a storage object from which to load the segment
          # files. This is different from the general storage (which will be
          # used for caches) if the segment is in a compound file.
          if segment.is_compound():
              # Open the compound file as a storage object
              files = segment.open_compound_file(storage)
              # Use an overlay here instead of just the compound storage, in
              # rare circumstances a segment file may be added after the
              # segment is written
              self._storage = OverlayStorage(files, storage)
          else:
              self._storage = storage

          # Get subreaders from codec
          self._codec = codec if codec else segment.codec()
          self._terms = self._codec.terms_reader(self._storage, segment)
          self._perdoc = self._codec.per_document_reader(self._storage,
                                                         segment)

      def codec(self):
          """Returns the codec object used by this reader."""

          return self._codec

      def segment(self):
          """Returns the segment object this reader was created for."""

          return self._segment

      def storage(self):
          """Returns the storage object this reader loads its files from."""

          return self._storage

      def has_deletions(self):
          if self.is_closed:
              raise ReaderClosed
          return self._perdoc.has_deletions()

      def doc_count(self):
          if self.is_closed:
              raise ReaderClosed
          return self._perdoc.doc_count()

      def doc_count_all(self):
          if self.is_closed:
              raise ReaderClosed
          return self._perdoc.doc_count_all()

      def is_deleted(self, docnum):
          if self.is_closed:
              raise ReaderClosed
          return self._perdoc.is_deleted(docnum)

      def generation(self):
          """Returns the generation value given to the constructor."""

          return self._gen

      def __repr__(self):
          return "%s(%r, %r)" % (self.__class__.__name__, self._storage,
                                 self._segment)

      def __contains__(self, term):
          """Returns True if the (fieldname, text) term is in the segment.
          A field name that is not in the schema gives False rather than an
          error.
          """

          if self.is_closed:
              raise ReaderClosed
          fieldname, text = term
          if fieldname not in self.schema:
              return False
          text = self._text_to_bytes(fieldname, text)
          return (fieldname, text) in self._terms

      def close(self):
          """Closes the terms reader, the per-document reader and the storage
          object, then marks this reader as closed.

          Every resource is closed even if closing an earlier one raises; in
          that case the exception still propagates after the remaining
          resources have been released.

          :raises ReaderClosed: if the reader is already closed.
          """

          if self.is_closed:
              raise ReaderClosed("Reader already closed")

          # Nested try/finally so a failure in one close() call cannot leak
          # the remaining open resources
          try:
              self._terms.close()
          finally:
              try:
                  self._perdoc.close()
              finally:
                  try:
                      # It's possible some weird codec that doesn't use
                      # storage might have passed None instead of a storage
                      # object
                      if self._storage:
                          self._storage.close()
                  finally:
                      self.is_closed = True

      def stored_fields(self, docnum):
          """Returns a dictionary of the stored fields of the given document,
          restricted to fields that are still in the schema.
          """

          if self.is_closed:
              raise ReaderClosed
          assert docnum >= 0
          schema = self.schema
          sfs = self._perdoc.stored_fields(docnum)
          # Double-check with schema to filter out removed fields
          return dict(item for item in iteritems(sfs) if item[0] in schema)

      # Delegate doc methods to the per-doc reader

      def all_doc_ids(self):
          if self.is_closed:
              raise ReaderClosed
          return self._perdoc.all_doc_ids()

      def iter_docs(self):
          if self.is_closed:
              raise ReaderClosed
          return self._perdoc.iter_docs()

      def all_stored_fields(self):
          if self.is_closed:
              raise ReaderClosed
          return self._perdoc.all_stored_fields()

      def field_length(self, fieldname):
          if self.is_closed:
              raise ReaderClosed
          return self._perdoc.field_length(fieldname)

      def min_field_length(self, fieldname):
          if self.is_closed:
              raise ReaderClosed
          return self._perdoc.min_field_length(fieldname)

      def max_field_length(self, fieldname):
          if self.is_closed:
              raise ReaderClosed
          return self._perdoc.max_field_length(fieldname)

      def doc_field_length(self, docnum, fieldname, default=0):
          if self.is_closed:
              raise ReaderClosed
          return self._perdoc.doc_field_length(docnum, fieldname, default)

      def has_vector(self, docnum, fieldname):
          if self.is_closed:
              raise ReaderClosed
          return self._perdoc.has_vector(docnum, fieldname)

      #

      def _test_field(self, fieldname):
          """Checks that the reader is open and that ``fieldname`` names an
          indexed field in the schema.

          :raises ReaderClosed: if the reader has been closed.
          :raises TermNotFound: if the field is not in the schema or has no
              format.
          """

          if self.is_closed:
              raise ReaderClosed
          if fieldname not in self.schema:
              raise TermNotFound("No field %r" % fieldname)
          if self.schema[fieldname].format is None:
              raise TermNotFound("Field %r is not indexed" % fieldname)

      def indexed_field_names(self):
          return self._terms.indexed_field_names()

      def all_terms(self):
          if self.is_closed:
              raise ReaderClosed
          schema = self.schema
          # Terms of fields that are no longer in the schema are skipped
          return ((fieldname, text) for fieldname, text in self._terms.terms()
                  if fieldname in schema)

      def terms_from(self, fieldname, prefix):
          self._test_field(fieldname)
          prefix = self._text_to_bytes(fieldname, prefix)
          schema = self.schema
          return ((fname, text) for fname, text
                  in self._terms.terms_from(fieldname, prefix)
                  if fname in schema)

      def term_info(self, fieldname, text):
          """Returns the term info object for the given term.

          :raises TermNotFound: if the terms reader raises ``KeyError`` for
              the term.
          """

          self._test_field(fieldname)
          text = self._text_to_bytes(fieldname, text)
          try:
              return self._terms.term_info(fieldname, text)
          except KeyError:
              raise TermNotFound("%s:%r" % (fieldname, text))

      def expand_prefix(self, fieldname, prefix):
          self._test_field(fieldname)
          prefix = self._text_to_bytes(fieldname, prefix)
          return IndexReader.expand_prefix(self, fieldname, prefix)

      def lexicon(self, fieldname):
          self._test_field(fieldname)
          return IndexReader.lexicon(self, fieldname)

      def __iter__(self):
          if self.is_closed:
              raise ReaderClosed
          schema = self.schema
          return ((term, terminfo) for term, terminfo in self._terms.items()
                  if term[0] in schema)

      def iter_from(self, fieldname, text):
          # This is a generator, so the field check below only runs once
          # iteration starts
          self._test_field(fieldname)
          schema = self.schema
          text = self._text_to_bytes(fieldname, text)
          for term, terminfo in self._terms.items_from(fieldname, text):
              if term[0] not in schema:
                  continue
              yield (term, terminfo)

      def frequency(self, fieldname, text):
          """Returns the frequency reported by the terms reader, or 0 if it
          raises ``KeyError`` for the term.
          """

          self._test_field(fieldname)
          text = self._text_to_bytes(fieldname, text)
          try:
              return self._terms.frequency(fieldname, text)
          except KeyError:
              return 0

      def doc_frequency(self, fieldname, text):
          """Returns the document frequency reported by the terms reader, or
          0 if it raises ``KeyError`` for the term.
          """

          self._test_field(fieldname)
          text = self._text_to_bytes(fieldname, text)
          try:
              return self._terms.doc_frequency(fieldname, text)
          except KeyError:
              return 0

      def postings(self, fieldname, text, scorer=None):
          """Returns a matcher for the postings of the given term. If the
          segment has deleted documents, the matcher is wrapped so that they
          are excluded.

          :param scorer: passed through to the terms reader's ``matcher()``
              method.
          :raises TermNotFound: if the field is not in the schema.
          """

          from whoosh.matching.wrappers import FilterMatcher

          if self.is_closed:
              raise ReaderClosed
          if fieldname not in self.schema:
              raise TermNotFound("No field %r" % fieldname)
          text = self._text_to_bytes(fieldname, text)

          format_ = self.schema[fieldname].format
          matcher = self._terms.matcher(fieldname, text, format_,
                                        scorer=scorer)
          deleted = frozenset(self._perdoc.deleted_docs())
          if deleted:
              matcher = FilterMatcher(matcher, deleted, exclude=True)
          return matcher

      def vector(self, docnum, fieldname, format_=None):
          """Returns the per-document reader's vector for the given document
          and field, using ``format_`` if given and the field's ``vector``
          attribute otherwise.

          :raises TermNotFound: if the field is not in the schema.
          :raises Exception: if no vector format is available for the field.
          """

          if self.is_closed:
              raise ReaderClosed
          if fieldname not in self.schema:
              raise TermNotFound("No field %r" % fieldname)
          vformat = format_ or self.schema[fieldname].vector
          if not vformat:
              raise Exception("No vectors are stored for field %r"
                              % fieldname)
          return self._perdoc.vector(docnum, fieldname, vformat)

      def cursor(self, fieldname):
          """Returns the terms reader's cursor for the given field."""

          if self.is_closed:
              raise ReaderClosed
          fieldobj = self.schema[fieldname]
          return self._terms.cursor(fieldname, fieldobj)

      def terms_within(self, fieldname, text, maxdist, prefix=0):
          # Replaces the horribly inefficient base implementation with one
          # based on skipping through the word list efficiently using a DFA
          fieldobj = self.schema[fieldname]
          spellfield = fieldobj.spelling_fieldname(fieldname)
          auto = self._codec.automata(self._storage, self._segment)
          fieldcur = self.cursor(spellfield)
          return auto.terms_within(fieldcur, text, maxdist, prefix)

      # Column methods

      def has_column(self, fieldname):
          """Returns a true value if the field has a column type and the
          per-document reader has a column for it, and a false value
          otherwise.
          """

          if self.is_closed:
              raise ReaderClosed
          coltype = self.schema[fieldname].column_type
          return coltype and self._perdoc.has_column(fieldname)

      def column_reader(self, fieldname, column=None, reverse=False,
                        translate=True):
          """Returns a column reader for the given field.

          :param fieldname: the name of the field for which to get a reader.
          :param column: if passed, use this Column object instead of the
              field's ``column_type``.
          :param reverse: if true, ``set_reverse()`` is called on the column
              reader, or the reversed default value is used when the segment
              has no column for the field.
          :param translate: if True, wrap the reader so returned values pass
              through the field's ``from_column_value`` method.
          :raises Exception: if no column object is available for the field.
          """

          if self.is_closed:
              raise ReaderClosed

          fieldobj = self.schema[fieldname]
          column = column or fieldobj.column_type
          if not column:
              raise Exception("No column for field %r in %r"
                              % (fieldname, self))

          if self._perdoc.has_column(fieldname):
              creader = self._perdoc.column_reader(fieldname, column)
              if reverse:
                  creader.set_reverse()
          else:
              # This segment doesn't have a column file for this field, so
              # create a fake column reader that always returns the default
              # value.
              default = column.default_value(reverse)
              creader = columns.EmptyColumnReader(default,
                                                  self.doc_count_all())

          if translate:
              # Wrap the column in a Translator to give the caller
              # nice values instead of sortable representations
              fcv = fieldobj.from_column_value
              creader = columns.TranslatingColumnReader(creader, fcv)

          return creader
# Fake IndexReader class for empty indexes
  707. class EmptyReader(IndexReader):
  708. def __init__(self, schema):
  709. self.schema = schema
  710. def __contains__(self, term):
  711. return False
  712. def __iter__(self):
  713. return iter([])
  714. def cursor(self, fieldname):
  715. from whoosh.codec.base import EmptyCursor
  716. return EmptyCursor()
  717. def indexed_field_names(self):
  718. return []
  719. def all_terms(self):
  720. return iter([])
  721. def term_info(self, fieldname, text):
  722. raise TermNotFound((fieldname, text))
  723. def iter_from(self, fieldname, text):
  724. return iter([])
  725. def iter_field(self, fieldname, prefix=''):
  726. return iter([])
  727. def iter_prefix(self, fieldname, prefix=''):
  728. return iter([])
  729. def lexicon(self, fieldname):
  730. return iter([])
  731. def has_deletions(self):
  732. return False
  733. def is_deleted(self, docnum):
  734. return False
  735. def stored_fields(self, docnum):
  736. raise KeyError("No document number %s" % docnum)
  737. def all_stored_fields(self):
  738. return iter([])
  739. def doc_count_all(self):
  740. return 0
  741. def doc_count(self):
  742. return 0
  743. def frequency(self, fieldname, text):
  744. return 0
  745. def doc_frequency(self, fieldname, text):
  746. return 0
  747. def field_length(self, fieldname):
  748. return 0
  749. def min_field_length(self, fieldname):
  750. return 0
  751. def max_field_length(self, fieldname):
  752. return 0
  753. def doc_field_length(self, docnum, fieldname, default=0):
  754. return default
  755. def postings(self, fieldname, text, scorer=None):
  756. raise TermNotFound("%s:%r" % (fieldname, text))
  757. def has_vector(self, docnum, fieldname):
  758. return False
  759. def vector(self, docnum, fieldname, format_=None):
  760. raise KeyError("No document number %s" % docnum)
  761. def most_frequent_terms(self, fieldname, number=5, prefix=''):
  762. return iter([])
  763. def most_distinctive_terms(self, fieldname, number=5, prefix=None):
  764. return iter([])
# Multisegment reader class
  766. class MultiReader(IndexReader):
  767. """Do not instantiate this object directly. Instead use Index.reader().
  768. """
  769. def __init__(self, readers, generation=None):
  770. self.readers = readers
  771. self._gen = generation
  772. self.schema = None
  773. if readers:
  774. self.schema = readers[0].schema
  775. self.doc_offsets = []
  776. self.base = 0
  777. for r in self.readers:
  778. self.doc_offsets.append(self.base)
  779. self.base += r.doc_count_all()
  780. self.is_closed = False
  781. def _document_segment(self, docnum):
  782. return max(0, bisect_right(self.doc_offsets, docnum) - 1)
  783. def _segment_and_docnum(self, docnum):
  784. segmentnum = self._document_segment(docnum)
  785. offset = self.doc_offsets[segmentnum]
  786. return segmentnum, docnum - offset
  787. def cursor(self, fieldname):
  788. return MultiCursor([r.cursor(fieldname) for r in self.readers])
  789. def is_atomic(self):
  790. return False
  791. def leaf_readers(self):
  792. return zip_(self.readers, self.doc_offsets)
  793. def add_reader(self, reader):
  794. self.readers.append(reader)
  795. self.doc_offsets.append(self.base)
  796. self.base += reader.doc_count_all()
  797. def close(self):
  798. for d in self.readers:
  799. d.close()
  800. self.is_closed = True
  801. def generation(self):
  802. return self._gen
  803. def format(self, fieldname):
  804. for r in self.readers:
  805. fmt = r.format(fieldname)
  806. if fmt is not None:
  807. return fmt
  808. def vector_format(self, fieldname):
  809. for r in self.readers:
  810. vfmt = r.vector_format(fieldname)
  811. if vfmt is not None:
  812. return vfmt
  813. # Term methods
  814. def __contains__(self, term):
  815. return any(r.__contains__(term) for r in self.readers)
  816. def _merge_terms(self, iterlist):
  817. # Merge-sorts terms coming from a list of term iterators.
  818. # Create a map so we can look up each iterator by its id() value
  819. itermap = {}
  820. for it in iterlist:
  821. itermap[id(it)] = it
  822. # Fill in the list with the head term from each iterator.
  823. current = []
  824. for it in iterlist:
  825. try:
  826. term = next(it)
  827. except StopIteration:
  828. continue
  829. current.append((term, id(it)))
  830. # Number of active iterators
  831. active = len(current)
  832. # If only one iterator is active, just yield from it and return
  833. if active == 1:
  834. term, itid = current[0]
  835. it = itermap[itid]
  836. yield term
  837. for term in it:
  838. yield term
  839. return
  840. # Otherwise, do a streaming heap sort of the terms from the iterators
  841. heapify(current)
  842. while active:
  843. # Peek at the first term in the sorted list
  844. term = current[0][0]
  845. # Re-iterate on all items in the list that have that term
  846. while active and current[0][0] == term:
  847. it = itermap[current[0][1]]
  848. try:
  849. nextterm = next(it)
  850. heapreplace(current, (nextterm, id(it)))
  851. except StopIteration:
  852. heappop(current)
  853. active -= 1
  854. # Yield the term
  855. yield term
  856. def indexed_field_names(self):
  857. names = set()
  858. for r in self.readers:
  859. names.update(r.indexed_field_names())
  860. return iter(names)
  861. def all_terms(self):
  862. return self._merge_terms([r.all_terms() for r in self.readers])
  863. def terms_from(self, fieldname, prefix):
  864. return self._merge_terms([r.terms_from(fieldname, prefix)
  865. for r in self.readers])
  866. def term_info(self, fieldname, text):
  867. term = (fieldname, text)
  868. # Get the term infos for the sub-readers containing the term
  869. tis = [(r.term_info(fieldname, text), offset) for r, offset
  870. in zip_(self.readers, self.doc_offsets) if term in r]
  871. # If only one reader had the term, return its terminfo with the offset
  872. # added
  873. if not tis:
  874. raise TermNotFound(term)
  875. return combine_terminfos(tis)
  876. def frequency(self, fieldname, text):
  877. return sum(r.frequency(fieldname, text) for r in self.readers)
  878. def doc_frequency(self, fieldname, text):
  879. return sum(r.doc_frequency(fieldname, text) for r in self.readers)
  880. def postings(self, fieldname, text):
  881. # This method does not add a scorer; for that, use Searcher.postings()
  882. postreaders = []
  883. docoffsets = []
  884. term = (fieldname, text)
  885. for i, r in enumerate(self.readers):
  886. if term in r:
  887. offset = self.doc_offsets[i]
  888. pr = r.postings(fieldname, text)
  889. postreaders.append(pr)
  890. docoffsets.append(offset)
  891. if not postreaders:
  892. raise TermNotFound(fieldname, text)
  893. return MultiMatcher(postreaders, docoffsets)
  894. def first_id(self, fieldname, text):
  895. for i, r in enumerate(self.readers):
  896. try:
  897. id = r.first_id(fieldname, text)
  898. except (KeyError, TermNotFound):
  899. pass
  900. else:
  901. if id is None:
  902. raise TermNotFound((fieldname, text))
  903. else:
  904. return self.doc_offsets[i] + id
  905. raise TermNotFound((fieldname, text))
  906. # Deletion methods
  907. def has_deletions(self):
  908. return any(r.has_deletions() for r in self.readers)
  909. def is_deleted(self, docnum):
  910. segmentnum, segmentdoc = self._segment_and_docnum(docnum)
  911. return self.readers[segmentnum].is_deleted(segmentdoc)
  912. def stored_fields(self, docnum):
  913. segmentnum, segmentdoc = self._segment_and_docnum(docnum)
  914. return self.readers[segmentnum].stored_fields(segmentdoc)
  915. # Columns
  916. def has_column(self, fieldname):
  917. return any(r.has_column(fieldname) for r in self.readers)
  918. def column_reader(self, fieldname, column=None, reverse=False,
  919. translate=True):
  920. crs = []
  921. doc_offsets = []
  922. for i, r in enumerate(self.readers):
  923. if r.has_column(fieldname):
  924. cr = r.column_reader(fieldname, column=column, reverse=reverse,
  925. translate=translate)
  926. crs.append(cr)
  927. doc_offsets.append(self.doc_offsets[i])
  928. return columns.MultiColumnReader(crs, doc_offsets)
  929. # Per doc methods
  930. def all_stored_fields(self):
  931. for reader in self.readers:
  932. for result in reader.all_stored_fields():
  933. yield result
  934. def doc_count_all(self):
  935. return sum(dr.doc_count_all() for dr in self.readers)
  936. def doc_count(self):
  937. return sum(dr.doc_count() for dr in self.readers)
  938. def field_length(self, fieldname):
  939. return sum(dr.field_length(fieldname) for dr in self.readers)
  940. def min_field_length(self, fieldname):
  941. return min(r.min_field_length(fieldname) for r in self.readers)
  942. def max_field_length(self, fieldname):
  943. return max(r.max_field_length(fieldname) for r in self.readers)
  944. def doc_field_length(self, docnum, fieldname, default=0):
  945. segmentnum, segmentdoc = self._segment_and_docnum(docnum)
  946. reader = self.readers[segmentnum]
  947. return reader.doc_field_length(segmentdoc, fieldname, default=default)
  948. def has_vector(self, docnum, fieldname):
  949. segmentnum, segmentdoc = self._segment_and_docnum(docnum)
  950. return self.readers[segmentnum].has_vector(segmentdoc, fieldname)
  951. def vector(self, docnum, fieldname, format_=None):
  952. segmentnum, segmentdoc = self._segment_and_docnum(docnum)
  953. return self.readers[segmentnum].vector(segmentdoc, fieldname)
  954. def vector_as(self, astype, docnum, fieldname):
  955. segmentnum, segmentdoc = self._segment_and_docnum(docnum)
  956. return self.readers[segmentnum].vector_as(astype, segmentdoc,
  957. fieldname)
  958. def combine_terminfos(tis):
  959. if len(tis) == 1:
  960. ti, offset = tis[0]
  961. ti._minid += offset
  962. ti._maxid += offset
  963. return ti
  964. # Combine the various statistics
  965. w = sum(ti.weight() for ti, _ in tis)
  966. df = sum(ti.doc_frequency() for ti, _ in tis)
  967. ml = min(ti.min_length() for ti, _ in tis)
  968. xl = max(ti.max_length() for ti, _ in tis)
  969. xw = max(ti.max_weight() for ti, _ in tis)
  970. # For min and max ID, we need to add the doc offsets
  971. mid = min(ti.min_id() + offset for ti, offset in tis)
  972. xid = max(ti.max_id() + offset for ti, offset in tis)
  973. return TermInfo(w, df, ml, xl, xw, mid, xid)
  974. class MultiCursor(object):
  975. def __init__(self, cursors):
  976. self._cursors = [c for c in cursors if c.is_valid()]
  977. self._low = []
  978. self._text = None
  979. self.next()
  980. def _find_low(self):
  981. low = []
  982. lowterm = None
  983. for c in self._cursors:
  984. if c.is_valid():
  985. cterm = c.term()
  986. if low and cterm == lowterm:
  987. low.append(c)
  988. elif low and cterm < lowterm:
  989. low = [c]
  990. lowterm = cterm
  991. self._low = low
  992. self._text = lowterm
  993. return lowterm
  994. def first(self):
  995. for c in self._cursors:
  996. c.first()
  997. return self._find_low()
  998. def find(self, term):
  999. for c in self._cursors:
  1000. c.find(term)
  1001. return self._find_low()
  1002. def next(self):
  1003. for c in self._cursors:
  1004. c.next()
  1005. return self._find_low()
  1006. def term_info(self):
  1007. tis = [c.term_info() for c in self._low]
  1008. return combine_terminfos(tis) if tis else None
  1009. def is_valid(self):
  1010. return any(c.is_valid() for c in self._cursors)