plaintext.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452
  1. # Copyright 2012 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. from ast import literal_eval
  28. from whoosh.compat import b, bytes_type, text_type, integer_types, PY3
  29. from whoosh.compat import iteritems, dumps, loads, xrange
  30. from whoosh.codec import base
  31. from whoosh.matching import ListMatcher
  32. from whoosh.reading import TermInfo, TermNotFound
# When PY3 is false, define an empty module-level class named ``memoryview``
# so that the ``isinstance(v, memoryview)`` test in LineWriter._print_line
# always has a name to check against.
# NOTE(review): presumably meant for interpreters lacking a usable builtin
# memoryview -- confirm, since this also shadows any existing builtin.
if not PY3:
    class memoryview:
        pass

# Value types that LineWriter._print_line accepts and serializes with repr();
# any other non-None value makes it raise TypeError.
_reprable = (bytes_type, text_type, integer_types, float)
  37. # Mixin classes for producing and consuming the simple text format
  38. class LineWriter(object):
  39. def _print_line(self, indent, command, **kwargs):
  40. self._dbfile.write(b(" ") * indent)
  41. self._dbfile.write(command.encode("latin1"))
  42. for k, v in iteritems(kwargs):
  43. if isinstance(v, memoryview):
  44. v = bytes(v)
  45. if v is not None and not isinstance(v, _reprable):
  46. raise TypeError(type(v))
  47. self._dbfile.write(("\t%s=%r" % (k, v)).encode("latin1"))
  48. self._dbfile.write(b("\n"))
  49. class LineReader(object):
  50. def __init__(self, dbfile):
  51. self._dbfile = dbfile
  52. def _reset(self):
  53. self._dbfile.seek(0)
  54. def _find_line(self, indent, command, **kwargs):
  55. for largs in self._find_lines(indent, command, **kwargs):
  56. return largs
  57. def _find_lines(self, indent, command, **kwargs):
  58. while True:
  59. line = self._dbfile.readline()
  60. if not line:
  61. return
  62. c = self._parse_line(line)
  63. if c is None:
  64. return
  65. lindent, lcommand, largs = c
  66. if lindent == indent and lcommand == command:
  67. matched = True
  68. if kwargs:
  69. for k in kwargs:
  70. if kwargs[k] != largs.get(k):
  71. matched = False
  72. break
  73. if matched:
  74. yield largs
  75. elif lindent < indent:
  76. return
  77. def _parse_line(self, line):
  78. line = line.decode("latin1")
  79. line = line.rstrip()
  80. l = len(line)
  81. line = line.lstrip()
  82. if not line or line.startswith("#"):
  83. return None
  84. indent = (l - len(line)) // 2
  85. parts = line.split("\t")
  86. command = parts[0]
  87. args = {}
  88. for i in xrange(1, len(parts)):
  89. n, v = parts[i].split("=")
  90. args[n] = literal_eval(v)
  91. return (indent, command, args)
  92. def _find_root(self, command):
  93. self._reset()
  94. c = self._find_line(0, command)
  95. if c is None:
  96. raise Exception("No root section %r" % (command,))
  97. # Codec class
  98. class PlainTextCodec(base.Codec):
  99. length_stats = False
  100. def per_document_writer(self, storage, segment):
  101. return PlainPerDocWriter(storage, segment)
  102. def field_writer(self, storage, segment):
  103. return PlainFieldWriter(storage, segment)
  104. def per_document_reader(self, storage, segment):
  105. return PlainPerDocReader(storage, segment)
  106. def terms_reader(self, storage, segment):
  107. return PlainTermsReader(storage, segment)
  108. def new_segment(self, storage, indexname):
  109. return PlainSegment(indexname)
  110. class PlainPerDocWriter(base.PerDocumentWriter, LineWriter):
  111. def __init__(self, storage, segment):
  112. self._dbfile = storage.create_file(segment.make_filename(".dcs"))
  113. self._print_line(0, "DOCS")
  114. self.is_closed = False
  115. def start_doc(self, docnum):
  116. self._print_line(1, "DOC", dn=docnum)
  117. def add_field(self, fieldname, fieldobj, value, length):
  118. if value is not None:
  119. value = dumps(value, 2)
  120. self._print_line(2, "DOCFIELD", fn=fieldname, v=value, len=length)
  121. def add_column_value(self, fieldname, columnobj, value):
  122. self._print_line(2, "COLVAL", fn=fieldname, v=value)
  123. def add_vector_items(self, fieldname, fieldobj, items):
  124. self._print_line(2, "VECTOR", fn=fieldname)
  125. for text, weight, vbytes in items:
  126. self._print_line(3, "VPOST", t=text, w=weight, v=vbytes)
  127. def finish_doc(self):
  128. pass
  129. def close(self):
  130. self._dbfile.close()
  131. self.is_closed = True
  132. class PlainPerDocReader(base.PerDocumentReader, LineReader):
  133. def __init__(self, storage, segment):
  134. self._dbfile = storage.open_file(segment.make_filename(".dcs"))
  135. self._segment = segment
  136. self.is_closed = False
  137. def doc_count(self):
  138. return self._segment.doc_count()
  139. def doc_count_all(self):
  140. return self._segment.doc_count()
  141. def has_deletions(self):
  142. return False
  143. def is_deleted(self, docnum):
  144. return False
  145. def deleted_docs(self):
  146. return frozenset()
  147. def _find_doc(self, docnum):
  148. self._find_root("DOCS")
  149. c = self._find_line(1, "DOC")
  150. while c is not None:
  151. dn = c["dn"]
  152. if dn == docnum:
  153. return True
  154. elif dn > docnum:
  155. return False
  156. c = self._find_line(1, "DOC")
  157. return False
  158. def _iter_docs(self):
  159. self._find_root("DOCS")
  160. c = self._find_line(1, "DOC")
  161. while c is not None:
  162. yield c["dn"]
  163. c = self._find_line(1, "DOC")
  164. def _iter_docfields(self, fieldname):
  165. for _ in self._iter_docs():
  166. for c in self._find_lines(2, "DOCFIELD", fn=fieldname):
  167. yield c
  168. def _iter_lengths(self, fieldname):
  169. return (c.get("len", 0) for c in self._iter_docfields(fieldname))
  170. def doc_field_length(self, docnum, fieldname, default=0):
  171. for dn in self._iter_docs():
  172. if dn == docnum:
  173. c = self._find_line(2, "DOCFIELD", fn=fieldname)
  174. if c is not None:
  175. return c.get("len", default)
  176. elif dn > docnum:
  177. break
  178. return default
  179. def _column_values(self, fieldname):
  180. for i, docnum in enumerate(self._iter_docs()):
  181. if i != docnum:
  182. raise Exception("Missing column value for field %r doc %d?"
  183. % (fieldname, i))
  184. c = self._find_line(2, "COLVAL", fn=fieldname)
  185. if c is None:
  186. raise Exception("Missing column value for field %r doc %d?"
  187. % (fieldname, docnum))
  188. yield c.get("v")
  189. def has_column(self, fieldname):
  190. for _ in self._column_values(fieldname):
  191. return True
  192. return False
  193. def column_reader(self, fieldname, column):
  194. return list(self._column_values(fieldname))
  195. def field_length(self, fieldname):
  196. return sum(self._iter_lengths(fieldname))
  197. def min_field_length(self, fieldname):
  198. return min(self._iter_lengths(fieldname))
  199. def max_field_length(self, fieldname):
  200. return max(self._iter_lengths(fieldname))
  201. def has_vector(self, docnum, fieldname):
  202. if self._find_doc(docnum):
  203. if self._find_line(2, "VECTOR"):
  204. return True
  205. return False
  206. def vector(self, docnum, fieldname, format_):
  207. if not self._find_doc(docnum):
  208. raise Exception
  209. if not self._find_line(2, "VECTOR"):
  210. raise Exception
  211. ids = []
  212. weights = []
  213. values = []
  214. c = self._find_line(3, "VPOST")
  215. while c is not None:
  216. ids.append(c["t"])
  217. weights.append(c["w"])
  218. values.append(c["v"])
  219. c = self._find_line(3, "VPOST")
  220. return ListMatcher(ids, weights, values, format_,)
  221. def _read_stored_fields(self):
  222. sfs = {}
  223. c = self._find_line(2, "DOCFIELD")
  224. while c is not None:
  225. v = c.get("v")
  226. if v is not None:
  227. v = loads(v)
  228. sfs[c["fn"]] = v
  229. c = self._find_line(2, "DOCFIELD")
  230. return sfs
  231. def stored_fields(self, docnum):
  232. if not self._find_doc(docnum):
  233. raise Exception
  234. return self._read_stored_fields()
  235. def iter_docs(self):
  236. return enumerate(self.all_stored_fields())
  237. def all_stored_fields(self):
  238. for _ in self._iter_docs():
  239. yield self._read_stored_fields()
  240. def close(self):
  241. self._dbfile.close()
  242. self.is_closed = True
  243. class PlainFieldWriter(base.FieldWriter, LineWriter):
  244. def __init__(self, storage, segment):
  245. self._dbfile = storage.create_file(segment.make_filename(".trm"))
  246. self._print_line(0, "TERMS")
  247. @property
  248. def is_closed(self):
  249. return self._dbfile.is_closed
  250. def start_field(self, fieldname, fieldobj):
  251. self._fieldobj = fieldobj
  252. self._print_line(1, "TERMFIELD", fn=fieldname)
  253. def start_term(self, btext):
  254. self._terminfo = TermInfo()
  255. self._print_line(2, "BTEXT", t=btext)
  256. def add(self, docnum, weight, vbytes, length):
  257. self._terminfo.add_posting(docnum, weight, length)
  258. self._print_line(3, "POST", dn=docnum, w=weight, v=vbytes)
  259. def finish_term(self):
  260. ti = self._terminfo
  261. self._print_line(3, "TERMINFO",
  262. df=ti.doc_frequency(), weight=ti.weight(),
  263. minlength=ti.min_length(), maxlength=ti.max_length(),
  264. maxweight=ti.max_weight(),
  265. minid=ti.min_id(), maxid=ti.max_id())
  266. def add_spell_word(self, fieldname, text):
  267. self._print_line(2, "SPELL", fn=fieldname, t=text)
  268. def close(self):
  269. self._dbfile.close()
  270. class PlainTermsReader(base.TermsReader, LineReader):
  271. def __init__(self, storage, segment):
  272. self._dbfile = storage.open_file(segment.make_filename(".trm"))
  273. self._segment = segment
  274. self.is_closed = False
  275. def _find_field(self, fieldname):
  276. self._find_root("TERMS")
  277. if self._find_line(1, "TERMFIELD", fn=fieldname) is None:
  278. raise TermNotFound("No field %r" % fieldname)
  279. def _iter_fields(self):
  280. self._find_root()
  281. c = self._find_line(1, "TERMFIELD")
  282. while c is not None:
  283. yield c["fn"]
  284. c = self._find_line(1, "TERMFIELD")
  285. def _iter_btexts(self):
  286. c = self._find_line(2, "BTEXT")
  287. while c is not None:
  288. yield c["t"]
  289. c = self._find_line(2, "BTEXT")
  290. def _find_term(self, fieldname, btext):
  291. self._find_field(fieldname)
  292. for t in self._iter_btexts():
  293. if t == btext:
  294. return True
  295. elif t > btext:
  296. break
  297. return False
  298. def _find_terminfo(self):
  299. c = self._find_line(3, "TERMINFO")
  300. return TermInfo(**c)
  301. def __contains__(self, term):
  302. fieldname, btext = term
  303. return self._find_term(fieldname, btext)
  304. def indexed_field_names(self):
  305. return self._iter_fields()
  306. def terms(self):
  307. for fieldname in self._iter_fields():
  308. for btext in self._iter_btexts():
  309. yield (fieldname, btext)
  310. def terms_from(self, fieldname, prefix):
  311. self._find_field(fieldname)
  312. for btext in self._iter_btexts():
  313. if btext < prefix:
  314. continue
  315. yield (fieldname, btext)
  316. def items(self):
  317. for fieldname, btext in self.terms():
  318. yield (fieldname, btext), self._find_terminfo()
  319. def items_from(self, fieldname, prefix):
  320. for fieldname, btext in self.terms_from(fieldname, prefix):
  321. yield (fieldname, btext), self._find_terminfo()
  322. def term_info(self, fieldname, btext):
  323. if not self._find_term(fieldname, btext):
  324. raise TermNotFound((fieldname, btext))
  325. return self._find_terminfo()
  326. def matcher(self, fieldname, btext, format_, scorer=None):
  327. if not self._find_term(fieldname, btext):
  328. raise TermNotFound((fieldname, btext))
  329. ids = []
  330. weights = []
  331. values = []
  332. c = self._find_line(3, "POST")
  333. while c is not None:
  334. ids.append(c["dn"])
  335. weights.append(c["w"])
  336. values.append(c["v"])
  337. c = self._find_line(3, "POST")
  338. return ListMatcher(ids, weights, values, format_, scorer=scorer)
  339. def close(self):
  340. self._dbfile.close()
  341. self.is_closed = True
  342. class PlainSegment(base.Segment):
  343. def __init__(self, indexname):
  344. base.Segment.__init__(self, indexname)
  345. self._doccount = 0
  346. def codec(self):
  347. return PlainTextCodec()
  348. def set_doc_count(self, doccount):
  349. self._doccount = doccount
  350. def doc_count(self):
  351. return self._doccount
  352. def should_assemble(self):
  353. return False