- # Copyright 2012 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- from ast import literal_eval
- from whoosh.compat import b, bytes_type, text_type, integer_types, PY3
- from whoosh.compat import iteritems, dumps, loads, xrange
- from whoosh.codec import base
- from whoosh.matching import ListMatcher
- from whoosh.reading import TermInfo, TermNotFound
# Python 2 has no memoryview-compatible isinstance() target for the check in
# LineWriter._print_line, so substitute a dummy class that nothing is ever an
# instance of.
if not PY3:
    class memoryview:
        pass

# Value types whose repr() round-trips through ast.literal_eval, and which may
# therefore be written as "key=value" pairs in the text format.
_reprable = (bytes_type, text_type, integer_types, float)
- # Mixin classes for producing and consuming the simple text format
class LineWriter(object):
    """Mixin that emits one line of the simple text format: two spaces of
    indentation per level, a command word, then tab-separated ``key=repr``
    pairs, terminated by a newline. Expects ``self._dbfile`` to be an open
    writable file.
    """

    def _print_line(self, indent, command, **kwargs):
        out = self._dbfile
        # Indentation and the command word come first.
        out.write(b(" ") * indent)
        out.write(command.encode("latin1"))
        for key, value in iteritems(kwargs):
            # memoryview has no eval-able repr; copy it to bytes first.
            if isinstance(value, memoryview):
                value = bytes(value)
            # Only values whose repr() survives literal_eval may be written.
            if not (value is None or isinstance(value, _reprable)):
                raise TypeError(type(value))
            out.write(("\t%s=%r" % (key, value)).encode("latin1"))
        out.write(b("\n"))
class LineReader(object):
    """Mixin that parses lines of the simple text format written by
    ``LineWriter``: two spaces of indentation per level, a command word,
    then tab-separated ``key=repr`` pairs.
    """

    def __init__(self, dbfile):
        self._dbfile = dbfile

    def _reset(self):
        # Rewind to the start of the file.
        self._dbfile.seek(0)

    def _find_line(self, indent, command, **kwargs):
        """Return the parsed argument dict of the next line matching the
        given indent/command (and any keyword filters), or None if no such
        line occurs before the current section ends.
        """
        for largs in self._find_lines(indent, command, **kwargs):
            return largs

    def _find_lines(self, indent, command, **kwargs):
        """Yield the argument dicts of lines at ``indent`` with the given
        command. Deeper lines are skipped; a shallower line (or EOF/blank
        line) ends the section. If keyword filters are given, every one
        must equal the corresponding parsed argument.
        """
        while True:
            line = self._dbfile.readline()
            if not line:
                return
            c = self._parse_line(line)
            if c is None:
                return
            lindent, lcommand, largs = c
            if lindent == indent and lcommand == command:
                matched = True
                if kwargs:
                    for k in kwargs:
                        if kwargs[k] != largs.get(k):
                            matched = False
                            break
                if matched:
                    yield largs
            elif lindent < indent:
                return

    def _parse_line(self, line):
        """Parse one raw line into ``(indent, command, args)``, or return
        None for blank lines and ``#`` comments.
        """
        line = line.decode("latin1")
        line = line.rstrip()
        l = len(line)
        line = line.lstrip()
        if not line or line.startswith("#"):
            return None
        # Two spaces of leading whitespace per indentation level.
        indent = (l - len(line)) // 2
        parts = line.split("\t")
        command = parts[0]
        args = {}
        for part in parts[1:]:
            # Split on the FIRST "=" only: the repr of a value (e.g. a
            # bytestring such as b'a=b') may itself contain equals signs,
            # which used to make the unlimited split raise ValueError.
            n, v = part.split("=", 1)
            args[n] = literal_eval(v)
        return (indent, command, args)

    def _find_root(self, command):
        """Seek to the start of the file and position the reader just past
        the indent-0 root line with the given command.

        Raises Exception if the root section is missing.
        """
        self._reset()
        c = self._find_line(0, command)
        if c is None:
            raise Exception("No root section %r" % (command,))
- # Codec class
class PlainTextCodec(base.Codec):
    """Codec that stores an index as human-readable plain text, mainly
    useful for debugging. Per-document data goes in a ".dcs" file and
    term data in a ".trm" file.
    """

    # This codec does not record per-field length statistics.
    length_stats = False

    def per_document_writer(self, storage, segment):
        return PlainPerDocWriter(storage, segment)

    def field_writer(self, storage, segment):
        return PlainFieldWriter(storage, segment)

    def per_document_reader(self, storage, segment):
        return PlainPerDocReader(storage, segment)

    def terms_reader(self, storage, segment):
        return PlainTermsReader(storage, segment)

    def new_segment(self, storage, indexname):
        return PlainSegment(indexname)
class PlainPerDocWriter(base.PerDocumentWriter, LineWriter):
    """Writes per-document information (stored fields, column values, and
    term vectors) to a ".dcs" file in the plain text format.
    """

    def __init__(self, storage, segment):
        self._dbfile = storage.create_file(segment.make_filename(".dcs"))
        self._print_line(0, "DOCS")
        self.is_closed = False

    def start_doc(self, docnum):
        self._print_line(1, "DOC", dn=docnum)

    def add_field(self, fieldname, fieldobj, value, length):
        # Pickle the stored value (protocol 2) so arbitrary objects survive
        # the round trip through the text format; None stays None.
        pickled = None if value is None else dumps(value, 2)
        self._print_line(2, "DOCFIELD", fn=fieldname, v=pickled, len=length)

    def add_column_value(self, fieldname, columnobj, value):
        self._print_line(2, "COLVAL", fn=fieldname, v=value)

    def add_vector_items(self, fieldname, fieldobj, items):
        # One VECTOR header, then one VPOST line per (text, weight, value).
        self._print_line(2, "VECTOR", fn=fieldname)
        for btext, wt, vb in items:
            self._print_line(3, "VPOST", t=btext, w=wt, v=vb)

    def finish_doc(self):
        # Documents are delimited by the next DOC line; nothing to flush.
        pass

    def close(self):
        self._dbfile.close()
        self.is_closed = True
class PlainPerDocReader(base.PerDocumentReader, LineReader):
    """Reads the per-document (".dcs") file written by PlainPerDocWriter."""

    def __init__(self, storage, segment):
        self._dbfile = storage.open_file(segment.make_filename(".dcs"))
        self._segment = segment
        self.is_closed = False

    def doc_count(self):
        return self._segment.doc_count()

    def doc_count_all(self):
        # This codec never records deletions, so the totals are equal.
        return self._segment.doc_count()

    def has_deletions(self):
        return False

    def is_deleted(self, docnum):
        return False

    def deleted_docs(self):
        return frozenset()

    def _find_doc(self, docnum):
        """Position the reader just past the DOC line for ``docnum`` and
        return True, or return False if it does not exist. DOC lines are
        written in increasing docnum order, so the scan can stop early.
        """
        self._find_root("DOCS")
        c = self._find_line(1, "DOC")
        while c is not None:
            dn = c["dn"]
            if dn == docnum:
                return True
            elif dn > docnum:
                return False
            c = self._find_line(1, "DOC")
        return False

    def _iter_docs(self):
        """Yield each document number in the file, in order."""
        self._find_root("DOCS")
        c = self._find_line(1, "DOC")
        while c is not None:
            yield c["dn"]
            c = self._find_line(1, "DOC")

    def _iter_docfields(self, fieldname):
        """Yield the parsed DOCFIELD args for ``fieldname`` in every doc."""
        for _ in self._iter_docs():
            for c in self._find_lines(2, "DOCFIELD", fn=fieldname):
                yield c

    def _iter_lengths(self, fieldname):
        return (c.get("len", 0) for c in self._iter_docfields(fieldname))

    def doc_field_length(self, docnum, fieldname, default=0):
        for dn in self._iter_docs():
            if dn == docnum:
                c = self._find_line(2, "DOCFIELD", fn=fieldname)
                if c is not None:
                    return c.get("len", default)
            elif dn > docnum:
                break
        return default

    def _column_values(self, fieldname):
        """Yield the COLVAL value for ``fieldname`` for every document.

        Raises Exception if document numbers are not contiguous from 0 or a
        document is missing its column value.
        """
        for i, docnum in enumerate(self._iter_docs()):
            if i != docnum:
                raise Exception("Missing column value for field %r doc %d?"
                                % (fieldname, i))
            c = self._find_line(2, "COLVAL", fn=fieldname)
            if c is None:
                raise Exception("Missing column value for field %r doc %d?"
                                % (fieldname, docnum))
            yield c.get("v")

    def has_column(self, fieldname):
        for _ in self._column_values(fieldname):
            return True
        return False

    def column_reader(self, fieldname, column):
        # Materialize all values; fine for the debugging use of this codec.
        return list(self._column_values(fieldname))

    def field_length(self, fieldname):
        return sum(self._iter_lengths(fieldname))

    def min_field_length(self, fieldname):
        return min(self._iter_lengths(fieldname))

    def max_field_length(self, fieldname):
        return max(self._iter_lengths(fieldname))

    def has_vector(self, docnum, fieldname):
        # BUG FIX: previously the fieldname was ignored, so any vectored
        # field made this return True. The writer records fn=fieldname on
        # the VECTOR line, so filter on it.
        if self._find_doc(docnum):
            if self._find_line(2, "VECTOR", fn=fieldname):
                return True
        return False

    def vector(self, docnum, fieldname, format_):
        """Return a ListMatcher over the stored term vector for the given
        document and field. Raises Exception if the document or vector is
        missing.
        """
        if not self._find_doc(docnum):
            raise Exception("No document %r" % (docnum,))
        # BUG FIX: filter the VECTOR line by fieldname (see has_vector).
        if not self._find_line(2, "VECTOR", fn=fieldname):
            raise Exception("No vector for field %r in doc %r"
                            % (fieldname, docnum))
        ids = []
        weights = []
        values = []
        c = self._find_line(3, "VPOST")
        while c is not None:
            ids.append(c["t"])
            weights.append(c["w"])
            values.append(c["v"])
            c = self._find_line(3, "VPOST")
        return ListMatcher(ids, weights, values, format_)

    def _read_stored_fields(self):
        """Read the DOCFIELD lines of the current document into a dict of
        unpickled stored values keyed by field name.
        """
        sfs = {}
        c = self._find_line(2, "DOCFIELD")
        while c is not None:
            v = c.get("v")
            if v is not None:
                v = loads(v)
            sfs[c["fn"]] = v
            c = self._find_line(2, "DOCFIELD")
        return sfs

    def stored_fields(self, docnum):
        if not self._find_doc(docnum):
            raise Exception("No document %r" % (docnum,))
        return self._read_stored_fields()

    def iter_docs(self):
        return enumerate(self.all_stored_fields())

    def all_stored_fields(self):
        for _ in self._iter_docs():
            yield self._read_stored_fields()

    def close(self):
        self._dbfile.close()
        self.is_closed = True
class PlainFieldWriter(base.FieldWriter, LineWriter):
    """Writes field/term/posting information to a ".trm" file in the plain
    text format.
    """

    def __init__(self, storage, segment):
        self._dbfile = storage.create_file(segment.make_filename(".trm"))
        self._print_line(0, "TERMS")

    @property
    def is_closed(self):
        return self._dbfile.is_closed

    def start_field(self, fieldname, fieldobj):
        self._fieldobj = fieldobj
        self._print_line(1, "TERMFIELD", fn=fieldname)

    def start_term(self, btext):
        # Accumulate per-term statistics while postings are added.
        self._terminfo = TermInfo()
        self._print_line(2, "BTEXT", t=btext)

    def add(self, docnum, weight, vbytes, length):
        self._terminfo.add_posting(docnum, weight, length)
        self._print_line(3, "POST", dn=docnum, w=weight, v=vbytes)

    def finish_term(self):
        # Write the accumulated statistics as a TERMINFO line after the
        # term's POST lines.
        info = self._terminfo
        stats = {
            "df": info.doc_frequency(),
            "weight": info.weight(),
            "minlength": info.min_length(),
            "maxlength": info.max_length(),
            "maxweight": info.max_weight(),
            "minid": info.min_id(),
            "maxid": info.max_id(),
        }
        self._print_line(3, "TERMINFO", **stats)

    def add_spell_word(self, fieldname, text):
        self._print_line(2, "SPELL", fn=fieldname, t=text)

    def close(self):
        self._dbfile.close()
class PlainTermsReader(base.TermsReader, LineReader):
    """Reads the terms (".trm") file written by PlainFieldWriter."""

    def __init__(self, storage, segment):
        self._dbfile = storage.open_file(segment.make_filename(".trm"))
        self._segment = segment
        self.is_closed = False

    def _find_field(self, fieldname):
        """Position the reader just past the TERMFIELD line for
        ``fieldname``, raising TermNotFound if the field is absent.
        """
        self._find_root("TERMS")
        if self._find_line(1, "TERMFIELD", fn=fieldname) is None:
            raise TermNotFound("No field %r" % fieldname)

    def _iter_fields(self):
        """Yield the name of each indexed field in file order."""
        # BUG FIX: _find_root() was called without its required ``command``
        # argument, which raised TypeError; pass the root section name as
        # the other callers do.
        self._find_root("TERMS")
        c = self._find_line(1, "TERMFIELD")
        while c is not None:
            yield c["fn"]
            c = self._find_line(1, "TERMFIELD")

    def _iter_btexts(self):
        """Yield the encoded term bytes of the current field in order."""
        c = self._find_line(2, "BTEXT")
        while c is not None:
            yield c["t"]
            c = self._find_line(2, "BTEXT")

    def _find_term(self, fieldname, btext):
        """Position the reader just past the BTEXT line for the given term
        and return True, or return False if the term does not exist. Terms
        are written in sorted order, so the scan can stop early.
        """
        self._find_field(fieldname)
        for t in self._iter_btexts():
            if t == btext:
                return True
            elif t > btext:
                break
        return False

    def _find_terminfo(self):
        # The TERMINFO line's keys match TermInfo's keyword arguments.
        c = self._find_line(3, "TERMINFO")
        return TermInfo(**c)

    def __contains__(self, term):
        fieldname, btext = term
        return self._find_term(fieldname, btext)

    def indexed_field_names(self):
        return self._iter_fields()

    def terms(self):
        # NOTE(review): _iter_fields and _iter_btexts share the file
        # cursor, so exhausting _iter_btexts consumes the next TERMFIELD
        # line before _iter_fields resumes — verify behavior with multiple
        # fields before relying on this for anything but debugging.
        for fieldname in self._iter_fields():
            for btext in self._iter_btexts():
                yield (fieldname, btext)

    def terms_from(self, fieldname, prefix):
        """Yield ``(fieldname, btext)`` for every term in the field whose
        bytes sort at or after ``prefix``.
        """
        self._find_field(fieldname)
        for btext in self._iter_btexts():
            if btext < prefix:
                continue
            yield (fieldname, btext)

    def items(self):
        for fieldname, btext in self.terms():
            yield (fieldname, btext), self._find_terminfo()

    def items_from(self, fieldname, prefix):
        for term in self.terms_from(fieldname, prefix):
            yield term, self._find_terminfo()

    def term_info(self, fieldname, btext):
        if not self._find_term(fieldname, btext):
            raise TermNotFound((fieldname, btext))
        return self._find_terminfo()

    def matcher(self, fieldname, btext, format_, scorer=None):
        """Return a ListMatcher over the postings of the given term,
        raising TermNotFound if the term does not exist.
        """
        if not self._find_term(fieldname, btext):
            raise TermNotFound((fieldname, btext))
        ids = []
        weights = []
        values = []
        c = self._find_line(3, "POST")
        while c is not None:
            ids.append(c["dn"])
            weights.append(c["w"])
            values.append(c["v"])
            c = self._find_line(3, "POST")
        return ListMatcher(ids, weights, values, format_, scorer=scorer)

    def close(self):
        self._dbfile.close()
        self.is_closed = True
class PlainSegment(base.Segment):
    """Segment implementation for the plain-text codec. Tracks only a
    document count; deletions and file assembly are not supported.
    """

    def __init__(self, indexname):
        base.Segment.__init__(self, indexname)
        self._doccount = 0

    def codec(self):
        return PlainTextCodec()

    def set_doc_count(self, doccount):
        self._doccount = doccount

    def doc_count(self):
        return self._doccount

    def should_assemble(self):
        # The plain text files are left as-is; never combine them.
        return False
|