# Copyright 2010 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
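"""Benchmarking harness for comparing full-text search libraries.

This module defines a ``Module`` adapter for each supported engine
(Whoosh, Xappy, Xapian, Solr, ZCatalog, and Nucular), a ``Spec`` base
class describing a document corpus, and a ``Bench`` driver that parses
command-line options and runs indexing or searching.

A minimal usage sketch (``MySpec`` is hypothetical; a concrete Spec
subclass must supply ``name``, ``documents()``, and whatever per-library
hooks the chosen adapter calls, such as ``whoosh_schema()``):

    class MySpec(Spec):
        name = "example"

        def whoosh_schema(self):
            ...

        def documents(self):
            ...

    if __name__ == "__main__":
        Bench().run(MySpec)
"""
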
# print_function makes the multi-argument print() calls below behave the
# same on Python 2 and Python 3
from __future__ import division, print_function

import os.path
from optparse import OptionParser
from shutil import rmtree

from whoosh import index, qparser, query, scoring
from whoosh.util import now, find_object
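
# Optional backends. Each import is wrapped in try/except so this module
# still loads when a library is not installed; selecting an uninstalled
# library on the command line will then fail with a NameError.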
try:
    import xappy
except ImportError:
    pass

try:
    import xapian
except ImportError:
    pass

try:
    import pysolr
except ImportError:
    pass

try:
    from persistent import Persistent

    class ZDoc(Persistent):
        def __init__(self, d):
            self.__dict__.update(d)
except ImportError:
    pass


class Module(object):
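    """Base class for library adapters.

    Subclasses translate the generic indexer/searcher operations onto a
    particular search library. Methods that raise NotImplementedError
    must be overridden; the others are optional hooks.
    """
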
    def __init__(self, bench, options, args):
        self.bench = bench
        self.options = options
        self.args = args

    def __repr__(self):
        return self.__class__.__name__

    def indexer(self, **kwargs):
        pass

    def index_document(self, d):
        raise NotImplementedError

    def finish(self, **kwargs):
        pass

    def _process_result(self, d):
        # Look up a per-library result processor on the spec the first
        # time this is called, then cache it on the instance so later
        # calls skip the hasattr check
        attrname = "process_result_%s" % self.options.lib
        if hasattr(self.bench.spec, attrname):
            method = getattr(self.bench.spec, attrname)
            self._process_result = method
            return method(d)
        else:
            self._process_result = lambda x: x
            return d

    def searcher(self):
        pass

    def query(self):
        raise NotImplementedError

    def find(self, q):
        raise NotImplementedError

    def findterms(self, terms):
        raise NotImplementedError

    def results(self, r):
        for hit in r:
            yield self._process_result(hit)


class Spec(object):
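    """Base class describing a document corpus to benchmark.

    Concrete specs must provide documents() and the hooks the chosen
    adapter needs (for example whoosh_schema() or zcatalog_setup()),
    and may provide show_snippet() for the --snippets option.
    """
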
    headline_field = "title"
    main_field = "body"

    def __init__(self, options, args):
        self.options = options
        self.args = args

    def documents(self):
        raise NotImplementedError

    def setup(self):
        pass

    def print_results(self, ls):
        showbody = self.options.showbody
        snippets = self.options.snippets
        # Option values arrive as strings from the command line
        limit = int(self.options.limit)
        for i, hit in enumerate(ls):
            if i >= limit:
                break

            print("%d. %s" % (i + 1, hit.get(self.headline_field)))
            if snippets:
                print(self.show_snippet(hit))
            if showbody:
                print(hit.get(self.main_field))


class WhooshModule(Module):
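    """Adapter for the Whoosh search library."""
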
    def indexer(self, create=True):
        schema = self.bench.spec.whoosh_schema()
        path = os.path.join(self.options.dir,
                            "%s_whoosh" % self.options.indexname)
        if not os.path.exists(path):
            os.mkdir(path)
        if create:
            ix = index.create_in(path, schema)
        else:
            ix = index.open_dir(path)

        poolclass = None
        if self.options.pool:
            poolclass = find_object(self.options.pool)

        self.writer = ix.writer(limitmb=int(self.options.limitmb),
                                poolclass=poolclass,
                                dir=self.options.tempdir,
                                procs=int(self.options.procs),
                                batchsize=int(self.options.batch),
                                multisegment=self.options.xms)
        self._procdoc = None
        if hasattr(self.bench.spec, "process_document_whoosh"):
            self._procdoc = self.bench.spec.process_document_whoosh

    def index_document(self, d):
        _procdoc = self._procdoc
        if _procdoc:
            _procdoc(d)
        self.writer.add_document(**d)

    def finish(self, merge=True, optimize=False):
        self.writer.commit(merge=merge, optimize=optimize)

    def searcher(self):
        path = os.path.join(self.options.dir,
                            "%s_whoosh" % self.options.indexname)
        ix = index.open_dir(path)
        self.srch = ix.searcher(weighting=scoring.PL2())
        self.parser = qparser.QueryParser(self.bench.spec.main_field,
                                          schema=ix.schema)

    def query(self):
        # On Python 3 the args are already text; only decode byte strings
        qstring = " ".join(self.args)
        if isinstance(qstring, bytes):
            qstring = qstring.decode("utf-8")
        return self.parser.parse(qstring)

    def find(self, q):
        return self.srch.search(q, limit=int(self.options.limit),
                                optimize=self.options.optimize)

    def findterms(self, terms):
        limit = int(self.options.limit)
        s = self.srch
        # Reuse a single Term query object, swapping in each term
        q = query.Term(self.bench.spec.main_field, None)
        for term in terms:
            q.text = term
            yield s.search(q, limit=limit)


class XappyModule(Module):
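    """Adapter for the Xappy wrapper around Xapian.

    Note that these methods take the connection returned by indexer()
    as an explicit argument, unlike the other adapters.
    """
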
    def indexer(self, **kwargs):
        path = os.path.join(self.options.dir,
                            "%s_xappy" % self.options.indexname)
        conn = self.bench.spec.xappy_connection(path)
        return conn

    def index_document(self, conn, d):
        if hasattr(self.bench, "process_document_xappy"):
            self.bench.process_document_xappy(d)
        doc = xappy.UnprocessedDocument()
        # d maps field names to a value or list of values
        for key, values in d.items():
            if not isinstance(values, list):
                values = [values]
            for value in values:
                doc.fields.append(xappy.Field(key, value))
        conn.add(doc)

    def finish(self, conn):
        conn.flush()

    def searcher(self):
        path = os.path.join(self.options.dir,
                            "%s_xappy" % self.options.indexname)
        return xappy.SearchConnection(path)

    def query(self, conn):
        return conn.query_parse(" ".join(self.args))

    def find(self, conn, q):
        return conn.search(q, 0, int(self.options.limit))

    def findterms(self, conn, terms):
        limit = int(self.options.limit)
        for term in terms:
            q = conn.query_field(self.bench.spec.main_field, term)
            yield conn.search(q, 0, limit)

    def results(self, r):
        hf = self.bench.spec.headline_field
        mf = self.bench.spec.main_field
        for hit in r:
            yield self._process_result({hf: hit.data[hf], mf: hit.data[mf]})


class XapianModule(Module):
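    """Adapter for the Xapian engine via its Python bindings."""
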
    def indexer(self, **kwargs):
        path = os.path.join(self.options.dir,
                            "%s_xapian" % self.options.indexname)
        self.database = xapian.WritableDatabase(path, xapian.DB_CREATE_OR_OPEN)
        self.ixer = xapian.TermGenerator()

    def index_document(self, d):
        if hasattr(self.bench, "process_document_xapian"):
            self.bench.process_document_xapian(d)
        doc = xapian.Document()
        doc.add_value(0, d.get(self.bench.spec.headline_field, "-"))
        doc.set_data(d[self.bench.spec.main_field])
        self.ixer.set_document(doc)
        self.ixer.index_text(d[self.bench.spec.main_field])
        self.database.add_document(doc)

    def finish(self, **kwargs):
        self.database.flush()

    def searcher(self):
        # Must open the "_xapian" directory written by indexer(), not the
        # Xappy module's "_xappy" directory
        path = os.path.join(self.options.dir,
                            "%s_xapian" % self.options.indexname)
        self.db = xapian.Database(path)
        self.enq = xapian.Enquire(self.db)
        self.qp = xapian.QueryParser()
        self.qp.set_database(self.db)

    def query(self):
        return self.qp.parse_query(" ".join(self.args))

    def find(self, q):
        self.enq.set_query(q)
        return self.enq.get_mset(0, int(self.options.limit))

    def findterms(self, terms):
        limit = int(self.options.limit)
        for term in terms:
            q = self.qp.parse_query(term)
            self.enq.set_query(q)
            yield self.enq.get_mset(0, limit)

    def results(self, matches):
        hf = self.bench.spec.headline_field
        mf = self.bench.spec.main_field
        for m in matches:
            yield self._process_result({hf: m.document.get_value(0),
                                        mf: m.document.get_data()})


class SolrModule(Module):
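    """Adapter for Apache Solr via the pysolr client, batching adds."""
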
    def indexer(self, **kwargs):
        self.solr_doclist = []
        self.conn = pysolr.Solr(self.options.url)
        # Delete-by-query: pysolr's delete() takes the query as the q
        # keyword (a bare positional argument would be treated as an id)
        self.conn.delete(q="*:*")
        self.conn.commit()

    def index_document(self, d):
        self.solr_doclist.append(d)
        if len(self.solr_doclist) >= int(self.options.batch):
            self.conn.add(self.solr_doclist, commit=False)
            self.solr_doclist = []

    def finish(self, **kwargs):
        if self.solr_doclist:
            self.conn.add(self.solr_doclist)
        del self.solr_doclist
        self.conn.optimize(block=True)

    def searcher(self):
        self.solr = pysolr.Solr(self.options.url)

    def query(self):
        return " ".join(self.args)

    def find(self, q):
        # pysolr forwards extra keyword arguments as Solr query
        # parameters; rows is Solr's result-count limit
        return self.solr.search(q, rows=int(self.options.limit))

    def findterms(self, terms):
        limit = int(self.options.limit)
        for term in terms:
            # Assumes the main field is named "body"
            yield self.solr.search("body:" + term, rows=limit)


class ZcatalogModule(Module):
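    """Adapter for zcatalog over ZODB.

    Requires the persistent package, so that the ZDoc wrapper class
    defined near the top of this module is available.
    """
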
    def indexer(self, **kwargs):
        from ZODB.FileStorage import FileStorage  # @UnresolvedImport
        from ZODB.DB import DB  # @UnresolvedImport
        from zcatalog import catalog  # @UnresolvedImport
        from zcatalog import indexes  # @UnresolvedImport
        import transaction  # @UnresolvedImport

        dir = os.path.join(self.options.dir,
                           "%s_zcatalog" % self.options.indexname)
        if os.path.exists(dir):
            rmtree(dir)
        os.mkdir(dir)

        storage = FileStorage(os.path.join(dir, "index"))
        db = DB(storage)
        conn = db.open()

        self.cat = catalog.Catalog()
        self.bench.spec.zcatalog_setup(self.cat)
        conn.root()["cat"] = self.cat
        transaction.commit()

        self.zcatalog_count = 0

    def index_document(self, d):
        if hasattr(self.bench, "process_document_zcatalog"):
            self.bench.process_document_zcatalog(d)
        doc = ZDoc(d)
        self.cat.index_doc(doc)
        self.zcatalog_count += 1
        # Commit in batches of 100 to keep transactions small
        if self.zcatalog_count >= 100:
            import transaction  # @UnresolvedImport
            transaction.commit()
            self.zcatalog_count = 0

    def finish(self, **kwargs):
        import transaction  # @UnresolvedImport
        transaction.commit()
        del self.zcatalog_count

    def searcher(self):
        from ZODB.FileStorage import FileStorage  # @UnresolvedImport
        from ZODB.DB import DB  # @UnresolvedImport
        # The zcatalog imports look unused but are kept so the catalog's
        # classes are importable when the stored catalog is loaded
        from zcatalog import catalog  # @UnresolvedImport
        from zcatalog import indexes  # @UnresolvedImport
        import transaction  # @UnresolvedImport

        path = os.path.join(self.options.dir,
                            "%s_zcatalog" % self.options.indexname, "index")
        storage = FileStorage(path)
        db = DB(storage)
        conn = db.open()
        self.cat = conn.root()["cat"]

    def query(self):
        return " ".join(self.args)

    def find(self, q):
        return self.cat.searchResults(body=q)

    def findterms(self, terms):
        for term in terms:
            yield self.cat.searchResults(body=term)

    def results(self, r):
        hf = self.bench.spec.headline_field
        mf = self.bench.spec.main_field
        for hit in r:
            # Have to access the attributes for them to be retrieved
            yield self._process_result({hf: getattr(hit, hf),
                                        mf: getattr(hit, mf)})


class NucularModule(Module):
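    """Adapter for the Nucular full-text indexing library."""
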
    def indexer(self, create=True):
        import shutil
        from nucular import Nucular

        dir = os.path.join(self.options.dir,
                           "%s_nucular" % self.options.indexname)
        if create:
            if os.path.exists(dir):
                shutil.rmtree(dir)
            os.mkdir(dir)
        self.archive = Nucular.Nucular(dir)
        if create:
            self.archive.create()
        self.count = 0

    def index_document(self, d):
        try:
            self.archive.indexDictionary(str(self.count), d)
        except ValueError:
            print("d=", d)
            raise
        self.count += 1
        # Periodically store pending entries and reopen the archive
        if not self.count % int(self.options.batch):
            self.archive.store(lazy=True)
            self.indexer(create=False)

    def finish(self, **kwargs):
        self.archive.store(lazy=False)
        self.archive.aggregateRecent(fast=False, verbose=True)
        self.archive.moveTransientToBase(verbose=True)
        self.archive.cleanUp()

    def searcher(self):
        from nucular import Nucular

        dir = os.path.join(self.options.dir,
                           "%s_nucular" % self.options.indexname)
        self.archive = Nucular.Nucular(dir)

    def query(self):
        return " ".join(self.args)

    def find(self, q):
        return self.archive.dictionaries(q)

    def findterms(self, terms):
        for term in terms:
            q = self.archive.Query()
            q.anyWord(term)
            yield q.resultDictionaries()


class Bench(object):
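    """Command-line driver: parses options, builds the chosen adapter,
    and dispatches to the index/search/search_file actions.
    """
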
    libs = {"whoosh": WhooshModule, "xappy": XappyModule,
            "xapian": XapianModule, "solr": SolrModule,
            "zcatalog": ZcatalogModule, "nucular": NucularModule}
    def index(self, lib):
        print("Indexing with %s..." % lib)

        options = self.options
        every = None if options.every is None else int(options.every)
        merge = options.merge
        chunk = int(options.chunk)
        skip = int(options.skip)
        upto = int(options.upto)
        count = 0
        skipc = skip

        starttime = chunkstarttime = now()

        lib.indexer()
        for d in self.spec.documents():
            skipc -= 1
            if not skipc:
                lib.index_document(d)
                count += 1
                skipc = skip

                if chunk and not count % chunk:
                    t = now()
                    sofar = t - starttime
                    print("Done %d docs, %0.3f secs for %d, %0.3f total, "
                          "%0.3f docs/s" % (count, t - chunkstarttime,
                                            chunk, sofar, count / sofar))
                    chunkstarttime = t

                if count > upto:
                    break

                if every and not count % every:
                    print("----Commit")
                    lib.finish(merge=merge)
                    lib.indexer(create=False)

        spooltime = now()
        print("Spool time:", spooltime - starttime)
        lib.finish(merge=merge)
        committime = now()
        print("Commit time:", committime - spooltime)
        totaltime = committime - starttime
        print("Total time to index %d documents: %0.3f secs (%0.3f minutes)"
              % (count, totaltime, totaltime / 60.0))
        print("Indexed %0.3f docs/s" % (count / totaltime))
    def search(self, lib):
        lib.searcher()

        t = now()
        q = lib.query()
        print("Query:", q)
        r = lib.find(q)
        print("Search time:", now() - t)

        t = now()
        self.spec.print_results(lib.results(r))
        print("Print time:", now() - t)
    def search_file(self, lib):
        # Read the terms as text so they match the str values the
        # adapters expect
        with open(self.options.termfile, "r") as f:
            terms = [line.strip() for line in f]

        print("Searching %d terms with %s" % (len(terms), lib))
        lib.searcher()
        starttime = now()
        for r in lib.findterms(terms):
            pass
        searchtime = now() - starttime
        print("Search time:", searchtime,
              "searches/s:", float(len(terms)) / searchtime)
    def _parser(self, name):
        p = OptionParser()
        p.add_option("-x", "--lib", dest="lib",
                     help="Name of the library to use to index/search.",
                     default="whoosh")
        p.add_option("-d", "--dir", dest="dir", metavar="DIRNAME",
                     help="Directory in which to store index.", default=".")
        p.add_option("-s", "--setup", dest="setup", action="store_true",
                     help="Set up any support files or caches.",
                     default=False)
        p.add_option("-i", "--index", dest="index", action="store_true",
                     help="Index the documents.", default=False)
        p.add_option("-n", "--name", dest="indexname", metavar="PREFIX",
                     help="Index name prefix.", default="%s_index" % name)
        p.add_option("-U", "--url", dest="url", metavar="URL",
                     help="Solr URL", default="http://localhost:8983/solr")
        p.add_option("-m", "--mb", dest="limitmb",
                     help="Max. memory usage, in MB", default="128")
        p.add_option("-c", "--chunk", dest="chunk",
                     help="Number of documents to index between progress "
                          "messages.", default=1000)
        p.add_option("-B", "--batch", dest="batch",
                     help="Batch size for batch adding documents.",
                     default=1000)
        p.add_option("-k", "--skip", dest="skip", metavar="N",
                     help="Index every Nth document.", default=1)
        p.add_option("-e", "--commit-every", dest="every", metavar="NUM",
                     help="Commit every NUM documents", default=None)
        p.add_option("-M", "--no-merge", dest="merge", action="store_false",
                     help="Don't merge segments when doing multiple commits",
                     default=True)
        p.add_option("-u", "--upto", dest="upto", metavar="N",
                     help="Index up to this document number.", default=600000)
        p.add_option("-p", "--procs", dest="procs", metavar="NUMBER",
                     help="Number of processors to use.", default=0)
        p.add_option("-l", "--limit", dest="limit", metavar="N",
                     help="Maximum number of search results to retrieve.",
                     default=10)
        p.add_option("-b", "--body", dest="showbody", action="store_true",
                     help="Show the body text in search results.",
                     default=False)
        p.add_option("-g", "--gen", dest="generate", metavar="N",
                     help="Generate a list of at most N terms present in "
                          "all libraries.", default=None)
        p.add_option("-f", "--file", dest="termfile", metavar="FILENAME",
                     help="Search using the list of terms in this file.",
                     default=None)
        p.add_option("-t", "--tempdir", dest="tempdir", metavar="DIRNAME",
                     help="Whoosh temp dir", default=None)
        p.add_option("-P", "--pool", dest="pool", metavar="CLASSNAME",
                     help="Whoosh pool class", default=None)
        p.add_option("-X", "--xms", dest="xms", action="store_true",
                     help="Experimental Whoosh feature", default=False)
        p.add_option("-Z", "--storebody", dest="storebody",
                     action="store_true",
                     help="Store the body text in index", default=False)
        p.add_option("-q", "--snippets", dest="snippets",
                     action="store_true",
                     help="Show highlighted snippets", default=False)
        p.add_option("-O", "--no-optimize", dest="optimize",
                     action="store_false",
                     help="Turn off searcher optimization", default=True)

        return p
    def run(self, specclass):
        parser = self._parser(specclass.name)
        options, args = parser.parse_args()
        self.options = options
        self.args = args

        if options.lib not in self.libs:
            raise Exception("Unknown library: %r" % options.lib)
        lib = self.libs[options.lib](self, options, args)

        self.spec = specclass(options, args)

        if options.setup:
            self.spec.setup()

        action = self.search
        if options.index:
            action = self.index
        if options.termfile:
            action = self.search_file
        if options.generate:
            action = self.generate_search_file
        action(lib)