# Copyright 2010 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
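"""Benchmarking harness for comparing full-text search libraries.

This module defines a ``Module`` adapter for each supported engine
(Whoosh, Xappy, Xapian, Solr, ZCatalog, and Nucular), a ``Spec`` base
class describing a document corpus, and a ``Bench`` driver that parses
command-line options and runs indexing or searching.

A minimal usage sketch (``MySpec`` is hypothetical; a concrete Spec
subclass must supply ``name``, ``documents()``, and whatever per-library
hooks the chosen adapter calls, such as ``whoosh_schema()``):

    class MySpec(Spec):
        name = "example"

        def whoosh_schema(self):
            ...

        def documents(self):
            ...

    if __name__ == "__main__":
        Bench().run(MySpec)
"""
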
# print_function makes the multi-argument print() calls below behave the
# same on Python 2 and Python 3
from __future__ import division, print_function

import os.path
from optparse import OptionParser
from shutil import rmtree

from whoosh import index, qparser, query, scoring
from whoosh.util import now, find_object
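
# Optional backends. Each import is wrapped in try/except so this module
# still loads when a library is not installed; selecting an uninstalled
# library on the command line will then fail with a NameError.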
try:
    import xappy
except ImportError:
    pass

try:
    import xapian
except ImportError:
    pass

try:
    import pysolr
except ImportError:
    pass

try:
    from persistent import Persistent

    class ZDoc(Persistent):
        def __init__(self, d):
            self.__dict__.update(d)
except ImportError:
    pass


class Module(object):
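    """Base class for library adapters.

    Subclasses translate the generic indexer/searcher operations onto a
    particular search library. Methods that raise NotImplementedError
    must be overridden; the others are optional hooks.
    """
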
    def __init__(self, bench, options, args):
        self.bench = bench
        self.options = options
        self.args = args

    def __repr__(self):
        return self.__class__.__name__

    def indexer(self, **kwargs):
        pass

    def index_document(self, d):
        raise NotImplementedError

    def finish(self, **kwargs):
        pass

    def _process_result(self, d):
        # Look up a per-library result processor on the spec the first
        # time this is called, then cache it on the instance so later
        # calls skip the hasattr check
        attrname = "process_result_%s" % self.options.lib
        if hasattr(self.bench.spec, attrname):
            method = getattr(self.bench.spec, attrname)
            self._process_result = method
            return method(d)
        else:
            self._process_result = lambda x: x
            return d

    def searcher(self):
        pass

    def query(self):
        raise NotImplementedError

    def find(self, q):
        raise NotImplementedError

    def findterms(self, terms):
        raise NotImplementedError

    def results(self, r):
        for hit in r:
            yield self._process_result(hit)


class Spec(object):
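    """Base class describing a document corpus to benchmark.

    Concrete specs must provide documents() and the hooks the chosen
    adapter needs (for example whoosh_schema() or zcatalog_setup()),
    and may provide show_snippet() for the --snippets option.
    """
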
    headline_field = "title"
    main_field = "body"

    def __init__(self, options, args):
        self.options = options
        self.args = args

    def documents(self):
        raise NotImplementedError

    def setup(self):
        pass

    def print_results(self, ls):
        showbody = self.options.showbody
        snippets = self.options.snippets
        # Option values arrive as strings from the command line
        limit = int(self.options.limit)
        for i, hit in enumerate(ls):
            if i >= limit:
                break

            print("%d. %s" % (i + 1, hit.get(self.headline_field)))
            if snippets:
                print(self.show_snippet(hit))
            if showbody:
                print(hit.get(self.main_field))


class WhooshModule(Module):
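    """Adapter for the Whoosh search library."""
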
    def indexer(self, create=True):
        schema = self.bench.spec.whoosh_schema()
        path = os.path.join(self.options.dir,
                            "%s_whoosh" % self.options.indexname)
        if not os.path.exists(path):
            os.mkdir(path)
        if create:
            ix = index.create_in(path, schema)
        else:
            ix = index.open_dir(path)

        poolclass = None
        if self.options.pool:
            poolclass = find_object(self.options.pool)

        self.writer = ix.writer(limitmb=int(self.options.limitmb),
                                poolclass=poolclass,
                                dir=self.options.tempdir,
                                procs=int(self.options.procs),
                                batchsize=int(self.options.batch),
                                multisegment=self.options.xms)
        self._procdoc = None
        if hasattr(self.bench.spec, "process_document_whoosh"):
            self._procdoc = self.bench.spec.process_document_whoosh

    def index_document(self, d):
        _procdoc = self._procdoc
        if _procdoc:
            _procdoc(d)
        self.writer.add_document(**d)

    def finish(self, merge=True, optimize=False):
        self.writer.commit(merge=merge, optimize=optimize)

    def searcher(self):
        path = os.path.join(self.options.dir,
                            "%s_whoosh" % self.options.indexname)
        ix = index.open_dir(path)
        self.srch = ix.searcher(weighting=scoring.PL2())
        self.parser = qparser.QueryParser(self.bench.spec.main_field,
                                          schema=ix.schema)

    def query(self):
        # On Python 3 the args are already text; only decode byte strings
        qstring = " ".join(self.args)
        if isinstance(qstring, bytes):
            qstring = qstring.decode("utf-8")
        return self.parser.parse(qstring)

    def find(self, q):
        return self.srch.search(q, limit=int(self.options.limit),
                                optimize=self.options.optimize)

    def findterms(self, terms):
        limit = int(self.options.limit)
        s = self.srch
        # Reuse a single Term query object, swapping in each term
        q = query.Term(self.bench.spec.main_field, None)
        for term in terms:
            q.text = term
            yield s.search(q, limit=limit)


class XappyModule(Module):
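    """Adapter for the Xappy wrapper around Xapian.

    Note that these methods take the connection returned by indexer()
    as an explicit argument, unlike the other adapters.
    """
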
    def indexer(self, **kwargs):
        path = os.path.join(self.options.dir,
                            "%s_xappy" % self.options.indexname)
        conn = self.bench.spec.xappy_connection(path)
        return conn

    def index_document(self, conn, d):
        if hasattr(self.bench, "process_document_xappy"):
            self.bench.process_document_xappy(d)
        doc = xappy.UnprocessedDocument()
        # d maps field names to a value or list of values
        for key, values in d.items():
            if not isinstance(values, list):
                values = [values]
            for value in values:
                doc.fields.append(xappy.Field(key, value))
        conn.add(doc)

    def finish(self, conn):
        conn.flush()

    def searcher(self):
        path = os.path.join(self.options.dir,
                            "%s_xappy" % self.options.indexname)
        return xappy.SearchConnection(path)

    def query(self, conn):
        return conn.query_parse(" ".join(self.args))

    def find(self, conn, q):
        return conn.search(q, 0, int(self.options.limit))

    def findterms(self, conn, terms):
        limit = int(self.options.limit)
        for term in terms:
            q = conn.query_field(self.bench.spec.main_field, term)
            yield conn.search(q, 0, limit)

    def results(self, r):
        hf = self.bench.spec.headline_field
        mf = self.bench.spec.main_field
        for hit in r:
            yield self._process_result({hf: hit.data[hf], mf: hit.data[mf]})


class XapianModule(Module):
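    """Adapter for the Xapian engine via its Python bindings."""
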
    def indexer(self, **kwargs):
        path = os.path.join(self.options.dir,
                            "%s_xapian" % self.options.indexname)
        self.database = xapian.WritableDatabase(path, xapian.DB_CREATE_OR_OPEN)
        self.ixer = xapian.TermGenerator()

    def index_document(self, d):
        if hasattr(self.bench, "process_document_xapian"):
            self.bench.process_document_xapian(d)
        doc = xapian.Document()
        doc.add_value(0, d.get(self.bench.spec.headline_field, "-"))
        doc.set_data(d[self.bench.spec.main_field])
        self.ixer.set_document(doc)
        self.ixer.index_text(d[self.bench.spec.main_field])
        self.database.add_document(doc)

    def finish(self, **kwargs):
        self.database.flush()

    def searcher(self):
        # Must open the "_xapian" directory written by indexer(), not the
        # Xappy module's "_xappy" directory
        path = os.path.join(self.options.dir,
                            "%s_xapian" % self.options.indexname)
        self.db = xapian.Database(path)
        self.enq = xapian.Enquire(self.db)
        self.qp = xapian.QueryParser()
        self.qp.set_database(self.db)

    def query(self):
        return self.qp.parse_query(" ".join(self.args))

    def find(self, q):
        self.enq.set_query(q)
        return self.enq.get_mset(0, int(self.options.limit))

    def findterms(self, terms):
        limit = int(self.options.limit)
        for term in terms:
            q = self.qp.parse_query(term)
            self.enq.set_query(q)
            yield self.enq.get_mset(0, limit)

    def results(self, matches):
        hf = self.bench.spec.headline_field
        mf = self.bench.spec.main_field
        for m in matches:
            yield self._process_result({hf: m.document.get_value(0),
                                        mf: m.document.get_data()})


class SolrModule(Module):
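    """Adapter for Apache Solr via the pysolr client, batching adds."""
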
    def indexer(self, **kwargs):
        self.solr_doclist = []
        self.conn = pysolr.Solr(self.options.url)
        # Delete-by-query: pysolr's delete() takes the query as the q
        # keyword (a bare positional argument would be treated as an id)
        self.conn.delete(q="*:*")
        self.conn.commit()

    def index_document(self, d):
        self.solr_doclist.append(d)
        if len(self.solr_doclist) >= int(self.options.batch):
            self.conn.add(self.solr_doclist, commit=False)
            self.solr_doclist = []

    def finish(self, **kwargs):
        if self.solr_doclist:
            self.conn.add(self.solr_doclist)
        del self.solr_doclist
        self.conn.optimize(block=True)

    def searcher(self):
        self.solr = pysolr.Solr(self.options.url)

    def query(self):
        return " ".join(self.args)

    def find(self, q):
        # pysolr forwards extra keyword arguments as Solr query
        # parameters; rows is Solr's result-count limit
        return self.solr.search(q, rows=int(self.options.limit))

    def findterms(self, terms):
        limit = int(self.options.limit)
        for term in terms:
            # Assumes the main field is named "body"
            yield self.solr.search("body:" + term, rows=limit)


class ZcatalogModule(Module):
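    """Adapter for zcatalog over ZODB.

    Requires the persistent package, so that the ZDoc wrapper class
    defined near the top of this module is available.
    """
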
    def indexer(self, **kwargs):
        from ZODB.FileStorage import FileStorage  # @UnresolvedImport
        from ZODB.DB import DB  # @UnresolvedImport
        from zcatalog import catalog  # @UnresolvedImport
        from zcatalog import indexes  # @UnresolvedImport
        import transaction  # @UnresolvedImport

        dir = os.path.join(self.options.dir,
                           "%s_zcatalog" % self.options.indexname)
        if os.path.exists(dir):
            rmtree(dir)
        os.mkdir(dir)

        storage = FileStorage(os.path.join(dir, "index"))
        db = DB(storage)
        conn = db.open()

        self.cat = catalog.Catalog()
        self.bench.spec.zcatalog_setup(self.cat)
        conn.root()["cat"] = self.cat
        transaction.commit()

        self.zcatalog_count = 0

    def index_document(self, d):
        if hasattr(self.bench, "process_document_zcatalog"):
            self.bench.process_document_zcatalog(d)
        doc = ZDoc(d)
        self.cat.index_doc(doc)
        self.zcatalog_count += 1
        # Commit in batches of 100 to keep transactions small
        if self.zcatalog_count >= 100:
            import transaction  # @UnresolvedImport
            transaction.commit()
            self.zcatalog_count = 0

    def finish(self, **kwargs):
        import transaction  # @UnresolvedImport
        transaction.commit()
        del self.zcatalog_count

    def searcher(self):
        from ZODB.FileStorage import FileStorage  # @UnresolvedImport
        from ZODB.DB import DB  # @UnresolvedImport
        # The zcatalog imports look unused but are kept so the catalog's
        # classes are importable when the stored catalog is loaded
        from zcatalog import catalog  # @UnresolvedImport
        from zcatalog import indexes  # @UnresolvedImport
        import transaction  # @UnresolvedImport

        path = os.path.join(self.options.dir,
                            "%s_zcatalog" % self.options.indexname, "index")
        storage = FileStorage(path)
        db = DB(storage)
        conn = db.open()
        self.cat = conn.root()["cat"]

    def query(self):
        return " ".join(self.args)

    def find(self, q):
        return self.cat.searchResults(body=q)

    def findterms(self, terms):
        for term in terms:
            yield self.cat.searchResults(body=term)

    def results(self, r):
        hf = self.bench.spec.headline_field
        mf = self.bench.spec.main_field
        for hit in r:
            # Have to access the attributes for them to be retrieved
            yield self._process_result({hf: getattr(hit, hf),
                                        mf: getattr(hit, mf)})


class NucularModule(Module):
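    """Adapter for the Nucular full-text indexing library."""
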
    def indexer(self, create=True):
        import shutil
        from nucular import Nucular

        dir = os.path.join(self.options.dir,
                           "%s_nucular" % self.options.indexname)
        if create:
            if os.path.exists(dir):
                shutil.rmtree(dir)
            os.mkdir(dir)
        self.archive = Nucular.Nucular(dir)
        if create:
            self.archive.create()
        self.count = 0

    def index_document(self, d):
        try:
            self.archive.indexDictionary(str(self.count), d)
        except ValueError:
            print("d=", d)
            raise
        self.count += 1
        # Periodically store pending entries and reopen the archive
        if not self.count % int(self.options.batch):
            self.archive.store(lazy=True)
            self.indexer(create=False)

    def finish(self, **kwargs):
        self.archive.store(lazy=False)
        self.archive.aggregateRecent(fast=False, verbose=True)
        self.archive.moveTransientToBase(verbose=True)
        self.archive.cleanUp()

    def searcher(self):
        from nucular import Nucular

        dir = os.path.join(self.options.dir,
                           "%s_nucular" % self.options.indexname)
        self.archive = Nucular.Nucular(dir)

    def query(self):
        return " ".join(self.args)

    def find(self, q):
        return self.archive.dictionaries(q)

    def findterms(self, terms):
        for term in terms:
            q = self.archive.Query()
            q.anyWord(term)
            yield q.resultDictionaries()


class Bench(object):
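    """Command-line driver: parses options, builds the chosen adapter,
    and dispatches to the index/search/search_file actions.
    """
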
    libs = {"whoosh": WhooshModule, "xappy": XappyModule,
            "xapian": XapianModule, "solr": SolrModule,
            "zcatalog": ZcatalogModule, "nucular": NucularModule}
    def index(self, lib):
        print("Indexing with %s..." % lib)

        options = self.options
        every = None if options.every is None else int(options.every)
        merge = options.merge
        chunk = int(options.chunk)
        skip = int(options.skip)
        upto = int(options.upto)
        count = 0
        skipc = skip

        starttime = chunkstarttime = now()

        lib.indexer()
        for d in self.spec.documents():
            skipc -= 1
            if not skipc:
                lib.index_document(d)
                count += 1
                skipc = skip

                if chunk and not count % chunk:
                    t = now()
                    sofar = t - starttime
                    print("Done %d docs, %0.3f secs for %d, %0.3f total, "
                          "%0.3f docs/s" % (count, t - chunkstarttime,
                                            chunk, sofar, count / sofar))
                    chunkstarttime = t

                if count > upto:
                    break

                if every and not count % every:
                    print("----Commit")
                    lib.finish(merge=merge)
                    lib.indexer(create=False)

        spooltime = now()
        print("Spool time:", spooltime - starttime)
        lib.finish(merge=merge)
        committime = now()
        print("Commit time:", committime - spooltime)
        totaltime = committime - starttime
        print("Total time to index %d documents: %0.3f secs (%0.3f minutes)"
              % (count, totaltime, totaltime / 60.0))
        print("Indexed %0.3f docs/s" % (count / totaltime))
    def search(self, lib):
        lib.searcher()

        t = now()
        q = lib.query()
        print("Query:", q)
        r = lib.find(q)
        print("Search time:", now() - t)

        t = now()
        self.spec.print_results(lib.results(r))
        print("Print time:", now() - t)
    def search_file(self, lib):
        # Read the terms as text so they match the str values the
        # adapters expect
        with open(self.options.termfile, "r") as f:
            terms = [line.strip() for line in f]

        print("Searching %d terms with %s" % (len(terms), lib))
        lib.searcher()
        starttime = now()
        for r in lib.findterms(terms):
            pass
        searchtime = now() - starttime
        print("Search time:", searchtime,
              "searches/s:", float(len(terms)) / searchtime)
    def _parser(self, name):
        p = OptionParser()
        p.add_option("-x", "--lib", dest="lib",
                     help="Name of the library to use to index/search.",
                     default="whoosh")
        p.add_option("-d", "--dir", dest="dir", metavar="DIRNAME",
                     help="Directory in which to store index.", default=".")
        p.add_option("-s", "--setup", dest="setup", action="store_true",
                     help="Set up any support files or caches.",
                     default=False)
        p.add_option("-i", "--index", dest="index", action="store_true",
                     help="Index the documents.", default=False)
        p.add_option("-n", "--name", dest="indexname", metavar="PREFIX",
                     help="Index name prefix.", default="%s_index" % name)
        p.add_option("-U", "--url", dest="url", metavar="URL",
                     help="Solr URL", default="http://localhost:8983/solr")
        p.add_option("-m", "--mb", dest="limitmb",
                     help="Max. memory usage, in MB", default="128")
        p.add_option("-c", "--chunk", dest="chunk",
                     help="Number of documents to index between progress "
                          "messages.", default=1000)
        p.add_option("-B", "--batch", dest="batch",
                     help="Batch size for batch adding documents.",
                     default=1000)
        p.add_option("-k", "--skip", dest="skip", metavar="N",
                     help="Index every Nth document.", default=1)
        p.add_option("-e", "--commit-every", dest="every", metavar="NUM",
                     help="Commit every NUM documents", default=None)
        p.add_option("-M", "--no-merge", dest="merge", action="store_false",
                     help="Don't merge segments when doing multiple commits",
                     default=True)
        p.add_option("-u", "--upto", dest="upto", metavar="N",
                     help="Index up to this document number.", default=600000)
        p.add_option("-p", "--procs", dest="procs", metavar="NUMBER",
                     help="Number of processors to use.", default=0)
        p.add_option("-l", "--limit", dest="limit", metavar="N",
                     help="Maximum number of search results to retrieve.",
                     default=10)
        p.add_option("-b", "--body", dest="showbody", action="store_true",
                     help="Show the body text in search results.",
                     default=False)
        p.add_option("-g", "--gen", dest="generate", metavar="N",
                     help="Generate a list of at most N terms present in "
                          "all libraries.", default=None)
        p.add_option("-f", "--file", dest="termfile", metavar="FILENAME",
                     help="Search using the list of terms in this file.",
                     default=None)
        p.add_option("-t", "--tempdir", dest="tempdir", metavar="DIRNAME",
                     help="Whoosh temp dir", default=None)
        p.add_option("-P", "--pool", dest="pool", metavar="CLASSNAME",
                     help="Whoosh pool class", default=None)
        p.add_option("-X", "--xms", dest="xms", action="store_true",
                     help="Experimental Whoosh feature", default=False)
        p.add_option("-Z", "--storebody", dest="storebody",
                     action="store_true",
                     help="Store the body text in index", default=False)
        p.add_option("-q", "--snippets", dest="snippets",
                     action="store_true",
                     help="Show highlighted snippets", default=False)
        p.add_option("-O", "--no-optimize", dest="optimize",
                     action="store_false",
                     help="Turn off searcher optimization", default=True)

        return p
    def run(self, specclass):
        parser = self._parser(specclass.name)
        options, args = parser.parse_args()
        self.options = options
        self.args = args

        if options.lib not in self.libs:
            raise Exception("Unknown library: %r" % options.lib)
        lib = self.libs[options.lib](self, options, args)

        self.spec = specclass(options, args)

        if options.setup:
            self.spec.setup()

        action = self.search
        if options.index:
            action = self.index
        if options.termfile:
            action = self.search_file
        if options.generate:
            action = self.generate_search_file
        action(lib)