bench.py 21 KB

  1. # Copyright 2010 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. from __future__ import division
  28. import os.path
  29. from optparse import OptionParser
  30. from shutil import rmtree
  31. from whoosh import index, qparser, query, scoring
  32. from whoosh.util import now, find_object
# Optional third-party backends: each library is imported if available;
# if an import fails the name is simply left undefined, and selecting the
# corresponding --lib on the command line will fail at runtime instead.
try:
    import xappy
except ImportError:
    pass
try:
    import xapian
except ImportError:
    pass
try:
    import pysolr
except ImportError:
    pass
try:
    # ZODB persistence support, used by ZcatalogModule.
    from persistent import Persistent

    class ZDoc(Persistent):
        # Thin persistent wrapper: copies a plain dict's keys into instance
        # attributes so zcatalog can index them by attribute access.
        def __init__(self, d):
            self.__dict__.update(d)
except ImportError:
    pass
  52. class Module(object):
  53. def __init__(self, bench, options, args):
  54. self.bench = bench
  55. self.options = options
  56. self.args = args
  57. def __repr__(self):
  58. return self.__class__.__name__
  59. def indexer(self, **kwargs):
  60. pass
  61. def index_document(self, d):
  62. raise NotImplementedError
  63. def finish(self, **kwargs):
  64. pass
  65. def _process_result(self, d):
  66. attrname = "process_result_%s" % self.options.lib
  67. if hasattr(self.bench.spec, attrname):
  68. method = getattr(self.bench.spec, attrname)
  69. self._process_result = method
  70. return method(d)
  71. else:
  72. self._process_result = lambda x: x
  73. return d
  74. def searcher(self):
  75. pass
  76. def query(self):
  77. raise NotImplementedError
  78. def find(self, q):
  79. raise NotImplementedError
  80. def findterms(self, terms):
  81. raise NotImplementedError
  82. def results(self, r):
  83. for hit in r:
  84. yield self._process_result(hit)
  85. class Spec(object):
  86. headline_field = "title"
  87. main_field = "body"
  88. def __init__(self, options, args):
  89. self.options = options
  90. self.args = args
  91. def documents(self):
  92. raise NotImplementedError
  93. def setup(self):
  94. pass
  95. def print_results(self, ls):
  96. showbody = self.options.showbody
  97. snippets = self.options.snippets
  98. limit = self.options.limit
  99. for i, hit in enumerate(ls):
  100. if i >= limit:
  101. break
  102. print("%d. %s" % (i + 1, hit.get(self.headline_field)))
  103. if snippets:
  104. print(self.show_snippet(hit))
  105. if showbody:
  106. print(hit.get(self.main_field))
  107. class WhooshModule(Module):
  108. def indexer(self, create=True):
  109. schema = self.bench.spec.whoosh_schema()
  110. path = os.path.join(self.options.dir, "%s_whoosh"
  111. % self.options.indexname)
  112. if not os.path.exists(path):
  113. os.mkdir(path)
  114. if create:
  115. ix = index.create_in(path, schema)
  116. else:
  117. ix = index.open_dir(path)
  118. poolclass = None
  119. if self.options.pool:
  120. poolclass = find_object(self.options.pool)
  121. self.writer = ix.writer(limitmb=int(self.options.limitmb),
  122. poolclass=poolclass,
  123. dir=self.options.tempdir,
  124. procs=int(self.options.procs),
  125. batchsize=int(self.options.batch),
  126. multisegment=self.options.xms)
  127. self._procdoc = None
  128. if hasattr(self.bench.spec, "process_document_whoosh"):
  129. self._procdoc = self.bench.spec.process_document_whoosh
  130. def index_document(self, d):
  131. _procdoc = self._procdoc
  132. if _procdoc:
  133. _procdoc(d)
  134. self.writer.add_document(**d)
  135. def finish(self, merge=True, optimize=False):
  136. self.writer.commit(merge=merge, optimize=optimize)
  137. def searcher(self):
  138. path = os.path.join(self.options.dir, "%s_whoosh"
  139. % self.options.indexname)
  140. ix = index.open_dir(path)
  141. self.srch = ix.searcher(weighting=scoring.PL2())
  142. self.parser = qparser.QueryParser(self.bench.spec.main_field,
  143. schema=ix.schema)
  144. def query(self):
  145. qstring = " ".join(self.args).decode("utf-8")
  146. return self.parser.parse(qstring)
  147. def find(self, q):
  148. return self.srch.search(q, limit=int(self.options.limit),
  149. optimize=self.options.optimize)
  150. def findterms(self, terms):
  151. limit = int(self.options.limit)
  152. s = self.srch
  153. q = query.Term(self.bench.spec.main_field, None)
  154. for term in terms:
  155. q.text = term
  156. yield s.search(q, limit=limit)
  157. class XappyModule(Module):
  158. def indexer(self, **kwargs):
  159. path = os.path.join(self.options.dir, "%s_xappy"
  160. % self.options.indexname)
  161. conn = self.bench.spec.xappy_connection(path)
  162. return conn
  163. def index_document(self, conn, d):
  164. if hasattr(self.bench, "process_document_xappy"):
  165. self.bench.process_document_xappy(d)
  166. doc = xappy.UnprocessedDocument()
  167. for key, values in d:
  168. if not isinstance(values, list):
  169. values = [values]
  170. for value in values:
  171. doc.fields.append(xappy.Field(key, value))
  172. conn.add(doc)
  173. def finish(self, conn):
  174. conn.flush()
  175. def searcher(self):
  176. path = os.path.join(self.options.dir, "%s_xappy"
  177. % self.options.indexname)
  178. return xappy.SearchConnection(path)
  179. def query(self, conn):
  180. return conn.query_parse(" ".join(self.args))
  181. def find(self, conn, q):
  182. return conn.search(q, 0, int(self.options.limit))
  183. def findterms(self, conn, terms):
  184. limit = int(self.options.limit)
  185. for term in terms:
  186. q = conn.query_field(self.bench.spec.main_field, term)
  187. yield conn.search(q, 0, limit)
  188. def results(self, r):
  189. hf = self.bench.spec.headline_field
  190. mf = self.bench.spec.main_field
  191. for hit in r:
  192. yield self._process_result({hf: hit.data[hf], mf: hit.data[mf]})
  193. class XapianModule(Module):
  194. def indexer(self, **kwargs):
  195. path = os.path.join(self.options.dir, "%s_xapian"
  196. % self.options.indexname)
  197. self.database = xapian.WritableDatabase(path, xapian.DB_CREATE_OR_OPEN)
  198. self.ixer = xapian.TermGenerator()
  199. def index_document(self, d):
  200. if hasattr(self.bench, "process_document_xapian"):
  201. self.bench.process_document_xapian(d)
  202. doc = xapian.Document()
  203. doc.add_value(0, d.get(self.bench.spec.headline_field, "-"))
  204. doc.set_data(d[self.bench.spec.main_field])
  205. self.ixer.set_document(doc)
  206. self.ixer.index_text(d[self.bench.spec.main_field])
  207. self.database.add_document(doc)
  208. def finish(self, **kwargs):
  209. self.database.flush()
  210. def searcher(self):
  211. path = os.path.join(self.options.dir, "%s_xappy"
  212. % self.options.indexname)
  213. self.db = xapian.Database(path)
  214. self.enq = xapian.Enquire(self.db)
  215. self.qp = xapian.QueryParser()
  216. self.qp.set_database(self.db)
  217. def query(self):
  218. return self.qp.parse_query(" ".join(self.args))
  219. def find(self, q):
  220. self.enq.set_query(q)
  221. return self.enq.get_mset(0, int(self.options.limit))
  222. def findterms(self, terms):
  223. limit = int(self.options.limit)
  224. for term in terms:
  225. q = self.qp.parse_query(term)
  226. self.enq.set_query(q)
  227. yield self.enq.get_mset(0, limit)
  228. def results(self, matches):
  229. hf = self.bench.spec.headline_field
  230. mf = self.bench.spec.main_field
  231. for m in matches:
  232. yield self._process_result({hf: m.document.get_value(0),
  233. mf: m.document.get_data()})
  234. class SolrModule(Module):
  235. def indexer(self, **kwargs):
  236. self.solr_doclist = []
  237. self.conn = pysolr.Solr(self.options.url)
  238. self.conn.delete("*:*")
  239. self.conn.commit()
  240. def index_document(self, d):
  241. self.solr_doclist.append(d)
  242. if len(self.solr_doclist) >= int(self.options.batch):
  243. self.conn.add(self.solr_doclist, commit=False)
  244. self.solr_doclist = []
  245. def finish(self, **kwargs):
  246. if self.solr_doclist:
  247. self.conn.add(self.solr_doclist)
  248. del self.solr_doclist
  249. self.conn.optimize(block=True)
  250. def searcher(self):
  251. self.solr = pysolr.Solr(self.options.url)
  252. def query(self):
  253. return " ".join(self.args)
  254. def find(self, q):
  255. return self.solr.search(q, limit=int(self.options.limit))
  256. def findterms(self, terms):
  257. limit = int(self.options.limit)
  258. for term in terms:
  259. yield self.solr.search("body:" + term, limit=limit)
  260. class ZcatalogModule(Module):
  261. def indexer(self, **kwargs):
  262. from ZODB.FileStorage import FileStorage # @UnresolvedImport
  263. from ZODB.DB import DB # @UnresolvedImport
  264. from zcatalog import catalog # @UnresolvedImport
  265. from zcatalog import indexes # @UnresolvedImport
  266. import transaction # @UnresolvedImport
  267. dir = os.path.join(self.options.dir, "%s_zcatalog"
  268. % self.options.indexname)
  269. if os.path.exists(dir):
  270. rmtree(dir)
  271. os.mkdir(dir)
  272. storage = FileStorage(os.path.join(dir, "index"))
  273. db = DB(storage)
  274. conn = db.open()
  275. self.cat = catalog.Catalog()
  276. self.bench.spec.zcatalog_setup(self.cat)
  277. conn.root()["cat"] = self.cat
  278. transaction.commit()
  279. self.zcatalog_count = 0
  280. def index_document(self, d):
  281. if hasattr(self.bench, "process_document_zcatalog"):
  282. self.bench.process_document_zcatalog(d)
  283. doc = ZDoc(d)
  284. self.cat.index_doc(doc)
  285. self.zcatalog_count += 1
  286. if self.zcatalog_count >= 100:
  287. import transaction # @UnresolvedImport
  288. transaction.commit()
  289. self.zcatalog_count = 0
  290. def finish(self, **kwargs):
  291. import transaction # @UnresolvedImport
  292. transaction.commit()
  293. del self.zcatalog_count
  294. def searcher(self):
  295. from ZODB.FileStorage import FileStorage # @UnresolvedImport
  296. from ZODB.DB import DB # @UnresolvedImport
  297. from zcatalog import catalog # @UnresolvedImport
  298. from zcatalog import indexes # @UnresolvedImport
  299. import transaction # @UnresolvedImport
  300. path = os.path.join(self.options.dir, "%s_zcatalog"
  301. % self.options.indexname, "index")
  302. storage = FileStorage(path)
  303. db = DB(storage)
  304. conn = db.open()
  305. self.cat = conn.root()["cat"]
  306. def query(self):
  307. return " ".join(self.args)
  308. def find(self, q):
  309. return self.cat.searchResults(body=q)
  310. def findterms(self, terms):
  311. for term in terms:
  312. yield self.cat.searchResults(body=term)
  313. def results(self, r):
  314. hf = self.bench.spec.headline_field
  315. mf = self.bench.spec.main_field
  316. for hit in r:
  317. # Have to access the attributes for them to be retrieved
  318. yield self._process_result({hf: getattr(hit, hf),
  319. mf: getattr(hit, mf)})
  320. class NucularModule(Module):
  321. def indexer(self, create=True):
  322. import shutil
  323. from nucular import Nucular
  324. dir = os.path.join(self.options.dir, "%s_nucular"
  325. % self.options.indexname)
  326. if create:
  327. if os.path.exists(dir):
  328. shutil.rmtree(dir)
  329. os.mkdir(dir)
  330. self.archive = Nucular.Nucular(dir)
  331. if create:
  332. self.archive.create()
  333. self.count = 0
  334. def index_document(self, d):
  335. try:
  336. self.archive.indexDictionary(str(self.count), d)
  337. except ValueError:
  338. print("d=", d)
  339. raise
  340. self.count += 1
  341. if not self.count % int(self.options.batch):
  342. t = now()
  343. self.archive.store(lazy=True)
  344. self.indexer(create=False)
  345. def finish(self, **kwargs):
  346. self.archive.store(lazy=False)
  347. self.archive.aggregateRecent(fast=False, verbose=True)
  348. self.archive.moveTransientToBase(verbose=True)
  349. self.archive.cleanUp()
  350. def searcher(self):
  351. from nucular import Nucular
  352. dir = os.path.join(self.options.dir, "%s_nucular"
  353. % self.options.indexname)
  354. self.archive = Nucular.Nucular(dir)
  355. def query(self):
  356. return " ".join(self.args)
  357. def find(self, q):
  358. return self.archive.dictionaries(q)
  359. def findterms(self, terms):
  360. for term in terms:
  361. q = self.archive.Query()
  362. q.anyWord(term)
  363. yield q.resultDictionaries()
class Bench(object):
    """Command-line driver: parses options, instantiates the selected
    library adapter and benchmark spec, and runs the requested action.
    """

    # Registry mapping the --lib option value to its adapter class.
    libs = {"whoosh": WhooshModule, "xappy": XappyModule,
            "xapian": XapianModule, "solr": SolrModule,
            "zcatalog": ZcatalogModule, "nucular": NucularModule}

    def index(self, lib):
        """Index the spec's documents with the given adapter, printing
        progress and timing statistics along the way.
        """
        print("Indexing with %s..." % lib)

        options = self.options
        # Commit every N documents (None = only commit once at the end).
        every = None if options.every is None else int(options.every)
        merge = options.merge
        chunk = int(options.chunk)  # progress-message interval
        skip = int(options.skip)    # index every Nth document
        upto = int(options.upto)    # stop once count exceeds this
        count = 0
        skipc = skip

        starttime = chunkstarttime = now()

        lib.indexer()

        for d in self.spec.documents():
            # Countdown implements "index every Nth document".
            skipc -= 1
            if not skipc:
                lib.index_document(d)
                count += 1
                skipc = skip

                if chunk and not count % chunk:
                    t = now()
                    sofar = t - starttime
                    print("Done %d docs, %0.3f secs for %d, %0.3f total, %0.3f docs/s" % (count, t - chunkstarttime, chunk, sofar, count / sofar))
                    chunkstarttime = t

                if count > upto:
                    break

                # Optional intermediate commit + reopen of the writer.
                if every and not count % every:
                    print("----Commit")
                    lib.finish(merge=merge)
                    lib.indexer(create=False)

        spooltime = now()
        print("Spool time:", spooltime - starttime)
        lib.finish(merge=merge)
        committime = now()
        print("Commit time:", committime - spooltime)
        totaltime = committime - starttime
        print("Total time to index %d documents: %0.3f secs (%0.3f minutes)" % (count, totaltime, totaltime / 60.0))
        print("Indexed %0.3f docs/s" % (count / totaltime))

    def search(self, lib):
        """Run the single query given on the command line and print the
        timed results.
        """
        lib.searcher()

        t = now()
        q = lib.query()
        print("Query:", q)
        r = lib.find(q)
        print("Search time:", now() - t)

        t = now()
        self.spec.print_results(lib.results(r))
        print("Print time:", now() - t)

    def search_file(self, lib):
        """Time searching for every term listed in --file (one per line)."""
        f = open(self.options.termfile, "rb")
        terms = [line.strip() for line in f]
        f.close()

        print("Searching %d terms with %s" % (len(terms), lib))
        lib.searcher()
        starttime = now()
        # Results are discarded; only the search loop itself is timed.
        for r in lib.findterms(terms):
            pass
        searchtime = now() - starttime
        print("Search time:", searchtime, "searches/s:", float(len(terms)) / searchtime)

    def _parser(self, name):
        """Build the OptionParser shared by all benchmark specs; *name* is
        the spec's name, used to derive the default index-name prefix.
        """
        p = OptionParser()
        p.add_option("-x", "--lib", dest="lib",
                     help="Name of the library to use to index/search.",
                     default="whoosh")
        p.add_option("-d", "--dir", dest="dir", metavar="DIRNAME",
                     help="Directory in which to store index.", default=".")
        p.add_option("-s", "--setup", dest="setup", action="store_true",
                     help="Set up any support files or caches.", default=False)
        p.add_option("-i", "--index", dest="index", action="store_true",
                     help="Index the documents.", default=False)
        p.add_option("-n", "--name", dest="indexname", metavar="PREFIX",
                     help="Index name prefix.", default="%s_index" % name)
        p.add_option("-U", "--url", dest="url", metavar="URL",
                     help="Solr URL", default="http://localhost:8983/solr")
        p.add_option("-m", "--mb", dest="limitmb",
                     help="Max. memory usage, in MB", default="128")
        p.add_option("-c", "--chunk", dest="chunk",
                     help="Number of documents to index between progress messages.",
                     default=1000)
        p.add_option("-B", "--batch", dest="batch",
                     help="Batch size for batch adding documents.",
                     default=1000)
        p.add_option("-k", "--skip", dest="skip", metavar="N",
                     help="Index every Nth document.", default=1)
        p.add_option("-e", "--commit-every", dest="every", metavar="NUM",
                     help="Commit every NUM documents", default=None)
        p.add_option("-M", "--no-merge", dest="merge", action="store_false",
                     help="Don't merge segments when doing multiple commits",
                     default=True)
        p.add_option("-u", "--upto", dest="upto", metavar="N",
                     help="Index up to this document number.", default=600000)
        p.add_option("-p", "--procs", dest="procs", metavar="NUMBER",
                     help="Number of processors to use.", default=0)
        p.add_option("-l", "--limit", dest="limit", metavar="N",
                     help="Maximum number of search results to retrieve.",
                     default=10)
        p.add_option("-b", "--body", dest="showbody", action="store_true",
                     help="Show the body text in search results.",
                     default=False)
        p.add_option("-g", "--gen", dest="generate", metavar="N",
                     help="Generate a list at most N terms present in all libraries.",
                     default=None)
        p.add_option("-f", "--file", dest="termfile", metavar="FILENAME",
                     help="Search using the list of terms in this file.",
                     default=None)
        p.add_option("-t", "--tempdir", dest="tempdir", metavar="DIRNAME",
                     help="Whoosh temp dir", default=None)
        p.add_option("-P", "--pool", dest="pool", metavar="CLASSNAME",
                     help="Whoosh pool class", default=None)
        p.add_option("-X", "--xms", dest="xms", action="store_true",
                     help="Experimental Whoosh feature", default=False)
        p.add_option("-Z", "--storebody", dest="storebody", action="store_true",
                     help="Store the body text in index", default=False)
        p.add_option("-q", "--snippets", dest="snippets", action="store_true",
                     help="Show highlighted snippets", default=False)
        p.add_option("-O", "--no-optimize", dest="optimize", action="store_false",
                     help="Turn off searcher optimization", default=True)
        return p

    def run(self, specclass):
        """Parse the command line and dispatch to the selected action."""
        parser = self._parser(specclass.name)
        options, args = parser.parse_args()
        self.options = options
        self.args = args

        if options.lib not in self.libs:
            raise Exception("Unknown library: %r" % options.lib)
        lib = self.libs[options.lib](self, options, args)

        self.spec = specclass(options, args)

        if options.setup:
            self.spec.setup()

        # Default action is a single search; the flags below override it.
        action = self.search
        if options.index:
            action = self.index
        if options.termfile:
            action = self.search_file
        if options.generate:
            # NOTE(review): generate_search_file is referenced here but not
            # defined in the visible portion of this file — confirm it exists
            # elsewhere before selecting --gen.
            action = self.generate_search_file
        action(lib)