terms.py

# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from __future__ import division

import copy
import fnmatch
import re
from collections import defaultdict

from whoosh import matching
from whoosh.analysis import Token
from whoosh.compat import bytes_type, text_type, u
from whoosh.lang.morph_en import variations
from whoosh.query import qcore


class Term(qcore.Query):
    """Matches documents containing the given term (fieldname+text pair).

    >>> Term("content", u"render")
    """

    __inittypes__ = dict(fieldname=str, text=text_type, boost=float)

    def __init__(self, fieldname, text, boost=1.0, minquality=None):
        self.fieldname = fieldname
        self.text = text
        self.boost = boost
        self.minquality = minquality

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.fieldname == other.fieldname
                and self.text == other.text
                and self.boost == other.boost)

    def __repr__(self):
        r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text)
        if self.boost != 1.0:
            r += ", boost=%s" % self.boost
        r += ")"
        return r

    def __unicode__(self):
        text = self.text
        if isinstance(text, bytes_type):
            try:
                text = text.decode("ascii")
            except UnicodeDecodeError:
                text = repr(text)

        t = u("%s:%s") % (self.fieldname, text)
        if self.boost != 1:
            t += u("^") + text_type(self.boost)
        return t

    __str__ = __unicode__

    def __hash__(self):
        return hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost)

    def has_terms(self):
        return True

    def tokens(self, boost=1.0):
        yield Token(fieldname=self.fieldname, text=self.text,
                    boost=boost * self.boost, startchar=self.startchar,
                    endchar=self.endchar, chars=True)

    def terms(self, phrases=False):
        if self.field():
            yield (self.field(), self.text)

    def replace(self, fieldname, oldtext, newtext):
        q = copy.copy(self)
        if q.fieldname == fieldname and q.text == oldtext:
            q.text = newtext
        return q

    def estimate_size(self, ixreader):
        fieldname = self.fieldname
        if fieldname not in ixreader.schema:
            return 0

        field = ixreader.schema[fieldname]
        try:
            text = field.to_bytes(self.text)
        except ValueError:
            return 0

        return ixreader.doc_frequency(fieldname, text)

    def matcher(self, searcher, context=None):
        fieldname = self.fieldname
        text = self.text
        if fieldname not in searcher.schema:
            return matching.NullMatcher()

        field = searcher.schema[fieldname]
        try:
            text = field.to_bytes(text)
        except ValueError:
            return matching.NullMatcher()

        if (self.fieldname, text) in searcher.reader():
            if context is None:
                w = searcher.weighting
            else:
                w = context.weighting

            m = searcher.postings(self.fieldname, text, weighting=w)
            if self.minquality:
                m.set_min_quality(self.minquality)
            if self.boost != 1.0:
                m = matching.WrappingMatcher(m, boost=self.boost)
            return m
        else:
            return matching.NullMatcher()
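
# A minimal usage sketch for Term. The index directory "indexdir" and the
# "content" field are hypothetical, not part of this module:
#
#     from whoosh.index import open_dir
#     from whoosh.query import Term
#
#     ix = open_dir("indexdir")
#     with ix.searcher() as s:
#         for hit in s.search(Term("content", u"render")):
#             print(hit)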


class MultiTerm(qcore.Query):
    """Abstract base class for queries that operate on multiple terms in the
    same field.
    """

    constantscore = False

    def _btexts(self, ixreader):
        raise NotImplementedError(self.__class__.__name__)

    def expanded_terms(self, ixreader, phrases=False):
        fieldname = self.field()
        if fieldname:
            for btext in self._btexts(ixreader):
                yield (fieldname, btext)

    def tokens(self, boost=1.0, exreader=None):
        fieldname = self.field()
        if exreader is None:
            btexts = [self.text]
        else:
            btexts = self._btexts(exreader)

        for btext in btexts:
            yield Token(fieldname=fieldname, text=btext,
                        boost=boost * self.boost, startchar=self.startchar,
                        endchar=self.endchar, chars=True)

    def simplify(self, ixreader):
        fieldname = self.field()
        if fieldname not in ixreader.schema:
            return qcore.NullQuery()
        field = ixreader.schema[fieldname]

        existing = []
        for btext in sorted(set(self._btexts(ixreader))):
            text = field.from_bytes(btext)
            existing.append(Term(fieldname, text, boost=self.boost))

        if len(existing) == 1:
            return existing[0]
        elif existing:
            from whoosh.query import Or
            return Or(existing)
        else:
            return qcore.NullQuery

    def estimate_size(self, ixreader):
        fieldname = self.field()
        return sum(ixreader.doc_frequency(fieldname, btext)
                   for btext in self._btexts(ixreader))

    def estimate_min_size(self, ixreader):
        fieldname = self.field()
        return min(ixreader.doc_frequency(fieldname, text)
                   for text in self._btexts(ixreader))

    def matcher(self, searcher, context=None):
        from whoosh.query import Or

        fieldname = self.field()
        constantscore = self.constantscore

        reader = searcher.reader()
        qs = [Term(fieldname, word) for word in self._btexts(reader)
              if word]
        if not qs:
            return matching.NullMatcher()

        if len(qs) == 1:
            # If there's only one term, just use it
            m = qs[0].matcher(searcher, context)
        else:
            if constantscore:
                # To tell the sub-query that score doesn't matter, set
                # weighting to None
                if context:
                    context = context.set(weighting=None)
                else:
                    from whoosh.searching import SearchContext
                    context = SearchContext(weighting=None)
            # Or the terms together
            m = Or(qs, boost=self.boost).matcher(searcher, context)
        return m
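
# A minimal sketch of a custom subclass (illustrative only; ``TermsIn`` is
# not part of Whoosh). Everything above is driven by _btexts(), which must
# yield the indexed byte strings of the terms this query expands to:
#
#     class TermsIn(MultiTerm):
#         def __init__(self, fieldname, words, boost=1.0):
#             self.fieldname = fieldname
#             self.words = words
#             self.boost = boost
#
#         def _btexts(self, ixreader):
#             to_bytes = ixreader.schema[self.fieldname].to_bytes
#             for word in self.words:
#                 try:
#                     btext = to_bytes(word)
#                 except ValueError:
#                     continue
#                 if (self.fieldname, btext) in ixreader:
#                     yield btext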


class PatternQuery(MultiTerm):
    """An intermediate base class for common methods of Prefix and Wildcard.
    """

    __inittypes__ = dict(fieldname=str, text=text_type, boost=float)

    def __init__(self, fieldname, text, boost=1.0, constantscore=True):
        self.fieldname = fieldname
        self.text = text
        self.boost = boost
        self.constantscore = constantscore

    def __eq__(self, other):
        return (other and self.__class__ is other.__class__
                and self.fieldname == other.fieldname
                and self.text == other.text and self.boost == other.boost
                and self.constantscore == other.constantscore)

    def __repr__(self):
        r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text)
        if self.boost != 1:
            r += ", boost=%s" % self.boost
        r += ")"
        return r

    def __hash__(self):
        return (hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost)
                ^ hash(self.constantscore))

    def _get_pattern(self):
        raise NotImplementedError

    def _find_prefix(self, text):
        # Subclasses/instances should set the SPECIAL_CHARS attribute to a set
        # of characters that mark the end of the literal prefix
        specialchars = self.SPECIAL_CHARS
        i = 0
        for i, char in enumerate(text):
            if char in specialchars:
                break
        return text[:i]

    def _btexts(self, ixreader):
        field = ixreader.schema[self.fieldname]
        exp = re.compile(self._get_pattern())
        prefix = self._find_prefix(self.text)
        if prefix:
            candidates = ixreader.expand_prefix(self.fieldname, prefix)
        else:
            candidates = ixreader.lexicon(self.fieldname)

        from_bytes = field.from_bytes
        for btext in candidates:
            text = from_bytes(btext)
            if exp.match(text):
                yield btext
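
# How the pieces above fit together: _btexts() compiles _get_pattern() and
# tests candidate terms against it, using _find_prefix() only to narrow the
# set of candidates read from the reader. For example, for the Wildcard
# subclass below (value follows from the code above):
#
#     Wildcard("content", u"in*f?x")._find_prefix(u"in*f?x")  # -> u"in"
#
# so only terms under the "in" prefix are fetched via expand_prefix(); with
# no literal prefix the whole lexicon for the field is scanned. Note that for
# a text containing no special characters at all, the loop never breaks and
# text[:i] drops the last character - the prefix is shorter than it could be,
# which is harmless here since it is only used to narrow the candidate set.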


class Prefix(PatternQuery):
    """Matches documents that contain any terms that start with the given
    text.

    >>> # Match documents containing words starting with 'comp'
    >>> Prefix("content", u"comp")
    """

    def __unicode__(self):
        return "%s:%s*" % (self.fieldname, self.text)

    __str__ = __unicode__

    def _btexts(self, ixreader):
        return ixreader.expand_prefix(self.fieldname, self.text)

    def matcher(self, searcher, context=None):
        if self.text == "":
            from whoosh.query import Every
            eq = Every(self.fieldname, boost=self.boost)
            return eq.matcher(searcher, context)
        else:
            return PatternQuery.matcher(self, searcher, context)
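
# Note the edge case above: Prefix("content", u"") would expand to every term
# in the field, so matcher() short-circuits to an Every query rather than
# enumerating the whole lexicon term by term.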


class Wildcard(PatternQuery):
    """Matches documents that contain any terms that match a "glob" pattern.
    See the Python ``fnmatch`` module for information about globs.

    >>> Wildcard("content", u"in*f?x")
    """

    SPECIAL_CHARS = frozenset("*?[")

    def __unicode__(self):
        return "%s:%s" % (self.fieldname, self.text)

    __str__ = __unicode__

    def _get_pattern(self):
        return fnmatch.translate(self.text)

    def normalize(self):
        # If there are no wildcard characters in this "wildcard", turn it into
        # a simple Term
        text = self.text
        if text == "*":
            from whoosh.query import Every
            return Every(self.fieldname, boost=self.boost)
        if "*" not in text and "?" not in text:
            # If no wildcard chars, convert to a normal term.
            return Term(self.fieldname, self.text, boost=self.boost)
        elif ("?" not in text and text.endswith("*")
              and text.find("*") == len(text) - 1):
            # If the only wildcard char is an asterisk at the end, convert to
            # a Prefix query.
            return Prefix(self.fieldname, self.text[:-1], boost=self.boost)
        else:
            return self

    def matcher(self, searcher, context=None):
        if self.text == "*":
            from whoosh.query import Every
            eq = Every(self.fieldname, boost=self.boost)
            return eq.matcher(searcher, context)
        else:
            return PatternQuery.matcher(self, searcher, context)

    # _btexts() implemented in PatternQuery
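
# What normalize() does with degenerate patterns (follows directly from the
# code above):
#
#     Wildcard("content", u"render").normalize()   # -> Term("content", u"render")
#     Wildcard("content", u"render*").normalize()  # -> Prefix("content", u"render")
#     Wildcard("content", u"*").normalize()        # -> Every("content")
#     Wildcard("content", u"in*f?x").normalize()   # stays a Wildcard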


class Regex(PatternQuery):
    """Matches documents that contain any terms that match a regular
    expression. See the Python ``re`` module for information about regular
    expressions.
    """

    SPECIAL_CHARS = frozenset("{}()[].?*+^$\\")

    def __unicode__(self):
        return '%s:r"%s"' % (self.fieldname, self.text)

    __str__ = __unicode__

    def _get_pattern(self):
        return self.text

    def _find_prefix(self, text):
        if "|" in text:
            return ""
        if text.startswith("^"):
            text = text[1:]
        elif text.startswith("\\A"):
            text = text[2:]

        prefix = PatternQuery._find_prefix(self, text)

        lp = len(prefix)
        if lp < len(text) and text[lp] in "*?":
            # We stripped something starting from * or ? - they both MAY mean
            # "0 times". As we had stripped starting from the FIRST special
            # char, that implies there were only ordinary chars left of it.
            # Thus, the very last of them is not part of the real prefix:
            prefix = prefix[:-1]
        return prefix

    def matcher(self, searcher, context=None):
        if self.text == ".*":
            from whoosh.query import Every
            eq = Every(self.fieldname, boost=self.boost)
            return eq.matcher(searcher, context)
        else:
            return PatternQuery.matcher(self, searcher, context)

    # _btexts() implemented in PatternQuery
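
# Worked examples for Regex._find_prefix() (values follow from the code
# above):
#
#     Regex("content", u"^rend.*")._find_prefix(u"^rend.*")    # -> u"rend"
#     Regex("content", u"rende?r")._find_prefix(u"rende?r")    # -> u"rend"
#         (the "e" is dropped because "e?" may match zero times)
#     Regex("content", u"red|blue")._find_prefix(u"red|blue")  # -> u""
#         (an alternation can bypass any literal prefix, so the whole
#         lexicon for the field is scanned)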


class ExpandingTerm(MultiTerm):
    """Intermediate base class for queries such as FuzzyTerm and Variations
    that expand into multiple queries, but come from a single term.
    """

    def has_terms(self):
        return True

    def terms(self, phrases=False):
        if self.field():
            yield (self.field(), self.text)


class FuzzyTerm(ExpandingTerm):
    """Matches documents containing words similar to the given term.
    """

    __inittypes__ = dict(fieldname=str, text=text_type, boost=float,
                         maxdist=float, prefixlength=int)

    def __init__(self, fieldname, text, boost=1.0, maxdist=1,
                 prefixlength=1, constantscore=True):
        """
        :param fieldname: The name of the field to search.
        :param text: The text to search for.
        :param boost: A boost factor to apply to scores of documents matching
            this query.
        :param maxdist: The maximum edit distance from the given text.
        :param prefixlength: The matched terms must share this many initial
            characters with 'text'. For example, if text is "light" and
            prefixlength is 2, then only terms starting with "li" are checked
            for similarity.
        """

        self.fieldname = fieldname
        self.text = text
        self.boost = boost
        self.maxdist = maxdist
        self.prefixlength = prefixlength
        self.constantscore = constantscore

    def __eq__(self, other):
        return (other and self.__class__ is other.__class__
                and self.fieldname == other.fieldname
                and self.text == other.text
                and self.maxdist == other.maxdist
                and self.prefixlength == other.prefixlength
                and self.boost == other.boost
                and self.constantscore == other.constantscore)

    def __repr__(self):
        r = "%s(%r, %r, boost=%f, maxdist=%d, prefixlength=%d)"
        return r % (self.__class__.__name__, self.fieldname, self.text,
                    self.boost, self.maxdist, self.prefixlength)

    def __unicode__(self):
        r = u("%s:%s") % (self.fieldname, self.text) + u("~")
        if self.maxdist > 1:
            r += u("%d") % self.maxdist
        if self.boost != 1.0:
            r += u("^%f") % self.boost
        return r

    __str__ = __unicode__

    def __hash__(self):
        return (hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost)
                ^ hash(self.maxdist) ^ hash(self.prefixlength)
                ^ hash(self.constantscore))

    def _btexts(self, ixreader):
        return ixreader.terms_within(self.fieldname, self.text, self.maxdist,
                                     prefix=self.prefixlength)

    def replace(self, fieldname, oldtext, newtext):
        q = copy.copy(self)
        if q.fieldname == fieldname and q.text == oldtext:
            q.text = newtext
        return q
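
# Usage sketch (assumes a reader that implements terms_within(), as the
# standard Whoosh reader does; "content" is a hypothetical field):
#
#     from whoosh.query import FuzzyTerm
#
#     q = FuzzyTerm("content", u"light", maxdist=1, prefixlength=2)
#     # expands to indexed terms within edit distance 1 of "light" that
#     # start with "li", e.g. "light" and "lights", but not "night"
#     # (one edit away, yet it fails the "li" prefix check).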


class Variations(ExpandingTerm):
    """Query that automatically searches for morphological variations of the
    given word in the same field.
    """

    def __init__(self, fieldname, text, boost=1.0):
        self.fieldname = fieldname
        self.text = text
        self.boost = boost

    def __repr__(self):
        r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text)
        if self.boost != 1:
            r += ", boost=%s" % self.boost
        r += ")"
        return r

    def __eq__(self, other):
        return (other and self.__class__ is other.__class__
                and self.fieldname == other.fieldname
                and self.text == other.text and self.boost == other.boost)

    def __hash__(self):
        return hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost)

    def _btexts(self, ixreader):
        fieldname = self.fieldname
        to_bytes = ixreader.schema[fieldname].to_bytes
        for word in variations(self.text):
            try:
                btext = to_bytes(word)
            except ValueError:
                continue

            if (fieldname, btext) in ixreader:
                yield btext

    def __unicode__(self):
        return u("%s:<%s>") % (self.fieldname, self.text)

    __str__ = __unicode__

    def replace(self, fieldname, oldtext, newtext):
        q = copy.copy(self)
        if q.fieldname == fieldname and q.text == oldtext:
            q.text = newtext
        return q
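
# Usage sketch: Variations expands the word through
# whoosh.lang.morph_en.variations() and keeps only the forms that actually
# occur in the field, so the exact expansion depends on the indexed data:
#
#     from whoosh.query import Variations
#
#     q = Variations("content", u"render")
#     # may expand to e.g. "render", "renders", "rendered", "rendering"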