spelling.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. # Copyright 2007 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. """
  28. This module contains helper functions for correcting typos in user queries.
  29. """
  30. from bisect import bisect_left
  31. from heapq import heappush, heapreplace
  32. from whoosh import highlight
  33. from whoosh.compat import iteritems, xrange
  34. # Corrector objects
  35. class Corrector(object):
  36. """
  37. Base class for spelling correction objects. Concrete sub-classes should
  38. implement the ``_suggestions`` method.
  39. """
  40. def suggest(self, text, limit=5, maxdist=2, prefix=0):
  41. """
  42. :param text: the text to check. This word will **not** be added to the
  43. suggestions, even if it appears in the word graph.
  44. :param limit: only return up to this many suggestions. If there are not
  45. enough terms in the field within ``maxdist`` of the given word, the
  46. returned list will be shorter than this number.
  47. :param maxdist: the largest edit distance from the given word to look
  48. at. Values higher than 2 are not very effective or efficient.
  49. :param prefix: require suggestions to share a prefix of this length
  50. with the given word. This is often justifiable since most
  51. misspellings do not involve the first letter of the word. Using a
  52. prefix dramatically decreases the time it takes to generate the
  53. list of words.
  54. """
  55. _suggestions = self._suggestions
  56. heap = []
  57. for item in _suggestions(text, maxdist, prefix):
  58. # Note that the *higher* scores (item[0]) are better!
  59. if len(heap) < limit:
  60. heappush(heap, item)
  61. elif item > heap[0]:
  62. heapreplace(heap, item)
  63. sugs = sorted(heap, key=lambda x: (0 - x[0], x[1]))
  64. return [sug for _, sug in sugs]
  65. def _suggestions(self, text, maxdist, prefix):
  66. """
  67. Low-level method that yields a series of (score, "suggestion")
  68. tuples.
  69. :param text: the text to check.
  70. :param maxdist: the maximum edit distance.
  71. :param prefix: require suggestions to share a prefix of this length
  72. with the given word.
  73. """
  74. raise NotImplementedError
  75. class ReaderCorrector(Corrector):
  76. """
  77. Suggests corrections based on the content of a field in a reader.
  78. Ranks suggestions by the edit distance, then by highest to lowest
  79. frequency.
  80. """
  81. def __init__(self, reader, fieldname, fieldobj):
  82. self.reader = reader
  83. self.fieldname = fieldname
  84. self.fieldobj = fieldobj
  85. def _suggestions(self, text, maxdist, prefix):
  86. reader = self.reader
  87. freq = reader.frequency
  88. fieldname = self.fieldname
  89. fieldobj = reader.schema[fieldname]
  90. sugfield = fieldobj.spelling_fieldname(fieldname)
  91. for sug in reader.terms_within(sugfield, text, maxdist, prefix=prefix):
  92. # Higher scores are better, so negate the distance and frequency
  93. f = freq(fieldname, sug) or 1
  94. score = 0 - (maxdist + (1.0 / f * 0.5))
  95. yield (score, sug)
  96. class ListCorrector(Corrector):
  97. """
  98. Suggests corrections based on the content of a sorted list of strings.
  99. """
  100. def __init__(self, wordlist):
  101. self.wordlist = wordlist
  102. def _suggestions(self, text, maxdist, prefix):
  103. from whoosh.automata.lev import levenshtein_automaton
  104. from whoosh.automata.fsa import find_all_matches
  105. seen = set()
  106. for i in xrange(1, maxdist + 1):
  107. dfa = levenshtein_automaton(text, maxdist, prefix).to_dfa()
  108. sk = self.Skipper(self.wordlist)
  109. for sug in find_all_matches(dfa, sk):
  110. if sug not in seen:
  111. seen.add(sug)
  112. yield (0 - maxdist), sug
  113. class Skipper(object):
  114. def __init__(self, data):
  115. self.data = data
  116. self.i = 0
  117. def __call__(self, w):
  118. if self.data[self.i] == w:
  119. return w
  120. self.i += 1
  121. pos = bisect_left(self.data, w, self.i)
  122. if pos < len(self.data):
  123. return self.data[pos]
  124. else:
  125. return None
  126. class MultiCorrector(Corrector):
  127. """
  128. Merges suggestions from a list of sub-correctors.
  129. """
  130. def __init__(self, correctors, op):
  131. self.correctors = correctors
  132. self.op = op
  133. def _suggestions(self, text, maxdist, prefix):
  134. op = self.op
  135. seen = {}
  136. for corr in self.correctors:
  137. for score, sug in corr._suggestions(text, maxdist, prefix):
  138. if sug in seen:
  139. seen[sug] = op(seen[sug], score)
  140. else:
  141. seen[sug] = score
  142. return iteritems(seen)
  143. # Query correction
  144. class Correction(object):
  145. """
  146. Represents the corrected version of a user query string. Has the
  147. following attributes:
  148. ``query``
  149. The corrected :class:`whoosh.query.Query` object.
  150. ``string``
  151. The corrected user query string.
  152. ``original_query``
  153. The original :class:`whoosh.query.Query` object that was corrected.
  154. ``original_string``
  155. The original user query string.
  156. ``tokens``
  157. A list of token objects representing the corrected words.
  158. You can also use the :meth:`Correction.format_string` method to reformat the
  159. corrected query string using a :class:`whoosh.highlight.Formatter` class.
  160. For example, to display the corrected query string as HTML with the
  161. changed words emphasized::
  162. from whoosh import highlight
  163. correction = mysearcher.correct_query(q, qstring)
  164. hf = highlight.HtmlFormatter(classname="change")
  165. html = correction.format_string(hf)
  166. """
  167. def __init__(self, q, qstring, corr_q, tokens):
  168. self.original_query = q
  169. self.query = corr_q
  170. self.original_string = qstring
  171. self.tokens = tokens
  172. if self.original_string:
  173. self.string = self.format_string(highlight.NullFormatter())
  174. else:
  175. self.string = ''
  176. def __repr__(self):
  177. return "%s(%r, %r)" % (self.__class__.__name__, self.query,
  178. self.string)
  179. def format_string(self, formatter):
  180. """
  181. Highlights the corrected words in the original query string using the
  182. given :class:`~whoosh.highlight.Formatter`.
  183. :param formatter: A :class:`whoosh.highlight.Formatter` instance.
  184. :return: the output of the formatter (usually a string).
  185. """
  186. if not self.original_string:
  187. return ''
  188. if isinstance(formatter, type):
  189. formatter = formatter()
  190. fragment = highlight.Fragment(self.original_string, self.tokens)
  191. return formatter.format_fragment(fragment, replace=True)
  192. # QueryCorrector objects
  193. class QueryCorrector(object):
  194. """
  195. Base class for objects that correct words in a user query.
  196. """
  197. def __init__(self, fieldname):
  198. self.fieldname = fieldname
  199. def correct_query(self, q, qstring):
  200. """
  201. Returns a :class:`Correction` object representing the corrected
  202. form of the given query.
  203. :param q: the original :class:`whoosh.query.Query` tree to be
  204. corrected.
  205. :param qstring: the original user query. This may be None if the
  206. original query string is not available, in which case the
  207. ``Correction.string`` attribute will also be None.
  208. :rtype: :class:`Correction`
  209. """
  210. raise NotImplementedError
  211. def field(self):
  212. return self.fieldname
  213. class SimpleQueryCorrector(QueryCorrector):
  214. """
  215. A simple query corrector based on a mapping of field names to
  216. :class:`Corrector` objects, and a list of ``("fieldname", "text")`` tuples
  217. to correct. And terms in the query that appear in list of term tuples are
  218. corrected using the appropriate corrector.
  219. """
  220. def __init__(self, correctors, terms, aliases=None, prefix=0, maxdist=2):
  221. """
  222. :param correctors: a dictionary mapping field names to
  223. :class:`Corrector` objects.
  224. :param terms: a sequence of ``("fieldname", "text")`` tuples
  225. representing terms to be corrected.
  226. :param aliases: a dictionary mapping field names in the query to
  227. field names for spelling suggestions.
  228. :param prefix: suggested replacement words must share this number of
  229. initial characters with the original word. Increasing this even to
  230. just ``1`` can dramatically speed up suggestions, and may be
  231. justifiable since spellling mistakes rarely involve the first
  232. letter of a word.
  233. :param maxdist: the maximum number of "edits" (insertions, deletions,
  234. subsitutions, or transpositions of letters) allowed between the
  235. original word and any suggestion. Values higher than ``2`` may be
  236. slow.
  237. """
  238. self.correctors = correctors
  239. self.aliases = aliases or {}
  240. self.termset = frozenset(terms)
  241. self.prefix = prefix
  242. self.maxdist = maxdist
  243. def correct_query(self, q, qstring):
  244. correctors = self.correctors
  245. aliases = self.aliases
  246. termset = self.termset
  247. prefix = self.prefix
  248. maxdist = self.maxdist
  249. # A list of tokens that were changed by a corrector
  250. corrected_tokens = []
  251. # The corrected query tree. We don't need to deepcopy the original
  252. # because we use Query.replace() to find-and-replace the corrected
  253. # words and it returns a copy of the query tree.
  254. corrected_q = q
  255. # For every word in the original query...
  256. # Note we can't put these in a set, because we must preserve WHERE
  257. # in the query each token occured so we can format them later
  258. for token in q.all_tokens():
  259. fname = token.fieldname
  260. aname = aliases.get(fname, fname)
  261. # If this is one of the words we're supposed to correct...
  262. if (fname, token.text) in termset:
  263. c = correctors[aname]
  264. sugs = c.suggest(token.text, prefix=prefix, maxdist=maxdist)
  265. if sugs:
  266. # This is a "simple" corrector, so we just pick the first
  267. # suggestion :/
  268. sug = sugs[0]
  269. # Return a new copy of the original query with this word
  270. # replaced by the correction
  271. corrected_q = corrected_q.replace(token.fieldname,
  272. token.text, sug)
  273. # Add the token to the list of corrected tokens (for the
  274. # formatter to use later)
  275. token.original = token.text
  276. token.text = sug
  277. corrected_tokens.append(token)
  278. return Correction(q, qstring, corrected_q, corrected_tokens)