123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343 |
- # Copyright 2007 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- """
- This module contains helper functions for correcting typos in user queries.
- """
- from bisect import bisect_left
- from heapq import heappush, heapreplace
- from whoosh import highlight
- from whoosh.compat import iteritems, xrange
- # Corrector objects
- class Corrector(object):
- """
- Base class for spelling correction objects. Concrete sub-classes should
- implement the ``_suggestions`` method.
- """
- def suggest(self, text, limit=5, maxdist=2, prefix=0):
- """
- :param text: the text to check. This word will **not** be added to the
- suggestions, even if it appears in the word graph.
- :param limit: only return up to this many suggestions. If there are not
- enough terms in the field within ``maxdist`` of the given word, the
- returned list will be shorter than this number.
- :param maxdist: the largest edit distance from the given word to look
- at. Values higher than 2 are not very effective or efficient.
- :param prefix: require suggestions to share a prefix of this length
- with the given word. This is often justifiable since most
- misspellings do not involve the first letter of the word. Using a
- prefix dramatically decreases the time it takes to generate the
- list of words.
- """
- _suggestions = self._suggestions
- heap = []
- for item in _suggestions(text, maxdist, prefix):
- # Note that the *higher* scores (item[0]) are better!
- if len(heap) < limit:
- heappush(heap, item)
- elif item > heap[0]:
- heapreplace(heap, item)
- sugs = sorted(heap, key=lambda x: (0 - x[0], x[1]))
- return [sug for _, sug in sugs]
- def _suggestions(self, text, maxdist, prefix):
- """
- Low-level method that yields a series of (score, "suggestion")
- tuples.
- :param text: the text to check.
- :param maxdist: the maximum edit distance.
- :param prefix: require suggestions to share a prefix of this length
- with the given word.
- """
- raise NotImplementedError
- class ReaderCorrector(Corrector):
- """
- Suggests corrections based on the content of a field in a reader.
- Ranks suggestions by the edit distance, then by highest to lowest
- frequency.
- """
- def __init__(self, reader, fieldname, fieldobj):
- self.reader = reader
- self.fieldname = fieldname
- self.fieldobj = fieldobj
- def _suggestions(self, text, maxdist, prefix):
- reader = self.reader
- freq = reader.frequency
- fieldname = self.fieldname
- fieldobj = reader.schema[fieldname]
- sugfield = fieldobj.spelling_fieldname(fieldname)
- for sug in reader.terms_within(sugfield, text, maxdist, prefix=prefix):
- # Higher scores are better, so negate the distance and frequency
- f = freq(fieldname, sug) or 1
- score = 0 - (maxdist + (1.0 / f * 0.5))
- yield (score, sug)
- class ListCorrector(Corrector):
- """
- Suggests corrections based on the content of a sorted list of strings.
- """
- def __init__(self, wordlist):
- self.wordlist = wordlist
- def _suggestions(self, text, maxdist, prefix):
- from whoosh.automata.lev import levenshtein_automaton
- from whoosh.automata.fsa import find_all_matches
- seen = set()
- for i in xrange(1, maxdist + 1):
- dfa = levenshtein_automaton(text, maxdist, prefix).to_dfa()
- sk = self.Skipper(self.wordlist)
- for sug in find_all_matches(dfa, sk):
- if sug not in seen:
- seen.add(sug)
- yield (0 - maxdist), sug
- class Skipper(object):
- def __init__(self, data):
- self.data = data
- self.i = 0
- def __call__(self, w):
- if self.data[self.i] == w:
- return w
- self.i += 1
- pos = bisect_left(self.data, w, self.i)
- if pos < len(self.data):
- return self.data[pos]
- else:
- return None
- class MultiCorrector(Corrector):
- """
- Merges suggestions from a list of sub-correctors.
- """
- def __init__(self, correctors, op):
- self.correctors = correctors
- self.op = op
- def _suggestions(self, text, maxdist, prefix):
- op = self.op
- seen = {}
- for corr in self.correctors:
- for score, sug in corr._suggestions(text, maxdist, prefix):
- if sug in seen:
- seen[sug] = op(seen[sug], score)
- else:
- seen[sug] = score
- return iteritems(seen)
- # Query correction
- class Correction(object):
- """
- Represents the corrected version of a user query string. Has the
- following attributes:
- ``query``
- The corrected :class:`whoosh.query.Query` object.
- ``string``
- The corrected user query string.
- ``original_query``
- The original :class:`whoosh.query.Query` object that was corrected.
- ``original_string``
- The original user query string.
- ``tokens``
- A list of token objects representing the corrected words.
- You can also use the :meth:`Correction.format_string` method to reformat the
- corrected query string using a :class:`whoosh.highlight.Formatter` class.
- For example, to display the corrected query string as HTML with the
- changed words emphasized::
- from whoosh import highlight
- correction = mysearcher.correct_query(q, qstring)
- hf = highlight.HtmlFormatter(classname="change")
- html = correction.format_string(hf)
- """
- def __init__(self, q, qstring, corr_q, tokens):
- self.original_query = q
- self.query = corr_q
- self.original_string = qstring
- self.tokens = tokens
- if self.original_string:
- self.string = self.format_string(highlight.NullFormatter())
- else:
- self.string = ''
- def __repr__(self):
- return "%s(%r, %r)" % (self.__class__.__name__, self.query,
- self.string)
- def format_string(self, formatter):
- """
- Highlights the corrected words in the original query string using the
- given :class:`~whoosh.highlight.Formatter`.
- :param formatter: A :class:`whoosh.highlight.Formatter` instance.
- :return: the output of the formatter (usually a string).
- """
- if not self.original_string:
- return ''
- if isinstance(formatter, type):
- formatter = formatter()
- fragment = highlight.Fragment(self.original_string, self.tokens)
- return formatter.format_fragment(fragment, replace=True)
- # QueryCorrector objects
- class QueryCorrector(object):
- """
- Base class for objects that correct words in a user query.
- """
- def __init__(self, fieldname):
- self.fieldname = fieldname
- def correct_query(self, q, qstring):
- """
- Returns a :class:`Correction` object representing the corrected
- form of the given query.
- :param q: the original :class:`whoosh.query.Query` tree to be
- corrected.
- :param qstring: the original user query. This may be None if the
- original query string is not available, in which case the
- ``Correction.string`` attribute will also be None.
- :rtype: :class:`Correction`
- """
- raise NotImplementedError
- def field(self):
- return self.fieldname
- class SimpleQueryCorrector(QueryCorrector):
- """
- A simple query corrector based on a mapping of field names to
- :class:`Corrector` objects, and a list of ``("fieldname", "text")`` tuples
- to correct. And terms in the query that appear in list of term tuples are
- corrected using the appropriate corrector.
- """
- def __init__(self, correctors, terms, aliases=None, prefix=0, maxdist=2):
- """
- :param correctors: a dictionary mapping field names to
- :class:`Corrector` objects.
- :param terms: a sequence of ``("fieldname", "text")`` tuples
- representing terms to be corrected.
- :param aliases: a dictionary mapping field names in the query to
- field names for spelling suggestions.
- :param prefix: suggested replacement words must share this number of
- initial characters with the original word. Increasing this even to
- just ``1`` can dramatically speed up suggestions, and may be
- justifiable since spellling mistakes rarely involve the first
- letter of a word.
- :param maxdist: the maximum number of "edits" (insertions, deletions,
- subsitutions, or transpositions of letters) allowed between the
- original word and any suggestion. Values higher than ``2`` may be
- slow.
- """
- self.correctors = correctors
- self.aliases = aliases or {}
- self.termset = frozenset(terms)
- self.prefix = prefix
- self.maxdist = maxdist
- def correct_query(self, q, qstring):
- correctors = self.correctors
- aliases = self.aliases
- termset = self.termset
- prefix = self.prefix
- maxdist = self.maxdist
- # A list of tokens that were changed by a corrector
- corrected_tokens = []
- # The corrected query tree. We don't need to deepcopy the original
- # because we use Query.replace() to find-and-replace the corrected
- # words and it returns a copy of the query tree.
- corrected_q = q
- # For every word in the original query...
- # Note we can't put these in a set, because we must preserve WHERE
- # in the query each token occured so we can format them later
- for token in q.all_tokens():
- fname = token.fieldname
- aname = aliases.get(fname, fname)
- # If this is one of the words we're supposed to correct...
- if (fname, token.text) in termset:
- c = correctors[aname]
- sugs = c.suggest(token.text, prefix=prefix, maxdist=maxdist)
- if sugs:
- # This is a "simple" corrector, so we just pick the first
- # suggestion :/
- sug = sugs[0]
- # Return a new copy of the original query with this word
- # replaced by the correction
- corrected_q = corrected_q.replace(token.fieldname,
- token.text, sug)
- # Add the token to the list of corrected tokens (for the
- # formatter to use later)
- token.original = token.text
- token.text = sug
- corrected_tokens.append(token)
- return Correction(q, qstring, corrected_q, corrected_tokens)
|