# Copyright 2011 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

import sys

from whoosh import query
from whoosh.compat import text_type
from whoosh.qparser import syntax
from whoosh.qparser.common import print_debug, QueryParserError


# Query parser object

class QueryParser(object):
- """A hand-written query parser built on modular plug-ins. The default
- configuration implements a powerful fielded query language similar to
- Lucene's.
- You can use the ``plugins`` argument when creating the object to override
- the default list of plug-ins, and/or use ``add_plugin()`` and/or
- ``remove_plugin_class()`` to change the plug-ins included in the parser.
- >>> from whoosh import qparser
- >>> parser = qparser.QueryParser("content", schema)
- >>> parser.remove_plugin_class(qparser.WildcardPlugin)
- >>> parser.add_plugin(qparser.PrefixPlugin())
- >>> parser.parse(u"hello there")
- And([Term("content", u"hello"), Term("content", u"there")])
- """

    def __init__(self, fieldname, schema, plugins=None, termclass=query.Term,
                 phraseclass=query.Phrase, group=syntax.AndGroup):
        """
        :param fieldname: the default field -- the parser uses this as the
            field for any terms without an explicit field.
        :param schema: a :class:`whoosh.fields.Schema` object to use when
            parsing. The appropriate fields in the schema will be used to
            tokenize terms/phrases before they are turned into query objects.
            You can specify None for the schema to create a parser that does
            not analyze the text of the query, usually for testing purposes.
        :param plugins: a list of plugins to use. WhitespacePlugin is
            automatically included; do not put it in this list. This overrides
            the default list of plugins. Classes in the list will be
            automatically instantiated.
        :param termclass: the query class to use for individual search terms.
            The default is :class:`whoosh.query.Term`.
        :param phraseclass: the query class to use for phrases. The default
            is :class:`whoosh.query.Phrase`.
        :param group: the default grouping. ``AndGroup`` makes terms required
            by default. ``OrGroup`` makes terms optional by default.
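
        For example, to make terms optional by default (an illustrative
        sketch; it assumes ``schema`` has a text field named ``content``):

        >>> from whoosh import qparser
        >>> parser = qparser.QueryParser("content", schema,
        ...                              group=qparser.OrGroup)
        >>> parser.parse(u"hello there")
        Or([Term("content", u"hello"), Term("content", u"there")])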
- """
- self.fieldname = fieldname
- self.schema = schema
- self.termclass = termclass
- self.phraseclass = phraseclass
- self.group = group
- self.plugins = []
- if plugins is None:
- plugins = self.default_set()
- self._add_ws_plugin()
- self.add_plugins(plugins)

    def default_set(self):
        """Returns the default list of plugins to use.
        """

        from whoosh.qparser import plugins

        return [plugins.WhitespacePlugin(),
                plugins.SingleQuotePlugin(),
                plugins.FieldsPlugin(),
                plugins.WildcardPlugin(),
                plugins.PhrasePlugin(),
                plugins.RangePlugin(),
                plugins.GroupPlugin(),
                plugins.OperatorsPlugin(),
                plugins.BoostPlugin(),
                plugins.EveryPlugin(),
                ]

    def add_plugins(self, pins):
        """Adds the given list of plugins to the list of plugins in this
        parser.
        """

        for pin in pins:
            self.add_plugin(pin)

    def add_plugin(self, pin):
        """Adds the given plugin to the list of plugins in this parser.
        """

        if isinstance(pin, type):
            pin = pin()
        self.plugins.append(pin)

    def _add_ws_plugin(self):
        from whoosh.qparser.plugins import WhitespacePlugin
        self.add_plugin(WhitespacePlugin())

    def remove_plugin(self, pi):
        """Removes the given plugin object from the list of plugins in this
        parser.
        """

        self.plugins.remove(pi)

    def remove_plugin_class(self, cls):
        """Removes any plugins of the given class from this parser.
        """

        self.plugins = [pi for pi in self.plugins if not isinstance(pi, cls)]

    def replace_plugin(self, plugin):
        """Removes any plugins of the class of the given plugin and then adds
        it. This is a convenience method to keep from having to call
        ``remove_plugin_class`` followed by ``add_plugin`` each time you want
        to reconfigure a default plugin.

        >>> qp = qparser.QueryParser("content", schema)
        >>> qp.replace_plugin(qparser.NotPlugin("(^| )-"))
        """

        self.remove_plugin_class(plugin.__class__)
        self.add_plugin(plugin)

    def _priorized(self, methodname):
        # methodname is "taggers" or "filters". Returns a prioritized list of
        # tagger objects or filter functions.
        items_and_priorities = []
        for plugin in self.plugins:
            # Call either .taggers() or .filters() on the plugin; each yields
            # (item, priority) tuples
            method = getattr(plugin, methodname)
            for item in method(self):
                items_and_priorities.append(item)
        # Sort the list by priority (lower priority numbers run first)
        items_and_priorities.sort(key=lambda x: x[1])
        # Return the sorted list without the priorities
        return [item for item, _ in items_and_priorities]

    def multitoken_query(self, spec, texts, fieldname, termclass, boost):
        """Returns a query for multiple texts. This method implements the
        intention specified in the field's ``multitoken_query`` attribute,
        which specifies what to do when strings that look like single terms
        to the parser turn out to yield multiple tokens when analyzed.

        :param spec: a string describing how to join the text strings into a
            query. This is usually the value of the field's
            ``multitoken_query`` attribute.
        :param texts: a list of token strings.
        :param fieldname: the name of the field.
        :param termclass: the query class to use for single terms.
        :param boost: the boost of the original term in the query string;
            it is applied to the returned query object.
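
        For example, joining two tokens with ``spec="or"`` (an illustrative
        sketch; the exact repr depends on the query classes):

        >>> from whoosh import query
        >>> parser = QueryParser("content", None)
        >>> parser.multitoken_query("or", [u"full", u"text"], "content",
        ...                         query.Term, 1.0)
        Or([Term("content", u"full"), Term("content", u"text")])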
- """
- spec = spec.lower()
- if spec == "first":
- # Throw away all but the first token
- return termclass(fieldname, texts[0], boost=boost)
- elif spec == "phrase":
- # Turn the token into a phrase
- return self.phraseclass(fieldname, texts, boost=boost)
- else:
- if spec == "default":
- qclass = self.group.qclass
- elif spec == "and":
- qclass = query.And
- elif spec == "or":
- qclass = query.Or
- else:
- raise QueryParserError("Unknown multitoken_query value %r"
- % spec)
- return qclass([termclass(fieldname, t, boost=boost)
- for t in texts])

    def term_query(self, fieldname, text, termclass, boost=1.0, tokenize=True,
                   removestops=True):
        """Returns the appropriate query object for a single term in the query
        string.
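
        For example (an illustrative sketch; with no schema the text is not
        analyzed, so the term passes through unchanged):

        >>> from whoosh import query
        >>> parser = QueryParser("content", None)
        >>> parser.term_query("content", u"hello", query.Term)
        Term("content", u"hello")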
- """
- if self.schema and fieldname in self.schema:
- field = self.schema[fieldname]
- # If this field type wants to parse queries itself, let it do so
- # and return early
- if field.self_parsing():
- try:
- q = field.parse_query(fieldname, text, boost=boost)
- return q
- except:
- e = sys.exc_info()[1]
- return query.error_query(e)
- # Otherwise, ask the field to process the text into a list of
- # tokenized strings
- texts = list(field.process_text(text, mode="query",
- tokenize=tokenize,
- removestops=removestops))
- # If the analyzer returned more than one token, use the field's
- # multitoken_query attribute to decide what query class, if any, to
- # use to put the tokens together
- if len(texts) > 1:
- return self.multitoken_query(field.multitoken_query, texts,
- fieldname, termclass, boost)
- # It's possible field.process_text() will return an empty list (for
- # example, on a stop word)
- if not texts:
- return None
- text = texts[0]
- return termclass(fieldname, text, boost=boost)

    def taggers(self):
        """Returns a prioritized list of tagger objects provided by the
        parser's currently configured plugins.
        """

        return self._priorized("taggers")

    def filters(self):
        """Returns a prioritized list of filter functions provided by the
        parser's currently configured plugins.
        """

        return self._priorized("filters")

    def tag(self, text, pos=0, debug=False):
        """Returns a group of syntax nodes corresponding to the given text,
        created by matching the Taggers provided by the parser's plugins.

        :param text: the text to tag.
        :param pos: the position in the text to start tagging at.
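
        For example (an illustrative sketch; the repr of the returned group
        depends on the syntax node classes):

        >>> parser = QueryParser("content", None)
        >>> group = parser.tag(u"hello there")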
- """
- # The list out output tags
- stack = []
- # End position of the previous match
- prev = pos
- # Priorized list of taggers provided by the parser's plugins
- taggers = self.taggers()
- if debug:
- print_debug(debug, "Taggers: %r" % taggers)
- # Define a function that will make a WordNode from the "interstitial"
- # text between matches
- def inter(startchar, endchar):
- n = syntax.WordNode(text[startchar:endchar])
- n.startchar = startchar
- n.endchar = endchar
- return n
- while pos < len(text):
- node = None
- # Try each tagger to see if it matches at the current position
- for tagger in taggers:
- node = tagger.match(self, text, pos)
- if node is not None:
- if node.endchar <= pos:
- raise Exception("Token %r did not move cursor forward."
- " (%r, %s)" % (tagger, text, pos))
- if prev < pos:
- tween = inter(prev, pos)
- if debug:
- print_debug(debug, "Tween: %r" % tween)
- stack.append(tween)
- if debug:
- print_debug(debug, "Tagger: %r at %s: %r"
- % (tagger, pos, node))
- stack.append(node)
- prev = pos = node.endchar
- break
- if not node:
- # No taggers matched, move forward
- pos += 1
- # If there's unmatched text left over on the end, put it in a WordNode
- if prev < len(text):
- stack.append(inter(prev, len(text)))
- # Wrap the list of nodes in a group node
- group = self.group(stack)
- if debug:
- print_debug(debug, "Tagged group: %r" % group)
- return group

    def filterize(self, nodes, debug=False):
        """Takes a group of nodes and runs the filters provided by the
        parser's plugins.
        """

        # Call each filter in the prioritized list of plugin filters
        if debug:
            print_debug(debug, "Pre-filtered group: %r" % nodes)
        for f in self.filters():
            if debug:
                print_debug(debug, "..Applying: %r" % f)
            nodes = f(self, nodes)
            if debug:
                print_debug(debug, "..Result: %r" % nodes)
            if nodes is None:
                raise Exception("Filter %r did not return anything" % f)
        return nodes

    def process(self, text, pos=0, debug=False):
        """Returns a group of syntax nodes corresponding to the given text,
        tagged by the plugin Taggers and filtered by the plugin filters.

        :param text: the text to tag.
        :param pos: the position in the text to start tagging at.
        """

        nodes = self.tag(text, pos=pos, debug=debug)
        nodes = self.filterize(nodes, debug=debug)
        return nodes

    def parse(self, text, normalize=True, debug=False):
        """Parses the input string and returns a :class:`whoosh.query.Query`
        object/tree.

        :param text: the unicode string to parse.
        :param normalize: whether to call normalize() on the query object/tree
            before returning it. This should be left on unless you're trying
            to debug the parser output.
        :rtype: :class:`whoosh.query.Query`
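
        For example (an illustrative sketch; with no schema, the terms are
        not analyzed):

        >>> parser = QueryParser("content", None)
        >>> parser.parse(u"hello there")
        And([Term("content", u"hello"), Term("content", u"there")])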
- """
- if not isinstance(text, text_type):
- text = text.decode("latin1")
- nodes = self.process(text, debug=debug)
- if debug:
- print_debug(debug, "Syntax tree: %r" % nodes)
- q = nodes.query(self)
- if not q:
- q = query.NullQuery
- if debug:
- print_debug(debug, "Pre-normalized query: %r" % q)
- if normalize:
- q = q.normalize()
- if debug:
- print_debug(debug, "Normalized query: %r" % q)
- return q


# Premade parser configurations

def MultifieldParser(fieldnames, schema, fieldboosts=None, **kwargs):
    """Returns a QueryParser configured to search in multiple fields.

    Instead of assigning unfielded clauses to a default field, this parser
    transforms them into an OR clause that searches a list of fields. For
    example, if the list of multi-fields is "f1", "f2" and the query string
    is "hello there", the parser will produce "(f1:hello OR f2:hello)
    (f1:there OR f2:there)". This is very useful when you have two textual
    fields (e.g. "title" and "content") you want to search by default.

    :param fieldnames: a list of field names to search.
    :param fieldboosts: an optional dictionary mapping field names to boosts.
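
    For example (an illustrative sketch; it assumes ``schema`` has ``title``
    and ``content`` fields):

    >>> parser = MultifieldParser(["title", "content"], schema)
    >>> parser.parse(u"hello")
    Or([Term("title", u"hello"), Term("content", u"hello")])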
- """
- from whoosh.qparser.plugins import MultifieldPlugin
- p = QueryParser(None, schema, **kwargs)
- mfp = MultifieldPlugin(fieldnames, fieldboosts=fieldboosts)
- p.add_plugin(mfp)
- return p


def SimpleParser(fieldname, schema, **kwargs):
    """Returns a QueryParser configured to support only +, -, and phrase
    syntax.
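
    For example (an illustrative sketch; the exact query tree depends on how
    the plus/minus plugin combines the clauses):

    >>> parser = SimpleParser("content", schema)
    >>> q = parser.parse(u"render +shader -texture")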
- """
- from whoosh.qparser import plugins, syntax
- pins = [plugins.WhitespacePlugin,
- plugins.PlusMinusPlugin,
- plugins.PhrasePlugin]
- orgroup = syntax.OrGroup
- return QueryParser(fieldname, schema, plugins=pins, group=orgroup,
- **kwargs)


def DisMaxParser(fieldboosts, schema, tiebreak=0.0, **kwargs):
    """Returns a QueryParser configured to support only +, -, and phrase
    syntax, and which converts individual terms into DisjunctionMax queries
    across a set of fields.

    :param fieldboosts: a dictionary mapping field names to boosts.
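
    For example (an illustrative sketch; it assumes ``schema`` has ``title``
    and ``body`` fields):

    >>> parser = DisMaxParser({"title": 2.0, "body": 1.0}, schema)
    >>> q = parser.parse(u"hello")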
- """
- from whoosh.qparser import plugins, syntax
- mfp = plugins.MultifieldPlugin(list(fieldboosts.keys()),
- fieldboosts=fieldboosts,
- group=syntax.DisMaxGroup)
- pins = [plugins.WhitespacePlugin,
- plugins.PlusMinusPlugin,
- plugins.PhrasePlugin,
- mfp]
- orgroup = syntax.OrGroup
- return QueryParser(None, schema, plugins=pins, group=orgroup, **kwargs)