123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645 |
- # Copyright 2011 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- import sys, weakref
- from whoosh import query
- from whoosh.qparser.common import get_single_text, QueryParserError, attach
- class SyntaxNode(object):
- """Base class for nodes that make up the abstract syntax tree (AST) of a
- parsed user query string. The AST is an intermediate step, generated
- from the query string, then converted into a :class:`whoosh.query.Query`
- tree by calling the ``query()`` method on the nodes.
- Instances have the following required attributes:
- ``has_fieldname``
- True if this node has a ``fieldname`` attribute.
- ``has_text``
- True if this node has a ``text`` attribute
- ``has_boost``
- True if this node has a ``boost`` attribute.
- ``startchar``
- The character position in the original text at which this node started.
- ``endchar``
- The character position in the original text at which this node ended.
- """
- has_fieldname = False
- has_text = False
- has_boost = False
- _parent = None
- def __repr__(self):
- r = "<"
- if self.has_fieldname:
- r += "%r:" % self.fieldname
- r += self.r()
- if self.has_boost and self.boost != 1.0:
- r += " ^%s" % self.boost
- r += ">"
- return r
- def r(self):
- """Returns a basic representation of this node. The base class's
- ``__repr__`` method calls this, then does the extra busy work of adding
- fieldname and boost where appropriate.
- """
- return "%s %r" % (self.__class__.__name__, self.__dict__)
- def apply(self, fn):
- return self
- def accept(self, fn):
- def fn_wrapper(n):
- return fn(n.apply(fn_wrapper))
- return fn_wrapper(self)
- def query(self, parser):
- """Returns a :class:`whoosh.query.Query` instance corresponding to this
- syntax tree node.
- """
- raise NotImplementedError(self.__class__.__name__)
- def is_ws(self):
- """Returns True if this node is ignorable whitespace.
- """
- return False
- def is_text(self):
- return False
- def set_fieldname(self, name, override=False):
- """Sets the fieldname associated with this node. If ``override`` is
- False (the default), the fieldname will only be replaced if this node
- does not already have a fieldname set.
- For nodes that don't have a fieldname, this is a no-op.
- """
- if not self.has_fieldname:
- return
- if self.fieldname is None or override:
- self.fieldname = name
- return self
- def set_boost(self, boost):
- """Sets the boost associated with this node.
- For nodes that don't have a boost, this is a no-op.
- """
- if not self.has_boost:
- return
- self.boost = boost
- return self
- def set_range(self, startchar, endchar):
- """Sets the character range associated with this node.
- """
- self.startchar = startchar
- self.endchar = endchar
- return self
- # Navigation methods
- def parent(self):
- if self._parent:
- return self._parent()
- def next_sibling(self):
- p = self.parent()
- if p:
- return p.node_after(self)
- def prev_sibling(self):
- p = self.parent()
- if p:
- return p.node_before(self)
- def bake(self, parent):
- self._parent = weakref.ref(parent)
- class MarkerNode(SyntaxNode):
- """Base class for nodes that only exist to mark places in the tree.
- """
- def r(self):
- return self.__class__.__name__
- class Whitespace(MarkerNode):
- """Abstract syntax tree node for ignorable whitespace.
- """
- def r(self):
- return " "
- def is_ws(self):
- return True
- class FieldnameNode(SyntaxNode):
- """Abstract syntax tree node for field name assignments.
- """
- has_fieldname = True
- def __init__(self, fieldname, original):
- self.fieldname = fieldname
- self.original = original
- def __repr__(self):
- return "<%r:>" % self.fieldname
- class GroupNode(SyntaxNode):
- """Base class for abstract syntax tree node types that group together
- sub-nodes.
- Instances have the following attributes:
- ``merging``
- True if side-by-side instances of this group can be merged into a
- single group.
- ``qclass``
- If a subclass doesn't override ``query()``, the base class will simply
- wrap this class around the queries returned by the subnodes.
- This class implements a number of list methods for operating on the
- subnodes.
- """
- has_boost = True
- merging = True
- qclass = None
- def __init__(self, nodes=None, boost=1.0, **kwargs):
- self.nodes = nodes or []
- self.boost = boost
- self.kwargs = kwargs
- def r(self):
- return "%s %s" % (self.__class__.__name__,
- ", ".join(repr(n) for n in self.nodes))
- @property
- def startchar(self):
- if not self.nodes:
- return None
- return self.nodes[0].startchar
- @property
- def endchar(self):
- if not self.nodes:
- return None
- return self.nodes[-1].endchar
- def apply(self, fn):
- return self.__class__(self.type, [fn(node) for node in self.nodes],
- boost=self.boost, **self.kwargs)
- def query(self, parser):
- subs = []
- for node in self.nodes:
- subq = node.query(parser)
- if subq is not None:
- subs.append(subq)
- q = self.qclass(subs, boost=self.boost, **self.kwargs)
- return attach(q, self)
- def empty_copy(self):
- """Returns an empty copy of this group.
- This is used in the common pattern where a filter creates an new
- group and then adds nodes from the input group to it if they meet
- certain criteria, then returns the new group::
- def remove_whitespace(parser, group):
- newgroup = group.empty_copy()
- for node in group:
- if not node.is_ws():
- newgroup.append(node)
- return newgroup
- """
- c = self.__class__(**self.kwargs)
- if self.has_boost:
- c.boost = self.boost
- if self.has_fieldname:
- c.fieldname = self.fieldname
- if self.has_text:
- c.text = self.text
- return c
- def set_fieldname(self, name, override=False):
- SyntaxNode.set_fieldname(self, name, override=override)
- for node in self.nodes:
- node.set_fieldname(name, override=override)
- def set_range(self, startchar, endchar):
- for node in self.nodes:
- node.set_range(startchar, endchar)
- return self
- # List-like methods
- def __nonzero__(self):
- return bool(self.nodes)
- __bool__ = __nonzero__
- def __iter__(self):
- return iter(self.nodes)
- def __len__(self):
- return len(self.nodes)
- def __getitem__(self, n):
- return self.nodes.__getitem__(n)
- def __setitem__(self, n, v):
- self.nodes.__setitem__(n, v)
- def __delitem__(self, n):
- self.nodes.__delitem__(n)
- def insert(self, n, v):
- self.nodes.insert(n, v)
- def append(self, v):
- self.nodes.append(v)
- def extend(self, vs):
- self.nodes.extend(vs)
- def pop(self, *args, **kwargs):
- return self.nodes.pop(*args, **kwargs)
- def reverse(self):
- self.nodes.reverse()
- def index(self, v):
- return self.nodes.index(v)
- # Navigation methods
- def bake(self, parent):
- SyntaxNode.bake(self, parent)
- for node in self.nodes:
- node.bake(self)
- def node_before(self, n):
- try:
- i = self.nodes.index(n)
- except ValueError:
- return
- if i > 0:
- return self.nodes[i - 1]
- def node_after(self, n):
- try:
- i = self.nodes.index(n)
- except ValueError:
- return
- if i < len(self.nodes) - 2:
- return self.nodes[i + 1]
- class BinaryGroup(GroupNode):
- """Intermediate base class for group nodes that have two subnodes and
- whose ``qclass`` initializer takes two arguments instead of a list.
- """
- merging = False
- has_boost = False
- def query(self, parser):
- assert len(self.nodes) == 2
- qa = self.nodes[0].query(parser)
- qb = self.nodes[1].query(parser)
- if qa is None and qb is None:
- q = query.NullQuery
- elif qa is None:
- q = qb
- elif qb is None:
- q = qa
- else:
- q = self.qclass(self.nodes[0].query(parser),
- self.nodes[1].query(parser))
- return attach(q, self)
- class Wrapper(GroupNode):
- """Intermediate base class for nodes that wrap a single sub-node.
- """
- merging = False
- def query(self, parser):
- q = self.nodes[0].query(parser)
- if q:
- return attach(self.qclass(q), self)
- class ErrorNode(SyntaxNode):
- def __init__(self, message, node=None):
- self.message = message
- self.node = node
- def r(self):
- return "ERR %r %r" % (self.node, self.message)
- @property
- def startchar(self):
- return self.node.startchar
- @property
- def endchar(self):
- return self.node.endchar
- def query(self, parser):
- if self.node:
- q = self.node.query(parser)
- else:
- q = query.NullQuery
- return attach(query.error_query(self.message, q), self)
- class AndGroup(GroupNode):
- qclass = query.And
- class OrGroup(GroupNode):
- qclass = query.Or
- @classmethod
- def factory(cls, scale=1.0):
- class ScaledOrGroup(OrGroup):
- def __init__(self, nodes=None, **kwargs):
- if "scale" in kwargs:
- del kwargs["scale"]
- super(ScaledOrGroup, self).__init__(nodes=nodes, scale=scale,
- **kwargs)
- return ScaledOrGroup
- class DisMaxGroup(GroupNode):
- qclass = query.DisjunctionMax
- class OrderedGroup(GroupNode):
- qclass = query.Ordered
- class AndNotGroup(BinaryGroup):
- qclass = query.AndNot
- class AndMaybeGroup(BinaryGroup):
- qclass = query.AndMaybe
- class RequireGroup(BinaryGroup):
- qclass = query.Require
- class NotGroup(Wrapper):
- qclass = query.Not
- class RangeNode(SyntaxNode):
- """Syntax node for range queries.
- """
- has_fieldname = True
- def __init__(self, start, end, startexcl, endexcl):
- self.start = start
- self.end = end
- self.startexcl = startexcl
- self.endexcl = endexcl
- self.boost = 1.0
- self.fieldname = None
- self.kwargs = {}
- def r(self):
- b1 = "{" if self.startexcl else "["
- b2 = "}" if self.endexcl else "]"
- return "%s%r %r%s" % (b1, self.start, self.end, b2)
- def query(self, parser):
- fieldname = self.fieldname or parser.fieldname
- start = self.start
- end = self.end
- if parser.schema and fieldname in parser.schema:
- field = parser.schema[fieldname]
- if field.self_parsing():
- try:
- q = field.parse_range(fieldname, start, end,
- self.startexcl, self.endexcl,
- boost=self.boost)
- if q is not None:
- return attach(q, self)
- except QueryParserError:
- e = sys.exc_info()[1]
- return attach(query.error_query(e), self)
- if start:
- start = get_single_text(field, start, tokenize=False,
- removestops=False)
- if end:
- end = get_single_text(field, end, tokenize=False,
- removestops=False)
- q = query.TermRange(fieldname, start, end, self.startexcl,
- self.endexcl, boost=self.boost)
- return attach(q, self)
- class TextNode(SyntaxNode):
- """Intermediate base class for basic nodes that search for text, such as
- term queries, wildcards, prefixes, etc.
- Instances have the following attributes:
- ``qclass``
- If a subclass does not override ``query()``, the base class will use
- this class to construct the query.
- ``tokenize``
- If True and the subclass does not override ``query()``, the node's text
- will be tokenized before constructing the query
- ``removestops``
- If True and the subclass does not override ``query()``, and the field's
- analyzer has a stop word filter, stop words will be removed from the
- text before constructing the query.
- """
- has_fieldname = True
- has_text = True
- has_boost = True
- qclass = None
- tokenize = False
- removestops = False
- def __init__(self, text):
- self.fieldname = None
- self.text = text
- self.boost = 1.0
- def r(self):
- return "%s %r" % (self.__class__.__name__, self.text)
- def is_text(self):
- return True
- def query(self, parser):
- fieldname = self.fieldname or parser.fieldname
- termclass = self.qclass or parser.termclass
- q = parser.term_query(fieldname, self.text, termclass,
- boost=self.boost, tokenize=self.tokenize,
- removestops=self.removestops)
- return attach(q, self)
- class WordNode(TextNode):
- """Syntax node for term queries.
- """
- tokenize = True
- removestops = True
- def r(self):
- return repr(self.text)
- # Operators
- class Operator(SyntaxNode):
- """Base class for PrefixOperator, PostfixOperator, and InfixOperator.
- Operators work by moving the nodes they apply to (e.g. for prefix operator,
- the previous node, for infix operator, the nodes on either side, etc.) into
- a group node. The group provides the code for what to do with the nodes.
- """
- def __init__(self, text, grouptype, leftassoc=True):
- """
- :param text: the text of the operator in the query string.
- :param grouptype: the type of group to create in place of the operator
- and the node(s) it operates on.
- :param leftassoc: for infix opeators, whether the operator is left
- associative. use ``leftassoc=False`` for right-associative infix
- operators.
- """
- self.text = text
- self.grouptype = grouptype
- self.leftassoc = leftassoc
- def r(self):
- return "OP %r" % self.text
- def replace_self(self, parser, group, position):
- """Called with the parser, a group, and the position at which the
- operator occurs in that group. Should return a group with the operator
- replaced by whatever effect the operator has (e.g. for an infix op,
- replace the op and the nodes on either side with a sub-group).
- """
- raise NotImplementedError
- class PrefixOperator(Operator):
- def replace_self(self, parser, group, position):
- length = len(group)
- del group[position]
- if position < length - 1:
- group[position] = self.grouptype([group[position]])
- return position
- class PostfixOperator(Operator):
- def replace_self(self, parser, group, position):
- del group[position]
- if position > 0:
- group[position - 1] = self.grouptype([group[position - 1]])
- return position
- class InfixOperator(Operator):
- def replace_self(self, parser, group, position):
- la = self.leftassoc
- gtype = self.grouptype
- merging = gtype.merging
- if position > 0 and position < len(group) - 1:
- left = group[position - 1]
- right = group[position + 1]
- # The first two clauses check whether the "strong" side is already
- # a group of the type we are going to create. If it is, we just
- # append the "weak" side to the "strong" side instead of creating
- # a new group inside the existing one. This is necessary because
- # we can quickly run into Python's recursion limit otherwise.
- if merging and la and isinstance(left, gtype):
- left.append(right)
- del group[position:position + 2]
- elif merging and not la and isinstance(right, gtype):
- right.insert(0, left)
- del group[position - 1:position + 1]
- return position - 1
- else:
- # Replace the operator and the two surrounding objects
- group[position - 1:position + 2] = [gtype([left, right])]
- else:
- del group[position]
- return position
- # Functions
- def to_word(n):
- node = WordNode(n.original)
- node.startchar = n.startchar
- node.endchar = n.endchar
- return node
|