syntax.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645
  1. # Copyright 2011 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. import sys, weakref
  28. from whoosh import query
  29. from whoosh.qparser.common import get_single_text, QueryParserError, attach
  30. class SyntaxNode(object):
  31. """Base class for nodes that make up the abstract syntax tree (AST) of a
  32. parsed user query string. The AST is an intermediate step, generated
  33. from the query string, then converted into a :class:`whoosh.query.Query`
  34. tree by calling the ``query()`` method on the nodes.
  35. Instances have the following required attributes:
  36. ``has_fieldname``
  37. True if this node has a ``fieldname`` attribute.
  38. ``has_text``
  39. True if this node has a ``text`` attribute
  40. ``has_boost``
  41. True if this node has a ``boost`` attribute.
  42. ``startchar``
  43. The character position in the original text at which this node started.
  44. ``endchar``
  45. The character position in the original text at which this node ended.
  46. """
  47. has_fieldname = False
  48. has_text = False
  49. has_boost = False
  50. _parent = None
  51. def __repr__(self):
  52. r = "<"
  53. if self.has_fieldname:
  54. r += "%r:" % self.fieldname
  55. r += self.r()
  56. if self.has_boost and self.boost != 1.0:
  57. r += " ^%s" % self.boost
  58. r += ">"
  59. return r
  60. def r(self):
  61. """Returns a basic representation of this node. The base class's
  62. ``__repr__`` method calls this, then does the extra busy work of adding
  63. fieldname and boost where appropriate.
  64. """
  65. return "%s %r" % (self.__class__.__name__, self.__dict__)
  66. def apply(self, fn):
  67. return self
  68. def accept(self, fn):
  69. def fn_wrapper(n):
  70. return fn(n.apply(fn_wrapper))
  71. return fn_wrapper(self)
  72. def query(self, parser):
  73. """Returns a :class:`whoosh.query.Query` instance corresponding to this
  74. syntax tree node.
  75. """
  76. raise NotImplementedError(self.__class__.__name__)
  77. def is_ws(self):
  78. """Returns True if this node is ignorable whitespace.
  79. """
  80. return False
  81. def is_text(self):
  82. return False
  83. def set_fieldname(self, name, override=False):
  84. """Sets the fieldname associated with this node. If ``override`` is
  85. False (the default), the fieldname will only be replaced if this node
  86. does not already have a fieldname set.
  87. For nodes that don't have a fieldname, this is a no-op.
  88. """
  89. if not self.has_fieldname:
  90. return
  91. if self.fieldname is None or override:
  92. self.fieldname = name
  93. return self
  94. def set_boost(self, boost):
  95. """Sets the boost associated with this node.
  96. For nodes that don't have a boost, this is a no-op.
  97. """
  98. if not self.has_boost:
  99. return
  100. self.boost = boost
  101. return self
  102. def set_range(self, startchar, endchar):
  103. """Sets the character range associated with this node.
  104. """
  105. self.startchar = startchar
  106. self.endchar = endchar
  107. return self
  108. # Navigation methods
  109. def parent(self):
  110. if self._parent:
  111. return self._parent()
  112. def next_sibling(self):
  113. p = self.parent()
  114. if p:
  115. return p.node_after(self)
  116. def prev_sibling(self):
  117. p = self.parent()
  118. if p:
  119. return p.node_before(self)
  120. def bake(self, parent):
  121. self._parent = weakref.ref(parent)
  122. class MarkerNode(SyntaxNode):
  123. """Base class for nodes that only exist to mark places in the tree.
  124. """
  125. def r(self):
  126. return self.__class__.__name__
  127. class Whitespace(MarkerNode):
  128. """Abstract syntax tree node for ignorable whitespace.
  129. """
  130. def r(self):
  131. return " "
  132. def is_ws(self):
  133. return True
  134. class FieldnameNode(SyntaxNode):
  135. """Abstract syntax tree node for field name assignments.
  136. """
  137. has_fieldname = True
  138. def __init__(self, fieldname, original):
  139. self.fieldname = fieldname
  140. self.original = original
  141. def __repr__(self):
  142. return "<%r:>" % self.fieldname
  143. class GroupNode(SyntaxNode):
  144. """Base class for abstract syntax tree node types that group together
  145. sub-nodes.
  146. Instances have the following attributes:
  147. ``merging``
  148. True if side-by-side instances of this group can be merged into a
  149. single group.
  150. ``qclass``
  151. If a subclass doesn't override ``query()``, the base class will simply
  152. wrap this class around the queries returned by the subnodes.
  153. This class implements a number of list methods for operating on the
  154. subnodes.
  155. """
  156. has_boost = True
  157. merging = True
  158. qclass = None
  159. def __init__(self, nodes=None, boost=1.0, **kwargs):
  160. self.nodes = nodes or []
  161. self.boost = boost
  162. self.kwargs = kwargs
  163. def r(self):
  164. return "%s %s" % (self.__class__.__name__,
  165. ", ".join(repr(n) for n in self.nodes))
  166. @property
  167. def startchar(self):
  168. if not self.nodes:
  169. return None
  170. return self.nodes[0].startchar
  171. @property
  172. def endchar(self):
  173. if not self.nodes:
  174. return None
  175. return self.nodes[-1].endchar
  176. def apply(self, fn):
  177. return self.__class__(self.type, [fn(node) for node in self.nodes],
  178. boost=self.boost, **self.kwargs)
  179. def query(self, parser):
  180. subs = []
  181. for node in self.nodes:
  182. subq = node.query(parser)
  183. if subq is not None:
  184. subs.append(subq)
  185. q = self.qclass(subs, boost=self.boost, **self.kwargs)
  186. return attach(q, self)
  187. def empty_copy(self):
  188. """Returns an empty copy of this group.
  189. This is used in the common pattern where a filter creates an new
  190. group and then adds nodes from the input group to it if they meet
  191. certain criteria, then returns the new group::
  192. def remove_whitespace(parser, group):
  193. newgroup = group.empty_copy()
  194. for node in group:
  195. if not node.is_ws():
  196. newgroup.append(node)
  197. return newgroup
  198. """
  199. c = self.__class__(**self.kwargs)
  200. if self.has_boost:
  201. c.boost = self.boost
  202. if self.has_fieldname:
  203. c.fieldname = self.fieldname
  204. if self.has_text:
  205. c.text = self.text
  206. return c
  207. def set_fieldname(self, name, override=False):
  208. SyntaxNode.set_fieldname(self, name, override=override)
  209. for node in self.nodes:
  210. node.set_fieldname(name, override=override)
  211. def set_range(self, startchar, endchar):
  212. for node in self.nodes:
  213. node.set_range(startchar, endchar)
  214. return self
  215. # List-like methods
  216. def __nonzero__(self):
  217. return bool(self.nodes)
  218. __bool__ = __nonzero__
  219. def __iter__(self):
  220. return iter(self.nodes)
  221. def __len__(self):
  222. return len(self.nodes)
  223. def __getitem__(self, n):
  224. return self.nodes.__getitem__(n)
  225. def __setitem__(self, n, v):
  226. self.nodes.__setitem__(n, v)
  227. def __delitem__(self, n):
  228. self.nodes.__delitem__(n)
  229. def insert(self, n, v):
  230. self.nodes.insert(n, v)
  231. def append(self, v):
  232. self.nodes.append(v)
  233. def extend(self, vs):
  234. self.nodes.extend(vs)
  235. def pop(self, *args, **kwargs):
  236. return self.nodes.pop(*args, **kwargs)
  237. def reverse(self):
  238. self.nodes.reverse()
  239. def index(self, v):
  240. return self.nodes.index(v)
  241. # Navigation methods
  242. def bake(self, parent):
  243. SyntaxNode.bake(self, parent)
  244. for node in self.nodes:
  245. node.bake(self)
  246. def node_before(self, n):
  247. try:
  248. i = self.nodes.index(n)
  249. except ValueError:
  250. return
  251. if i > 0:
  252. return self.nodes[i - 1]
  253. def node_after(self, n):
  254. try:
  255. i = self.nodes.index(n)
  256. except ValueError:
  257. return
  258. if i < len(self.nodes) - 2:
  259. return self.nodes[i + 1]
  260. class BinaryGroup(GroupNode):
  261. """Intermediate base class for group nodes that have two subnodes and
  262. whose ``qclass`` initializer takes two arguments instead of a list.
  263. """
  264. merging = False
  265. has_boost = False
  266. def query(self, parser):
  267. assert len(self.nodes) == 2
  268. qa = self.nodes[0].query(parser)
  269. qb = self.nodes[1].query(parser)
  270. if qa is None and qb is None:
  271. q = query.NullQuery
  272. elif qa is None:
  273. q = qb
  274. elif qb is None:
  275. q = qa
  276. else:
  277. q = self.qclass(self.nodes[0].query(parser),
  278. self.nodes[1].query(parser))
  279. return attach(q, self)
  280. class Wrapper(GroupNode):
  281. """Intermediate base class for nodes that wrap a single sub-node.
  282. """
  283. merging = False
  284. def query(self, parser):
  285. q = self.nodes[0].query(parser)
  286. if q:
  287. return attach(self.qclass(q), self)
  288. class ErrorNode(SyntaxNode):
  289. def __init__(self, message, node=None):
  290. self.message = message
  291. self.node = node
  292. def r(self):
  293. return "ERR %r %r" % (self.node, self.message)
  294. @property
  295. def startchar(self):
  296. return self.node.startchar
  297. @property
  298. def endchar(self):
  299. return self.node.endchar
  300. def query(self, parser):
  301. if self.node:
  302. q = self.node.query(parser)
  303. else:
  304. q = query.NullQuery
  305. return attach(query.error_query(self.message, q), self)
  306. class AndGroup(GroupNode):
  307. qclass = query.And
  308. class OrGroup(GroupNode):
  309. qclass = query.Or
  310. @classmethod
  311. def factory(cls, scale=1.0):
  312. class ScaledOrGroup(OrGroup):
  313. def __init__(self, nodes=None, **kwargs):
  314. if "scale" in kwargs:
  315. del kwargs["scale"]
  316. super(ScaledOrGroup, self).__init__(nodes=nodes, scale=scale,
  317. **kwargs)
  318. return ScaledOrGroup
  319. class DisMaxGroup(GroupNode):
  320. qclass = query.DisjunctionMax
  321. class OrderedGroup(GroupNode):
  322. qclass = query.Ordered
  323. class AndNotGroup(BinaryGroup):
  324. qclass = query.AndNot
  325. class AndMaybeGroup(BinaryGroup):
  326. qclass = query.AndMaybe
  327. class RequireGroup(BinaryGroup):
  328. qclass = query.Require
  329. class NotGroup(Wrapper):
  330. qclass = query.Not
  331. class RangeNode(SyntaxNode):
  332. """Syntax node for range queries.
  333. """
  334. has_fieldname = True
  335. def __init__(self, start, end, startexcl, endexcl):
  336. self.start = start
  337. self.end = end
  338. self.startexcl = startexcl
  339. self.endexcl = endexcl
  340. self.boost = 1.0
  341. self.fieldname = None
  342. self.kwargs = {}
  343. def r(self):
  344. b1 = "{" if self.startexcl else "["
  345. b2 = "}" if self.endexcl else "]"
  346. return "%s%r %r%s" % (b1, self.start, self.end, b2)
  347. def query(self, parser):
  348. fieldname = self.fieldname or parser.fieldname
  349. start = self.start
  350. end = self.end
  351. if parser.schema and fieldname in parser.schema:
  352. field = parser.schema[fieldname]
  353. if field.self_parsing():
  354. try:
  355. q = field.parse_range(fieldname, start, end,
  356. self.startexcl, self.endexcl,
  357. boost=self.boost)
  358. if q is not None:
  359. return attach(q, self)
  360. except QueryParserError:
  361. e = sys.exc_info()[1]
  362. return attach(query.error_query(e), self)
  363. if start:
  364. start = get_single_text(field, start, tokenize=False,
  365. removestops=False)
  366. if end:
  367. end = get_single_text(field, end, tokenize=False,
  368. removestops=False)
  369. q = query.TermRange(fieldname, start, end, self.startexcl,
  370. self.endexcl, boost=self.boost)
  371. return attach(q, self)
  372. class TextNode(SyntaxNode):
  373. """Intermediate base class for basic nodes that search for text, such as
  374. term queries, wildcards, prefixes, etc.
  375. Instances have the following attributes:
  376. ``qclass``
  377. If a subclass does not override ``query()``, the base class will use
  378. this class to construct the query.
  379. ``tokenize``
  380. If True and the subclass does not override ``query()``, the node's text
  381. will be tokenized before constructing the query
  382. ``removestops``
  383. If True and the subclass does not override ``query()``, and the field's
  384. analyzer has a stop word filter, stop words will be removed from the
  385. text before constructing the query.
  386. """
  387. has_fieldname = True
  388. has_text = True
  389. has_boost = True
  390. qclass = None
  391. tokenize = False
  392. removestops = False
  393. def __init__(self, text):
  394. self.fieldname = None
  395. self.text = text
  396. self.boost = 1.0
  397. def r(self):
  398. return "%s %r" % (self.__class__.__name__, self.text)
  399. def is_text(self):
  400. return True
  401. def query(self, parser):
  402. fieldname = self.fieldname or parser.fieldname
  403. termclass = self.qclass or parser.termclass
  404. q = parser.term_query(fieldname, self.text, termclass,
  405. boost=self.boost, tokenize=self.tokenize,
  406. removestops=self.removestops)
  407. return attach(q, self)
  408. class WordNode(TextNode):
  409. """Syntax node for term queries.
  410. """
  411. tokenize = True
  412. removestops = True
  413. def r(self):
  414. return repr(self.text)
  415. # Operators
  416. class Operator(SyntaxNode):
  417. """Base class for PrefixOperator, PostfixOperator, and InfixOperator.
  418. Operators work by moving the nodes they apply to (e.g. for prefix operator,
  419. the previous node, for infix operator, the nodes on either side, etc.) into
  420. a group node. The group provides the code for what to do with the nodes.
  421. """
  422. def __init__(self, text, grouptype, leftassoc=True):
  423. """
  424. :param text: the text of the operator in the query string.
  425. :param grouptype: the type of group to create in place of the operator
  426. and the node(s) it operates on.
  427. :param leftassoc: for infix opeators, whether the operator is left
  428. associative. use ``leftassoc=False`` for right-associative infix
  429. operators.
  430. """
  431. self.text = text
  432. self.grouptype = grouptype
  433. self.leftassoc = leftassoc
  434. def r(self):
  435. return "OP %r" % self.text
  436. def replace_self(self, parser, group, position):
  437. """Called with the parser, a group, and the position at which the
  438. operator occurs in that group. Should return a group with the operator
  439. replaced by whatever effect the operator has (e.g. for an infix op,
  440. replace the op and the nodes on either side with a sub-group).
  441. """
  442. raise NotImplementedError
  443. class PrefixOperator(Operator):
  444. def replace_self(self, parser, group, position):
  445. length = len(group)
  446. del group[position]
  447. if position < length - 1:
  448. group[position] = self.grouptype([group[position]])
  449. return position
  450. class PostfixOperator(Operator):
  451. def replace_self(self, parser, group, position):
  452. del group[position]
  453. if position > 0:
  454. group[position - 1] = self.grouptype([group[position - 1]])
  455. return position
  456. class InfixOperator(Operator):
  457. def replace_self(self, parser, group, position):
  458. la = self.leftassoc
  459. gtype = self.grouptype
  460. merging = gtype.merging
  461. if position > 0 and position < len(group) - 1:
  462. left = group[position - 1]
  463. right = group[position + 1]
  464. # The first two clauses check whether the "strong" side is already
  465. # a group of the type we are going to create. If it is, we just
  466. # append the "weak" side to the "strong" side instead of creating
  467. # a new group inside the existing one. This is necessary because
  468. # we can quickly run into Python's recursion limit otherwise.
  469. if merging and la and isinstance(left, gtype):
  470. left.append(right)
  471. del group[position:position + 2]
  472. elif merging and not la and isinstance(right, gtype):
  473. right.insert(0, left)
  474. del group[position - 1:position + 1]
  475. return position - 1
  476. else:
  477. # Replace the operator and the two surrounding objects
  478. group[position - 1:position + 2] = [gtype([left, right])]
  479. else:
  480. del group[position]
  481. return position
  482. # Functions
  483. def to_word(n):
  484. node = WordNode(n.original)
  485. node.startchar = n.startchar
  486. node.endchar = n.endchar
  487. return node