# Copyright 2011 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
import copy

from whoosh import query
from whoosh.compat import u
from whoosh.compat import iteritems, xrange
from whoosh.qparser import syntax
from whoosh.qparser.common import attach
from whoosh.qparser.taggers import RegexTagger, FnTagger
  35. class Plugin(object):
  36. """Base class for parser plugins.
  37. """
  38. def taggers(self, parser):
  39. """Should return a list of ``(Tagger, priority)`` tuples to add to the
  40. syntax the parser understands. Lower priorities run first.
  41. """
  42. return ()
  43. def filters(self, parser):
  44. """Should return a list of ``(filter_function, priority)`` tuples to
  45. add to parser. Lower priority numbers run first.
  46. Filter functions will be called with ``(parser, groupnode)`` and should
  47. return a group node.
  48. """
  49. return ()
  50. class TaggingPlugin(RegexTagger):
  51. """A plugin that also acts as a Tagger, to avoid having an extra Tagger
  52. class for simple cases.
  53. A TaggingPlugin object should have a ``priority`` attribute and either a
  54. ``nodetype`` attribute or a ``create()`` method. If the subclass doesn't
  55. override ``create()``, the base class will call ``self.nodetype`` with the
  56. Match object's named groups as keyword arguments.
  57. """
  58. priority = 0
  59. def __init__(self, expr=None):
  60. self.expr = rcompile(expr or self.expr)
  61. def taggers(self, parser):
  62. return [(self, self.priority)]
  63. def filters(self, parser):
  64. return ()
  65. def create(self, parser, match):
  66. # Groupdict keys can be unicode sometimes apparently? Convert them to
  67. # str for use as keyword arguments. This should be Py3-safe.
  68. kwargs = dict((str(k), v) for k, v in iteritems(match.groupdict()))
  69. return self.nodetype(**kwargs)
  70. class WhitespacePlugin(TaggingPlugin):
  71. """Tags whitespace and removes it at priority 500. Depending on whether
  72. your plugin's filter wants to see where whitespace was in the original
  73. query, it should run with priority lower than 500 (before removal of
  74. whitespace) or higher than 500 (after removal of whitespace).
  75. """
  76. nodetype = syntax.Whitespace
  77. priority = 100
  78. def __init__(self, expr=r"\s+"):
  79. TaggingPlugin.__init__(self, expr)
  80. def filters(self, parser):
  81. return [(self.remove_whitespace, 500)]
  82. def remove_whitespace(self, parser, group):
  83. newgroup = group.empty_copy()
  84. for node in group:
  85. if isinstance(node, syntax.GroupNode):
  86. newgroup.append(self.remove_whitespace(parser, node))
  87. elif not node.is_ws():
  88. newgroup.append(node)
  89. return newgroup
  90. class SingleQuotePlugin(TaggingPlugin):
  91. """Adds the ability to specify single "terms" containing spaces by
  92. enclosing them in single quotes.
  93. """
  94. expr = r"(^|(?<=\W))'(?P<text>.*?)'(?=\s|\]|[)}]|$)"
  95. nodetype = syntax.WordNode
  96. class PrefixPlugin(TaggingPlugin):
  97. """Adds the ability to specify prefix queries by ending a term with an
  98. asterisk.
  99. This plugin is useful if you want the user to be able to create prefix but
  100. not wildcard queries (for performance reasons). If you are including the
  101. wildcard plugin, you should not include this plugin as well.
  102. >>> qp = qparser.QueryParser("content", myschema)
  103. >>> qp.remove_plugin_class(qparser.WildcardPlugin)
  104. >>> qp.add_plugin(qparser.PrefixPlugin())
  105. >>> q = qp.parse("pre*")
  106. """
  107. class PrefixNode(syntax.TextNode):
  108. qclass = query.Prefix
  109. def r(self):
  110. return "%r*" % self.text
  111. expr = "(?P<text>[^ \t\r\n*]+)[*](?= |$|\\))"
  112. nodetype = PrefixNode
  113. class WildcardPlugin(TaggingPlugin):
  114. # \u055E = Armenian question mark
  115. # \u061F = Arabic question mark
  116. # \u1367 = Ethiopic question mark
  117. qmarks = u("?\u055E\u061F\u1367")
  118. expr = "(?P<text>[*%s])" % qmarks
  119. def filters(self, parser):
  120. # Run early, but definitely before multifield plugin
  121. return [(self.do_wildcards, 50)]
  122. def do_wildcards(self, parser, group):
  123. i = 0
  124. while i < len(group):
  125. node = group[i]
  126. if isinstance(node, self.WildcardNode):
  127. if i < len(group) - 1 and group[i + 1].is_text():
  128. nextnode = group.pop(i + 1)
  129. node.text += nextnode.text
  130. if i > 0 and group[i - 1].is_text():
  131. prevnode = group.pop(i - 1)
  132. node.text = prevnode.text + node.text
  133. else:
  134. i += 1
  135. else:
  136. if isinstance(node, syntax.GroupNode):
  137. self.do_wildcards(parser, node)
  138. i += 1
  139. for i in xrange(len(group)):
  140. node = group[i]
  141. if isinstance(node, self.WildcardNode):
  142. text = node.text
  143. if len(text) > 1 and not any(qm in text for qm in self.qmarks):
  144. if text.find("*") == len(text) - 1:
  145. newnode = PrefixPlugin.PrefixNode(text[:-1])
  146. newnode.startchar = node.startchar
  147. newnode.endchar = node.endchar
  148. group[i] = newnode
  149. return group
  150. class WildcardNode(syntax.TextNode):
  151. # Note that this node inherits tokenize = False from TextNode,
  152. # so the text in this node will not be analyzed... just passed
  153. # straight to the query
  154. qclass = query.Wildcard
  155. def r(self):
  156. return "Wild %r" % self.text
  157. nodetype = WildcardNode
  158. class RegexPlugin(TaggingPlugin):
  159. """Adds the ability to specify regular expression term queries.
  160. The default syntax for a regular expression term is ``r"termexpr"``.
  161. >>> qp = qparser.QueryParser("content", myschema)
  162. >>> qp.add_plugin(qparser.RegexPlugin())
  163. >>> q = qp.parse('foo title:r"bar+"')
  164. """
  165. class RegexNode(syntax.TextNode):
  166. qclass = query.Regex
  167. def r(self):
  168. return "Regex %r" % self.text
  169. expr = 'r"(?P<text>[^"]*)"'
  170. nodetype = RegexNode
  171. class BoostPlugin(TaggingPlugin):
  172. """Adds the ability to boost clauses of the query using the circumflex.
  173. >>> qp = qparser.QueryParser("content", myschema)
  174. >>> q = qp.parse("hello there^2")
  175. """
  176. expr = "\\^(?P<boost>[0-9]*(\\.[0-9]+)?)($|(?=[ \t\r\n)]))"
  177. class BoostNode(syntax.SyntaxNode):
  178. def __init__(self, original, boost):
  179. self.original = original
  180. self.boost = boost
  181. def r(self):
  182. return "^ %s" % self.boost
  183. def create(self, parser, match):
  184. # Override create so we can grab group 0
  185. original = match.group(0)
  186. try:
  187. boost = float(match.group("boost"))
  188. except ValueError:
  189. # The text after the ^ wasn't a valid number, so turn it into a
  190. # word
  191. node = syntax.WordNode(original)
  192. else:
  193. node = self.BoostNode(original, boost)
  194. return node
  195. def filters(self, parser):
  196. return [(self.clean_boost, 0), (self.do_boost, 510)]
  197. def clean_boost(self, parser, group):
  198. """This filter finds any BoostNodes in positions where they can't boost
  199. the previous node (e.g. at the very beginning, after whitespace, or
  200. after another BoostNode) and turns them into WordNodes.
  201. """
  202. bnode = self.BoostNode
  203. for i, node in enumerate(group):
  204. if isinstance(node, bnode):
  205. if (not i or not group[i - 1].has_boost):
  206. group[i] = syntax.to_word(node)
  207. return group
  208. def do_boost(self, parser, group):
  209. """This filter finds BoostNodes and applies the boost to the previous
  210. node.
  211. """
  212. newgroup = group.empty_copy()
  213. for node in group:
  214. if isinstance(node, syntax.GroupNode):
  215. node = self.do_boost(parser, node)
  216. elif isinstance(node, self.BoostNode):
  217. if (newgroup and newgroup[-1].has_boost):
  218. # Apply the BoostNode's boost to the previous node
  219. newgroup[-1].set_boost(node.boost)
  220. # Skip adding the BoostNode to the new group
  221. continue
  222. else:
  223. node = syntax.to_word(node)
  224. newgroup.append(node)
  225. return newgroup
  226. class GroupPlugin(Plugin):
  227. """Adds the ability to group clauses using parentheses.
  228. """
  229. # Marker nodes for open and close bracket
  230. class OpenBracket(syntax.SyntaxNode):
  231. def r(self):
  232. return "("
  233. class CloseBracket(syntax.SyntaxNode):
  234. def r(self):
  235. return ")"
  236. def __init__(self, openexpr="[(]", closeexpr="[)]"):
  237. self.openexpr = openexpr
  238. self.closeexpr = closeexpr
  239. def taggers(self, parser):
  240. return [(FnTagger(self.openexpr, self.OpenBracket, "openB"), 0),
  241. (FnTagger(self.closeexpr, self.CloseBracket, "closeB"), 0)]
  242. def filters(self, parser):
  243. return [(self.do_groups, 0)]
  244. def do_groups(self, parser, group):
  245. """This filter finds open and close bracket markers in a flat group
  246. and uses them to organize the nodes into a hierarchy.
  247. """
  248. ob, cb = self.OpenBracket, self.CloseBracket
  249. # Group hierarchy stack
  250. stack = [parser.group()]
  251. for node in group:
  252. if isinstance(node, ob):
  253. # Open bracket: push a new level of hierarchy on the stack
  254. stack.append(parser.group())
  255. elif isinstance(node, cb):
  256. # Close bracket: pop the current level of hierarchy and append
  257. # it to the previous level
  258. if len(stack) > 1:
  259. last = stack.pop()
  260. stack[-1].append(last)
  261. else:
  262. # Anything else: add it to the current level of hierarchy
  263. stack[-1].append(node)
  264. top = stack[0]
  265. # If the parens were unbalanced (more opens than closes), just take
  266. # whatever levels of hierarchy were left on the stack and tack them on
  267. # the end of the top-level
  268. if len(stack) > 1:
  269. for ls in stack[1:]:
  270. top.extend(ls)
  271. if len(top) == 1 and isinstance(top[0], syntax.GroupNode):
  272. boost = top.boost
  273. top = top[0]
  274. top.boost = boost
  275. return top
  276. class EveryPlugin(TaggingPlugin):
  277. expr = "[*]:[*]"
  278. priority = -1
  279. def create(self, parser, match):
  280. return self.EveryNode()
  281. class EveryNode(syntax.SyntaxNode):
  282. def r(self):
  283. return "*:*"
  284. def query(self, parser):
  285. return query.Every()
  286. class FieldsPlugin(TaggingPlugin):
  287. """Adds the ability to specify the field of a clause.
  288. """
  289. class FieldnameTagger(RegexTagger):
  290. def create(self, parser, match):
  291. return syntax.FieldnameNode(match.group("text"), match.group(0))
  292. def __init__(self, expr=r"(?P<text>\w+|[*]):", remove_unknown=True):
  293. """
  294. :param expr: the regular expression to use for tagging fields.
  295. :param remove_unknown: if True, converts field specifications for
  296. fields that aren't in the schema into regular text.
  297. """
  298. self.expr = expr
  299. self.removeunknown = remove_unknown
  300. def taggers(self, parser):
  301. return [(self.FieldnameTagger(self.expr), 0)]
  302. def filters(self, parser):
  303. return [(self.do_fieldnames, 100)]
  304. def do_fieldnames(self, parser, group):
  305. """This filter finds FieldnameNodes in the tree and applies their
  306. fieldname to the next node.
  307. """
  308. fnclass = syntax.FieldnameNode
  309. if self.removeunknown and parser.schema:
  310. # Look for field nodes that aren't in the schema and convert them
  311. # to text
  312. schema = parser.schema
  313. newgroup = group.empty_copy()
  314. prev_field_node = None
  315. for node in group:
  316. if isinstance(node, fnclass) and node.fieldname not in schema:
  317. prev_field_node = node
  318. continue
  319. elif prev_field_node:
  320. # If prev_field_node is not None, it contains a field node
  321. # that appeared before this node but isn't in the schema,
  322. # so we'll convert it to text here
  323. if node.has_text:
  324. node.text = prev_field_node.original + node.text
  325. else:
  326. newgroup.append(syntax.to_word(prev_field_node))
  327. prev_field_node = None
  328. newgroup.append(node)
  329. if prev_field_node:
  330. newgroup.append(syntax.to_word(prev_field_node))
  331. group = newgroup
  332. newgroup = group.empty_copy()
  333. # Iterate backwards through the stream, looking for field-able objects
  334. # with field nodes in front of them
  335. i = len(group)
  336. while i > 0:
  337. i -= 1
  338. node = group[i]
  339. if isinstance(node, fnclass):
  340. # If we see a fieldname node, it must not have been in front
  341. # of something fieldable, since we would have already removed
  342. # it (since we're iterating backwards), so convert it to text
  343. node = syntax.to_word(node)
  344. elif isinstance(node, syntax.GroupNode):
  345. node = self.do_fieldnames(parser, node)
  346. if i > 0 and not node.is_ws() and isinstance(group[i - 1],
  347. fnclass):
  348. node.set_fieldname(group[i - 1].fieldname, override=False)
  349. i -= 1
  350. newgroup.append(node)
  351. newgroup.reverse()
  352. return newgroup
  353. class FuzzyTermPlugin(TaggingPlugin):
  354. """Adds syntax to the query parser to create "fuzzy" term queries, which
  355. match any term within a certain "edit distance" (number of inserted,
  356. deleted, or transposed characters) by appending a tilde (``~``) and an
  357. optional maximum edit distance to a term. If you don't specify an explicit
  358. maximum edit distance, the default is 1.
  359. >>> qp = qparser.QueryParser("content", myschema)
  360. >>> qp.add_plugin(qparser.FuzzyTermPlugin())
  361. >>> q = qp.parse("Stephen~2 Colbert")
  362. For example, the following query creates a :class:`whoosh.query.FuzzyTerm`
  363. query with a maximum edit distance of 1::
  364. bob~
  365. The following creates a fuzzy term query with a maximum edit distance of
  366. 2::
  367. bob~2
  368. The maximum edit distance can only be a single digit. Note that edit
  369. distances greater than 2 can take an extremely long time and are generally
  370. not useful.
  371. You can specify a prefix length using ``~n/m``. For example, to allow a
  372. maximum edit distance of 2 and require a prefix match of 3 characters::
  373. johannson~2/3
  374. To specify a prefix with the default edit distance::
  375. johannson~/3
  376. """
  377. expr = rcompile("""
  378. (?<=\\S) # Only match right after non-space
  379. ~ # Initial tilde
  380. (?P<maxdist>[0-9])? # Optional maxdist
  381. (/ # Optional prefix slash
  382. (?P<prefix>[1-9][0-9]*) # prefix
  383. )? # (end prefix group)
  384. """, verbose=True)
  385. class FuzzinessNode(syntax.SyntaxNode):
  386. def __init__(self, maxdist, prefixlength, original):
  387. self.maxdist = maxdist
  388. self.prefixlength = prefixlength
  389. self.original = original
  390. def __repr__(self):
  391. return "<~%d/%d>" % (self.maxdist, self.prefixlength)
  392. class FuzzyTermNode(syntax.TextNode):
  393. qclass = query.FuzzyTerm
  394. def __init__(self, wordnode, maxdist, prefixlength):
  395. self.fieldname = wordnode.fieldname
  396. self.text = wordnode.text
  397. self.boost = wordnode.boost
  398. self.startchar = wordnode.startchar
  399. self.endchar = wordnode.endchar
  400. self.maxdist = maxdist
  401. self.prefixlength = prefixlength
  402. def r(self):
  403. return "%r ~%d/%d" % (self.text, self.maxdist, self.prefixlength)
  404. def query(self, parser):
  405. # Use the superclass's query() method to create a FuzzyTerm query
  406. # (it looks at self.qclass), just because it takes care of some
  407. # extra checks and attributes
  408. q = syntax.TextNode.query(self, parser)
  409. # Set FuzzyTerm-specific attributes
  410. q.maxdist = self.maxdist
  411. q.prefixlength = self.prefixlength
  412. return q
  413. def create(self, parser, match):
  414. mdstr = match.group("maxdist")
  415. maxdist = int(mdstr) if mdstr else 1
  416. pstr = match.group("prefix")
  417. prefixlength = int(pstr) if pstr else 0
  418. return self.FuzzinessNode(maxdist, prefixlength, match.group(0))
  419. def filters(self, parser):
  420. return [(self.do_fuzzyterms, 0)]
  421. def do_fuzzyterms(self, parser, group):
  422. newgroup = group.empty_copy()
  423. i = 0
  424. while i < len(group):
  425. node = group[i]
  426. if i < len(group) - 1 and isinstance(node, syntax.WordNode):
  427. nextnode = group[i + 1]
  428. if isinstance(nextnode, self.FuzzinessNode):
  429. node = self.FuzzyTermNode(node, nextnode.maxdist,
  430. nextnode.prefixlength)
  431. i += 1
  432. if isinstance(node, self.FuzzinessNode):
  433. node = syntax.to_word(node)
  434. if isinstance(node, syntax.GroupNode):
  435. node = self.do_fuzzyterms(parser, node)
  436. newgroup.append(node)
  437. i += 1
  438. return newgroup
  439. class FunctionPlugin(TaggingPlugin):
  440. """Adds an abitrary "function call" syntax to the query parser to allow
  441. advanced and extensible query functionality.
  442. This is unfinished and experimental.
  443. """
  444. expr = rcompile("""
  445. [#](?P<name>[A-Za-z_][A-Za-z0-9._]*) # function name
  446. ( # optional args
  447. \\[ # inside square brackets
  448. (?P<args>.*?)
  449. \\]
  450. )?
  451. """, verbose=True)
  452. class FunctionNode(syntax.SyntaxNode):
  453. has_fieldname = False
  454. has_boost = True
  455. merging = False
  456. def __init__(self, name, fn, args, kwargs):
  457. self.name = name
  458. self.fn = fn
  459. self.args = args
  460. self.kwargs = kwargs
  461. self.nodes = []
  462. self.boost = None
  463. def __repr__(self):
  464. return "#%s<%r>(%r)" % (self.name, self.args, self.nodes)
  465. def query(self, parser):
  466. qs = [n.query(parser) for n in self.nodes]
  467. kwargs = self.kwargs
  468. if "boost" not in kwargs and self.boost is not None:
  469. kwargs["boost"] = self.boost
  470. # TODO: If this call raises an exception, return an error query
  471. return self.fn(qs, *self.args, **self.kwargs)
  472. def __init__(self, fns):
  473. """
  474. :param fns: a dictionary mapping names to functions that return a
  475. query.
  476. """
  477. self.fns = fns
  478. def create(self, parser, match):
  479. name = match.group("name")
  480. if name in self.fns:
  481. fn = self.fns[name]
  482. argstring = match.group("args")
  483. if argstring:
  484. args, kwargs = self._parse_args(argstring)
  485. else:
  486. args = ()
  487. kwargs = {}
  488. return self.FunctionNode(name, fn, args, kwargs)
  489. def _parse_args(self, argstring):
  490. args = []
  491. kwargs = {}
  492. parts = argstring.split(",")
  493. for part in parts:
  494. if "=" in part:
  495. name, value = part.split("=", 1)
  496. # Wrap with str() because Python 2.5 can't handle unicode kws
  497. name = str(name.strip())
  498. else:
  499. name = None
  500. value = part
  501. value = value.strip()
  502. if value.startswith("'") and value.endswith("'"):
  503. value = value[1:-1]
  504. if name:
  505. kwargs[name] = value
  506. else:
  507. args.append(value)
  508. return args, kwargs
  509. def filters(self, parser):
  510. return [(self.do_functions, 600)]
  511. def do_functions(self, parser, group):
  512. newgroup = group.empty_copy()
  513. i = 0
  514. while i < len(group):
  515. node = group[i]
  516. if (isinstance(node, self.FunctionNode)
  517. and i < len(group) - 1
  518. and isinstance(group[i + 1], syntax.GroupNode)):
  519. nextnode = group[i + 1]
  520. node.nodes = list(self.do_functions(parser, nextnode))
  521. if nextnode.boost != 1:
  522. node.set_boost(nextnode.boost)
  523. i += 1
  524. elif isinstance(node, syntax.GroupNode):
  525. node = self.do_functions(parser, node)
  526. newgroup.append(node)
  527. i += 1
  528. return newgroup
  529. class PhrasePlugin(Plugin):
  530. """Adds the ability to specify phrase queries inside double quotes.
  531. """
  532. # Didn't use TaggingPlugin because I need to add slop parsing at some
  533. # point
  534. # Expression used to find words if a schema isn't available
  535. wordexpr = rcompile(r'\S+')
  536. class PhraseNode(syntax.TextNode):
  537. def __init__(self, text, textstartchar, slop=1):
  538. syntax.TextNode.__init__(self, text)
  539. self.textstartchar = textstartchar
  540. self.slop = slop
  541. def r(self):
  542. return "%s %r~%s" % (self.__class__.__name__, self.text, self.slop)
  543. def apply(self, fn):
  544. return self.__class__(self.type, [fn(node) for node in self.nodes],
  545. slop=self.slop, boost=self.boost)
  546. def query(self, parser):
  547. text = self.text
  548. fieldname = self.fieldname or parser.fieldname
  549. # We want to process the text of the phrase into "words" (tokens),
  550. # and also record the startchar and endchar of each word
  551. sc = self.textstartchar
  552. if parser.schema and fieldname in parser.schema:
  553. field = parser.schema[fieldname]
  554. if field.analyzer:
  555. # We have a field with an analyzer, so use it to parse
  556. # the phrase into tokens
  557. tokens = field.tokenize(text, mode="query", chars=True)
  558. words = []
  559. char_ranges = []
  560. for t in tokens:
  561. words.append(t.text)
  562. char_ranges.append((sc + t.startchar, sc + t.endchar))
  563. else:
  564. # We have a field but it doesn't have a format object,
  565. # for some reason (it's self-parsing?), so use process_text
  566. # to get the texts (we won't know the start/end chars)
  567. words = list(field.process_text(text, mode="query"))
  568. char_ranges = [(None, None)] * len(words)
  569. else:
  570. # We're parsing without a schema, so just use the default
  571. # regular expression to break the text into words
  572. words = []
  573. char_ranges = []
  574. for match in PhrasePlugin.wordexpr.finditer(text):
  575. words.append(match.group(0))
  576. char_ranges.append((sc + match.start(), sc + match.end()))
  577. qclass = parser.phraseclass
  578. q = qclass(fieldname, words, slop=self.slop, boost=self.boost,
  579. char_ranges=char_ranges)
  580. return attach(q, self)
  581. class PhraseTagger(RegexTagger):
  582. def create(self, parser, match):
  583. text = match.group("text")
  584. textstartchar = match.start("text")
  585. slopstr = match.group("slop")
  586. slop = int(slopstr) if slopstr else 1
  587. return PhrasePlugin.PhraseNode(text, textstartchar, slop)
  588. def __init__(self, expr='"(?P<text>.*?)"(~(?P<slop>[1-9][0-9]*))?'):
  589. self.expr = expr
  590. def taggers(self, parser):
  591. return [(self.PhraseTagger(self.expr), 0)]
  592. class SequencePlugin(Plugin):
  593. """Adds the ability to group arbitrary queries inside double quotes to
  594. produce a query matching the individual sub-queries in sequence.
  595. To enable this plugin, first remove the default PhrasePlugin, then add
  596. this plugin::
  597. qp = qparser.QueryParser("field", my_schema)
  598. qp.remove_plugin_class(qparser.PhrasePlugin)
  599. qp.add_plugin(qparser.SequencePlugin())
  600. This enables parsing "phrases" such as::
  601. "(jon OR john OR jonathan~1) smith*"
  602. """
  603. def __init__(self, expr='["](~(?P<slop>[1-9][0-9]*))?'):
  604. """
  605. :param expr: a regular expression for the marker at the start and end
  606. of a phrase. The default is the double-quotes character.
  607. """
  608. self.expr = expr
  609. class SequenceNode(syntax.GroupNode):
  610. qclass = query.Sequence
  611. class QuoteNode(syntax.MarkerNode):
  612. def __init__(self, slop=None):
  613. self.slop = int(slop) if slop else 1
  614. def taggers(self, parser):
  615. return [(FnTagger(self.expr, self.QuoteNode, "quote"), 0)]
  616. def filters(self, parser):
  617. return [(self.do_quotes, 550)]
  618. def do_quotes(self, parser, group):
  619. # New group to copy nodes into
  620. newgroup = group.empty_copy()
  621. # Buffer for sequence nodes; when it's None, it means we're not in
  622. # a sequence
  623. seq = None
  624. # Start copying nodes from group to newgroup. When we find a quote
  625. # node, start copying nodes into the buffer instead. When we find
  626. # the next (end) quote, put the buffered nodes into a SequenceNode
  627. # and add it to newgroup.
  628. for node in group:
  629. if isinstance(node, syntax.GroupNode):
  630. # Recurse
  631. node = self.do_quotes(parser, node)
  632. if isinstance(node, self.QuoteNode):
  633. if seq is None:
  634. # Start a new sequence
  635. seq = []
  636. else:
  637. # End the current sequence
  638. sn = self.SequenceNode(seq, slop=node.slop)
  639. newgroup.append(sn)
  640. seq = None
  641. elif seq is None:
  642. # Not in a sequence, add directly
  643. newgroup.append(node)
  644. else:
  645. # In a sequence, add it to the buffer
  646. seq.append(node)
  647. # We can end up with buffered nodes if there was an unbalanced quote;
  648. # just add the buffered nodes directly to newgroup
  649. if seq is not None:
  650. newgroup.extend(seq)
  651. return newgroup
  652. class RangePlugin(Plugin):
  653. """Adds the ability to specify term ranges.
  654. """
  655. expr = rcompile(r"""
  656. (?P<open>\{|\[) # Open paren
  657. (?P<start>
  658. ('[^']*?'\s+) # single-quoted
  659. | # or
  660. ([^\]}]+?(?=[Tt][Oo])) # everything until "to"
  661. )?
  662. [Tt][Oo] # "to"
  663. (?P<end>
  664. (\s+'[^']*?') # single-quoted
  665. | # or
  666. ([^\]}]+?) # everything until "]" or "}"
  667. )?
  668. (?P<close>}|]) # Close paren
  669. """, verbose=True)
  670. class RangeTagger(RegexTagger):
  671. def __init__(self, expr, excl_start, excl_end):
  672. self.expr = expr
  673. self.excl_start = excl_start
  674. self.excl_end = excl_end
  675. def create(self, parser, match):
  676. start = match.group("start")
  677. end = match.group("end")
  678. if start:
  679. # Strip the space before the "to"
  680. start = start.rstrip()
  681. # Strip single quotes
  682. if start.startswith("'") and start.endswith("'"):
  683. start = start[1:-1]
  684. if end:
  685. # Strip the space before the "to"
  686. end = end.lstrip()
  687. # Strip single quotes
  688. if end.startswith("'") and end.endswith("'"):
  689. end = end[1:-1]
  690. # What kind of open and close brackets were used?
  691. startexcl = match.group("open") == self.excl_start
  692. endexcl = match.group("close") == self.excl_end
  693. rn = syntax.RangeNode(start, end, startexcl, endexcl)
  694. return rn
  695. def __init__(self, expr=None, excl_start="{", excl_end="}"):
  696. self.expr = expr or self.expr
  697. self.excl_start = excl_start
  698. self.excl_end = excl_end
  699. def taggers(self, parser):
  700. tagger = self.RangeTagger(self.expr, self.excl_start, self.excl_end)
  701. return [(tagger, 1)]
class OperatorsPlugin(Plugin):
    """By default, adds the AND, OR, ANDNOT, ANDMAYBE, and NOT operators to
    the parser syntax. This plugin scans the token stream for subclasses of
    :class:`Operator` and calls their :meth:`Operator.make_group` methods
    to allow them to manipulate the stream.

    There are two levels of configuration available.

    The first level is to change the regular expressions of the default
    operators, using the ``And``, ``Or``, ``AndNot``, ``AndMaybe``, and/or
    ``Not`` keyword arguments. The keyword value can be a pattern string or
    a compiled expression, or None to remove the operator::

        qp = qparser.QueryParser("content", schema)
        cp = qparser.OperatorsPlugin(And="&", Or="\\|", AndNot="&!",
                                     AndMaybe="&~", Not=None)
        qp.replace_plugin(cp)

    You can also specify a list of ``(OpTagger, priority)`` pairs as the first
    argument to the initializer to use custom operators. See :ref:`custom-op`
    for more information on this.
    """

    class OpTagger(RegexTagger):
        # Tags a regex match as an operator node of the configured type,
        # remembering which group type it should build when applied.
        def __init__(self, expr, grouptype, optype=syntax.InfixOperator,
                     leftassoc=True, memo=""):
            RegexTagger.__init__(self, expr)
            self.grouptype = grouptype
            self.optype = optype
            self.leftassoc = leftassoc
            # Short human-readable label used only in __repr__ for debugging
            self.memo = memo

        def __repr__(self):
            return "<%s %r (%s)>" % (self.__class__.__name__,
                                     self.expr.pattern, self.memo)

        def create(self, parser, match):
            # Build an operator node from the matched text
            return self.optype(match.group(0), self.grouptype, self.leftassoc)

    def __init__(self, ops=None, clean=False,
                 And=r"(?<=\s)AND(?=\s)",
                 Or=r"(?<=\s)OR(?=\s)",
                 AndNot=r"(?<=\s)ANDNOT(?=\s)",
                 AndMaybe=r"(?<=\s)ANDMAYBE(?=\s)",
                 Not=r"(^|(?<=(\s|[()])))NOT(?=\s)",
                 Require=r"(^|(?<=\s))REQUIRE(?=\s)"):
        # :param ops: an optional list of (OpTagger, priority) pairs for
        #     custom operators, used in addition to (or, with clean=True,
        #     instead of) the defaults below.
        # :param clean: if True, do NOT add the default operators at all.
        if ops:
            ops = list(ops)
        else:
            ops = []

        if not clean:
            ot = self.OpTagger
            if Not:
                ops.append((ot(Not, syntax.NotGroup, syntax.PrefixOperator,
                               memo="not"), 0))
            if And:
                ops.append((ot(And, syntax.AndGroup, memo="and"), 0))
            if Or:
                ops.append((ot(Or, syntax.OrGroup, memo="or"), 0))
            if AndNot:
                # Negative priority so ANDNOT is tried before the shorter
                # AND pattern can match its prefix
                ops.append((ot(AndNot, syntax.AndNotGroup,
                               memo="anot"), -5))
            if AndMaybe:
                ops.append((ot(AndMaybe, syntax.AndMaybeGroup,
                               memo="amaybe"), -5))
            if Require:
                ops.append((ot(Require, syntax.RequireGroup,
                               memo="req"), 0))

        self.ops = ops

    def taggers(self, parser):
        return self.ops

    def filters(self, parser):
        # Operators run late (600) so other filters have already grouped the
        # nodes the operators will consume
        return [(self.do_operators, 600)]

    def do_operators(self, parser, group):
        """This filter finds PrefixOperator, PostfixOperator, and InfixOperator
        nodes in the tree and calls their logic to rearrange the nodes.
        """
        for tagger, _ in self.ops:
            # Get the operators created by the configured taggers
            optype = tagger.optype
            gtype = tagger.grouptype

            # Left-associative infix operators are replaced left-to-right, and
            # right-associative infix operators are replaced right-to-left.
            # Most of the work is done in the different implementations of
            # Operator.replace_self().
            if tagger.leftassoc:
                i = 0
                while i < len(group):
                    t = group[i]
                    if isinstance(t, optype) and t.grouptype is gtype:
                        # replace_self returns the index to continue from
                        i = t.replace_self(parser, group, i)
                    else:
                        i += 1
            else:
                i = len(group) - 1
                while i >= 0:
                    t = group[i]
                    # NOTE(review): unlike the left-assoc branch, this does
                    # not also check ``t.grouptype is gtype`` — confirm this
                    # is intentional before adding multiple right-associative
                    # operators sharing an optype.
                    if isinstance(t, optype):
                        i = t.replace_self(parser, group, i)
                    i -= 1

        # Descend into the groups and recursively call do_operators
        for i, t in enumerate(group):
            if isinstance(t, syntax.GroupNode):
                group[i] = self.do_operators(parser, t)

        return group
  799. #
  800. class PlusMinusPlugin(Plugin):
  801. """Adds the ability to use + and - in a flat OR query to specify required
  802. and prohibited terms.
  803. This is the basis for the parser configuration returned by
  804. ``SimpleParser()``.
  805. """
  806. # Marker nodes for + and -
  807. class Plus(syntax.MarkerNode):
  808. pass
  809. class Minus(syntax.MarkerNode):
  810. pass
  811. def __init__(self, plusexpr="\\+", minusexpr="-"):
  812. self.plusexpr = plusexpr
  813. self.minusexpr = minusexpr
  814. def taggers(self, parser):
  815. return [(FnTagger(self.plusexpr, self.Plus, "plus"), 0),
  816. (FnTagger(self.minusexpr, self.Minus, "minus"), 0)]
  817. def filters(self, parser):
  818. return [(self.do_plusminus, 510)]
  819. def do_plusminus(self, parser, group):
  820. """This filter sorts nodes in a flat group into "required", "optional",
  821. and "banned" subgroups based on the presence of plus and minus nodes.
  822. """
  823. required = syntax.AndGroup()
  824. optional = syntax.OrGroup()
  825. banned = syntax.OrGroup()
  826. # If the top-level group is an AndGroup we make everything "required" by default
  827. if isinstance(group, syntax.AndGroup):
  828. optional = syntax.AndGroup()
  829. # Which group to put the next node we see into
  830. next = optional
  831. for node in group:
  832. if isinstance(node, self.Plus):
  833. # +: put the next node in the required group
  834. next = required
  835. elif isinstance(node, self.Minus):
  836. # -: put the next node in the banned group
  837. next = banned
  838. else:
  839. # Anything else: put it in the appropriate group
  840. next.append(node)
  841. # Reset to putting things in the optional group by default
  842. next = optional
  843. group = optional
  844. if required:
  845. group = syntax.AndMaybeGroup([required, group])
  846. if banned:
  847. group = syntax.AndNotGroup([group, banned])
  848. return group
  849. class GtLtPlugin(TaggingPlugin):
  850. """Allows the user to use greater than/less than symbols to create range
  851. queries::
  852. a:>100 b:<=z c:>=-1.4 d:<mz
  853. This is the equivalent of::
  854. a:{100 to] b:[to z] c:[-1.4 to] d:[to mz}
  855. The plugin recognizes ``>``, ``<``, ``>=``, ``<=``, ``=>``, and ``=<``
  856. after a field specifier. The field specifier is required. You cannot do the
  857. following::
  858. >100
  859. This plugin requires the FieldsPlugin and RangePlugin to work.
  860. """
  861. class GtLtNode(syntax.SyntaxNode):
  862. def __init__(self, rel):
  863. self.rel = rel
  864. def __repr__(self):
  865. return "(%s)" % self.rel
  866. expr = r"(?P<rel>(<=|>=|<|>|=<|=>))"
  867. nodetype = GtLtNode
  868. def filters(self, parser):
  869. # Run before the fields filter removes FilenameNodes at priority 100.
  870. return [(self.do_gtlt, 99)]
  871. def do_gtlt(self, parser, group):
  872. """This filter translate FieldnameNode/GtLtNode pairs into RangeNodes.
  873. """
  874. fname = syntax.FieldnameNode
  875. newgroup = group.empty_copy()
  876. i = 0
  877. lasti = len(group) - 1
  878. while i < len(group):
  879. node = group[i]
  880. # If this is a GtLtNode...
  881. if isinstance(node, self.GtLtNode):
  882. # If it's not the last node in the group...
  883. if i < lasti:
  884. prevnode = newgroup[-1]
  885. nextnode = group[i + 1]
  886. # If previous was a fieldname and next node has text
  887. if isinstance(prevnode, fname) and nextnode.has_text:
  888. # Make the next node into a range based on the symbol
  889. newgroup.append(self.make_range(nextnode, node.rel))
  890. # Skip the next node
  891. i += 1
  892. else:
  893. # If it's not a GtLtNode, add it to the filtered group
  894. newgroup.append(node)
  895. i += 1
  896. return newgroup
  897. def make_range(self, node, rel):
  898. text = node.text
  899. if rel == "<":
  900. n = syntax.RangeNode(None, text, False, True)
  901. elif rel == ">":
  902. n = syntax.RangeNode(text, None, True, False)
  903. elif rel == "<=" or rel == "=<":
  904. n = syntax.RangeNode(None, text, False, False)
  905. elif rel == ">=" or rel == "=>":
  906. n = syntax.RangeNode(text, None, False, False)
  907. return n.set_range(node.startchar, node.endchar)
class MultifieldPlugin(Plugin):
    """Converts any unfielded terms into OR clauses that search for the
    term in a specified list of fields.

    >>> qp = qparser.QueryParser(None, myschema)
    >>> qp.add_plugin(qparser.MultifieldPlugin(["a", "b"]))
    >>> qp.parse("alfa c:bravo")
    And([Or([Term("a", "alfa"), Term("b", "alfa")]), Term("c", "bravo")])

    This plugin is the basis for the ``MultifieldParser``.
    """

    def __init__(self, fieldnames, fieldboosts=None, group=syntax.OrGroup):
        """
        :param fieldnames: a list of fields to search.
        :param fieldboosts: an optional dictionary mapping field names to
            a boost to use for that field.
        :param group: the group to use to relate the fielded terms to each
            other.
        """
        self.fieldnames = fieldnames
        # Per-field boost factors; fields not listed get a boost of 1.0
        self.boosts = fieldboosts or {}
        self.group = group

    def filters(self, parser):
        # Run after the fields filter applies explicit fieldnames (at priority
        # 100)
        return [(self.do_multifield, 110)]

    def do_multifield(self, parser, group):
        """Replaces each unfielded node in the tree with a group of copies of
        that node, one per configured field name.
        """
        for i, node in enumerate(group):
            if isinstance(node, syntax.GroupNode):
                # Recurse inside groups
                group[i] = self.do_multifield(parser, node)
            elif node.has_fieldname and node.fieldname is None:
                # For an unfielded node, create a new group containing fielded
                # versions of the node for each configured "multi" field.
                newnodes = []
                for fname in self.fieldnames:
                    newnode = copy.copy(node)
                    newnode.set_fieldname(fname)
                    newnode.set_boost(self.boosts.get(fname, 1.0))
                    newnodes.append(newnode)
                group[i] = self.group(newnodes)
        return group
  948. class FieldAliasPlugin(Plugin):
  949. """Adds the ability to use "aliases" of fields in the query string.
  950. This plugin is useful for allowing users of languages that can't be
  951. represented in ASCII to use field names in their own language, and
  952. translate them into the "real" field names, which must be valid Python
  953. identifiers.
  954. >>> # Allow users to use 'body' or 'text' to refer to the 'content' field
  955. >>> parser.add_plugin(FieldAliasPlugin({"content": ["body", "text"]}))
  956. >>> parser.parse("text:hello")
  957. Term("content", "hello")
  958. """
  959. def __init__(self, fieldmap):
  960. self.fieldmap = fieldmap
  961. self.reverse = {}
  962. for key, values in iteritems(fieldmap):
  963. for value in values:
  964. self.reverse[value] = key
  965. def filters(self, parser):
  966. # Run before fields plugin at 100
  967. return [(self.do_aliases, 90)]
  968. def do_aliases(self, parser, group):
  969. for i, node in enumerate(group):
  970. if isinstance(node, syntax.GroupNode):
  971. group[i] = self.do_aliases(parser, node)
  972. elif node.has_fieldname and node.fieldname is not None:
  973. fname = node.fieldname
  974. if fname in self.reverse:
  975. node.set_fieldname(self.reverse[fname], override=True)
  976. return group
class CopyFieldPlugin(Plugin):
    """Looks for basic syntax nodes (terms, prefixes, wildcards, phrases, etc.)
    occurring in a certain field and replaces it with a group (by default OR)
    containing the original token and the token copied to a new field.

    For example, the query::

        hello name:matt

    could be automatically converted by
    ``CopyFieldPlugin({"name": "author"})`` to::

        hello (name:matt OR author:matt)

    This is useful where one field was indexed with a differently-analyzed copy
    of another, and you want the query to search both fields.

    You can specify a different group type with the ``group`` keyword. You can
    also specify ``group=None``, in which case the copied node is inserted
    "inline" next to the original, instead of in a new group::

        hello name:matt author:matt
    """

    def __init__(self, map, group=syntax.OrGroup, mirror=False):
        """
        :param map: a dictionary mapping names of fields to copy to the
            names of the destination fields.
        :param group: the type of group to create in place of the original
            token. You can specify ``group=None`` to put the copied node
            "inline" next to the original node instead of in a new group.
        :param mirror: if True, the plugin copies both ways, so if the user
            specifies a query in the 'toname' field, it will be copied to
            the 'fromname' field.
        """
        self.map = map
        self.group = group
        if mirror:
            # Add in reversed mappings
            map.update(dict((v, k) for k, v in iteritems(map)))

    def filters(self, parser):
        # Run after the fieldname filter (100) but before multifield (110)
        return [(self.do_copyfield, 109)]

    def do_copyfield(self, parser, group):
        map = self.map
        newgroup = group.empty_copy()
        for node in group:
            if isinstance(node, syntax.GroupNode):
                # Recurse into groups
                node = self.do_copyfield(parser, node)
            elif node.has_fieldname:
                # An unfielded node is treated as belonging to the parser's
                # default field for the purpose of the mapping
                fname = node.fieldname or parser.fieldname
                if fname in map:
                    # Copy the node and retarget the copy at the mapped field
                    newnode = copy.copy(node)
                    newnode.set_fieldname(map[fname], override=True)
                    if self.group is None:
                        # Insert the copy "inline" next to the original
                        newgroup.append(node)
                        newgroup.append(newnode)
                    else:
                        # Wrap the original and the copy in a new group
                        newgroup.append(self.group([node, newnode]))
                    # The node (and its copy) were already appended; skip the
                    # unconditional append below
                    continue
            newgroup.append(node)
        return newgroup
class PseudoFieldPlugin(Plugin):
    """This is an advanced plugin that lets you define "pseudo-fields" the user
    can use in their queries. When the parser encounters one of these fields,
    it runs a given function on the following node in the abstract syntax tree.

    Unfortunately writing the transform function(s) requires knowledge of the
    parser's abstract syntax tree classes. A transform function takes a
    :class:`whoosh.qparser.SyntaxNode` and returns a
    :class:`~whoosh.qparser.SyntaxNode` (or None if the node should be removed
    instead of transformed).

    Some things you can do in the transform function::

        from whoosh import qparser

        def my_xform_fn(node):
            # Is this a text node?
            if node.has_text:
                # Change the node's text
                node.text = node.text + "foo"

                # Change the node into a prefix query
                node = qparser.PrefixPlugin.PrefixNode(node.text)

                # Set the field the node should search in
                node.set_fieldname("title")
                return node
            else:
                # If the pseudo-field wasn't applied to a text node (e.g.
                # it preceded a group, as in ``pfield:(a OR b)`` ), remove the
                # node. Alternatively you could just ``return node`` here to
                # leave the non-text node intact.
                return None

    In the following example, if the user types ``regex:foo.bar``, the function
    transforms the text in the pseudo-field "regex" into a regular expression
    query in the "content" field::

        from whoosh import qparser

        def regex_maker(node):
            if node.has_text:
                node = qparser.RegexPlugin.RegexNode(node.text)
                node.set_fieldname("content")
                return node

        qp = qparser.QueryParser("content", myindex.schema)
        qp.add_plugin(qparser.PseudoFieldPlugin({"regex": regex_maker}))
        q = qp.parse("alfa regex:br.vo")

    The name of the "pseudo" field can be the same as an actual field. Imagine
    the schema has a field named ``reverse``, and you want the user to be able
    to type ``reverse:foo`` and transform it to ``reverse:(foo OR oof)``::

        def rev_text(node):
            if node.has_text:
                # Create a word node for the reversed text
                revtext = node.text[::-1]  # Reverse the text
                rnode = qparser.WordNode(revtext)

                # Put the original node and the reversed node in an OrGroup
                group = qparser.OrGroup([node, rnode])

                # Need to set the fieldname here because the PseudoFieldPlugin
                # removes the field name syntax
                group.set_fieldname("reverse")

            return group

        qp = qparser.QueryParser("content", myindex.schema)
        qp.add_plugin(qparser.PseudoFieldPlugin({"reverse": rev_text}))
        q = qp.parse("alfa reverse:bravo")

    Note that transforming the query like this can potentially really confuse
    the spell checker!

    This plugin works as a filter, so it can only operate on the query after it
    has been parsed into an abstract syntax tree. For parsing control (i.e. to
    give a pseudo-field its own special syntax), you would need to write your
    own parsing plugin.
    """

    def __init__(self, xform_map):
        """
        :param xform_map: a dictionary mapping pseudo-field names to transform
            functions. The function should take a
            :class:`whoosh.qparser.SyntaxNode` as an argument, and return a
            :class:`~whoosh.qparser.SyntaxNode`. If the function returns None,
            the node will be removed from the query.
        """
        self.xform_map = xform_map

    def filters(self, parser):
        # Run before the fieldname filter (100)
        return [(self.do_pseudofield, 99)]

    def do_pseudofield(self, parser, group):
        xform_map = self.xform_map

        newgroup = group.empty_copy()
        # Transform function to apply to the next node, set when the previous
        # node was a pseudo-field name; None means "no transform pending"
        xform_next = None
        for node in group:
            if isinstance(node, syntax.GroupNode):
                # Recurse into sub-groups first; note a pending transform will
                # then be applied to the transformed group node below
                node = self.do_pseudofield(parser, node)
            elif (isinstance(node, syntax.FieldnameNode)
                  and node.fieldname in xform_map):
                # Remember the transform and drop the fieldname node itself
                # from the output
                xform_next = xform_map[node.fieldname]
                continue

            if xform_next:
                newnode = xform_next(node)
                xform_next = None
                if newnode is None:
                    # The transform asked for the node to be removed
                    continue
                else:
                    # Preserve the original character range so error reporting
                    # and highlighting still point at the user's text
                    newnode.set_range(node.startchar, node.endchar)
                    node = newnode

            newgroup.append(node)

        return newgroup