# regex_parser.py
"""
Parser for parsing a regular expression.

Take a string representing a regular expression and return the root node of
its parse tree.

usage::

    root_node = parse_regex('(hello|world)')

Remarks:
- The regex parser processes multiline input; it ignores all whitespace and
  supports multiple named groups with the same name and #-style comments.

Limitations:
- Lookahead is not supported.
"""
  13. from __future__ import unicode_literals
  14. import re
# Public API of this module: the node types that appear in a parse tree,
# plus the tokenizer and parser entry points.
__all__ = (
    'Repeat',
    'Variable',
    'Regex',
    'Lookahead',
    'tokenize_regex',
    'parse_regex',
)
  23. class Node(object):
  24. """
  25. Base class for all the grammar nodes.
  26. (You don't initialize this one.)
  27. """
  28. def __add__(self, other_node):
  29. return Sequence([self, other_node])
  30. def __or__(self, other_node):
  31. return Any([self, other_node])
  32. class Any(Node):
  33. """
  34. Union operation (OR operation) between several grammars. You don't
  35. initialize this yourself, but it's a result of a "Grammar1 | Grammar2"
  36. operation.
  37. """
  38. def __init__(self, children):
  39. self.children = children
  40. def __or__(self, other_node):
  41. return Any(self.children + [other_node])
  42. def __repr__(self):
  43. return '%s(%r)' % (self.__class__.__name__, self.children)
  44. class Sequence(Node):
  45. """
  46. Concatenation operation of several grammars. You don't initialize this
  47. yourself, but it's a result of a "Grammar1 + Grammar2" operation.
  48. """
  49. def __init__(self, children):
  50. self.children = children
  51. def __add__(self, other_node):
  52. return Sequence(self.children + [other_node])
  53. def __repr__(self):
  54. return '%s(%r)' % (self.__class__.__name__, self.children)
  55. class Regex(Node):
  56. """
  57. Regular expression.
  58. """
  59. def __init__(self, regex):
  60. re.compile(regex) # Validate
  61. self.regex = regex
  62. def __repr__(self):
  63. return '%s(/%s/)' % (self.__class__.__name__, self.regex)
  64. class Lookahead(Node):
  65. """
  66. Lookahead expression.
  67. """
  68. def __init__(self, childnode, negative=False):
  69. self.childnode = childnode
  70. self.negative = negative
  71. def __repr__(self):
  72. return '%s(%r)' % (self.__class__.__name__, self.childnode)
  73. class Variable(Node):
  74. """
  75. Mark a variable in the regular grammar. This will be translated into a
  76. named group. Each variable can have his own completer, validator, etc..
  77. :param childnode: The grammar which is wrapped inside this variable.
  78. :param varname: String.
  79. """
  80. def __init__(self, childnode, varname=None):
  81. self.childnode = childnode
  82. self.varname = varname
  83. def __repr__(self):
  84. return '%s(childnode=%r, varname=%r)' % (
  85. self.__class__.__name__, self.childnode, self.varname)
  86. class Repeat(Node):
  87. def __init__(self, childnode, min_repeat=0, max_repeat=None, greedy=True):
  88. self.childnode = childnode
  89. self.min_repeat = min_repeat
  90. self.max_repeat = max_repeat
  91. self.greedy = greedy
  92. def __repr__(self):
  93. return '%s(childnode=%r)' % (self.__class__.__name__, self.childnode)
  94. def tokenize_regex(input):
  95. """
  96. Takes a string, representing a regular expression as input, and tokenizes
  97. it.
  98. :param input: string, representing a regular expression.
  99. :returns: List of tokens.
  100. """
  101. # Regular expression for tokenizing other regular expressions.
  102. p = re.compile(r'''^(
  103. \(\?P\<[a-zA-Z0-9_-]+\> | # Start of named group.
  104. \(\?#[^)]*\) | # Comment
  105. \(\?= | # Start of lookahead assertion
  106. \(\?! | # Start of negative lookahead assertion
  107. \(\?<= | # If preceded by.
  108. \(\?< | # If not preceded by.
  109. \(?: | # Start of group. (non capturing.)
  110. \( | # Start of group.
  111. \(?[iLmsux] | # Flags.
  112. \(?P=[a-zA-Z]+\) | # Back reference to named group
  113. \) | # End of group.
  114. \{[^{}]*\} | # Repetition
  115. \*\? | \+\? | \?\?\ | # Non greedy repetition.
  116. \* | \+ | \? | # Repetition
  117. \#.*\n | # Comment
  118. \\. |
  119. # Character group.
  120. \[
  121. ( [^\]\\] | \\.)*
  122. \] |
  123. [^(){}] |
  124. .
  125. )''', re.VERBOSE)
  126. tokens = []
  127. while input:
  128. m = p.match(input)
  129. if m:
  130. token, input = input[:m.end()], input[m.end():]
  131. if not token.isspace():
  132. tokens.append(token)
  133. else:
  134. raise Exception('Could not tokenize input regex.')
  135. return tokens
  136. def parse_regex(regex_tokens):
  137. """
  138. Takes a list of tokens from the tokenizer, and returns a parse tree.
  139. """
  140. # We add a closing brace because that represents the final pop of the stack.
  141. tokens = [')'] + regex_tokens[::-1]
  142. def wrap(lst):
  143. """ Turn list into sequence when it contains several items. """
  144. if len(lst) == 1:
  145. return lst[0]
  146. else:
  147. return Sequence(lst)
  148. def _parse():
  149. or_list = []
  150. result = []
  151. def wrapped_result():
  152. if or_list == []:
  153. return wrap(result)
  154. else:
  155. or_list.append(result)
  156. return Any([wrap(i) for i in or_list])
  157. while tokens:
  158. t = tokens.pop()
  159. if t.startswith('(?P<'):
  160. variable = Variable(_parse(), varname=t[4:-1])
  161. result.append(variable)
  162. elif t in ('*', '*?'):
  163. greedy = (t == '*')
  164. result[-1] = Repeat(result[-1], greedy=greedy)
  165. elif t in ('+', '+?'):
  166. greedy = (t == '+')
  167. result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy)
  168. elif t in ('?', '??'):
  169. if result == []:
  170. raise Exception('Nothing to repeat.' + repr(tokens))
  171. else:
  172. greedy = (t == '?')
  173. result[-1] = Repeat(result[-1], min_repeat=0, max_repeat=1, greedy=greedy)
  174. elif t == '|':
  175. or_list.append(result)
  176. result = []
  177. elif t in ('(', '(?:'):
  178. result.append(_parse())
  179. elif t == '(?!':
  180. result.append(Lookahead(_parse(), negative=True))
  181. elif t == '(?=':
  182. result.append(Lookahead(_parse(), negative=False))
  183. elif t == ')':
  184. return wrapped_result()
  185. elif t.startswith('#'):
  186. pass
  187. elif t.startswith('{'):
  188. # TODO: implement!
  189. raise Exception('{}-style repitition not yet supported' % t)
  190. elif t.startswith('(?'):
  191. raise Exception('%r not supported' % t)
  192. elif t.isspace():
  193. pass
  194. else:
  195. result.append(Regex(t))
  196. raise Exception("Expecting ')' token")
  197. result = _parse()
  198. if len(tokens) != 0:
  199. raise Exception("Unmatched parantheses.")
  200. else:
  201. return result