- """
- Parser for parsing a regular expression.
- Take a string representing a regular expression and return the root node of its
- parse tree.
- usage::
- root_node = parse_regex(tokenize_regex('(hello|world)'))
- Remarks:
- - The parser accepts multiline regular expressions: it ignores all whitespace,
- supports #-style comments, and allows multiple named groups with the same name.
- Limitations:
- - Lookahead is not supported.
- """
- from __future__ import unicode_literals
- import re
- __all__ = (
- 'Repeat',
- 'Variable',
- 'Regex',
- 'Lookahead',
- 'tokenize_regex',
- 'parse_regex',
- )
- class Node(object):
- """
- Base class for all the grammar nodes.
- (You don't initialize this one.)
- """
- def __add__(self, other_node):
- return Sequence([self, other_node])
- def __or__(self, other_node):
- return Any([self, other_node])
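- # Illustrative sketch (not part of the original module): the overloaded operators
- # above are what build the parse tree. "Regex" is defined further below; the
- # pattern strings here are made up for the example.
- #
- #     Regex('[a-z]+') + Regex('[0-9]+')  # -> Sequence([Regex(/[a-z]+/), Regex(/[0-9]+/)])
- #     Regex('cat') | Regex('dog')        # -> Any([Regex(/cat/), Regex(/dog/)])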
- class Any(Node):
- """
- Union operation (OR operation) between several grammars. You don't
- initialize this yourself, but it's a result of a "Grammar1 | Grammar2"
- operation.
- """
- def __init__(self, children):
- self.children = children
- def __or__(self, other_node):
- return Any(self.children + [other_node])
- def __repr__(self):
- return '%s(%r)' % (self.__class__.__name__, self.children)
- class Sequence(Node):
- """
- Concatenation operation of several grammars. You don't initialize this
- yourself, but it's a result of a "Grammar1 + Grammar2" operation.
- """
- def __init__(self, children):
- self.children = children
- def __add__(self, other_node):
- return Sequence(self.children + [other_node])
- def __repr__(self):
- return '%s(%r)' % (self.__class__.__name__, self.children)
- class Regex(Node):
- """
- Regular expression.
- """
- def __init__(self, regex):
- re.compile(regex) # Validate
- self.regex = regex
- def __repr__(self):
- return '%s(/%s/)' % (self.__class__.__name__, self.regex)
- class Lookahead(Node):
- """
- Lookahead expression.
- """
- def __init__(self, childnode, negative=False):
- self.childnode = childnode
- self.negative = negative
- def __repr__(self):
- return '%s(%r)' % (self.__class__.__name__, self.childnode)
- class Variable(Node):
- """
- Mark a variable in the regular grammar. This will be translated into a
- named group. Each variable can have its own completer, validator, etc.
- :param childnode: The grammar which is wrapped inside this variable.
- :param varname: String.
- """
- def __init__(self, childnode, varname=None):
- self.childnode = childnode
- self.varname = varname
- def __repr__(self):
- return '%s(childnode=%r, varname=%r)' % (
- self.__class__.__name__, self.childnode, self.varname)
- class Repeat(Node):
- def __init__(self, childnode, min_repeat=0, max_repeat=None, greedy=True):
- self.childnode = childnode
- self.min_repeat = min_repeat
- self.max_repeat = max_repeat
- self.greedy = greedy
- def __repr__(self):
- return '%s(childnode=%r)' % (self.__class__.__name__, self.childnode)
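- # Rough correspondence between these node types and regex syntax (a summary added
- # for orientation, not part of the original module):
- #
- #     Sequence([a, b])                   ~  "ab"
- #     Any([a, b])                        ~  "a|b"
- #     Repeat(a, 0, None)                 ~  "a*"      Repeat(a, 1, None)  ~  "a+"
- #     Variable(a, varname='x')           ~  "(?P<x>a)"
- #     Lookahead(a) / Lookahead(a, True)  ~  "(?=a)" / "(?!a)"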
- def tokenize_regex(input):
- """
- Take a string representing a regular expression and split it into a list of
- tokens.
- :param input: string, representing a regular expression.
- :returns: List of tokens.
- """
- # Regular expression for tokenizing other regular expressions.
- p = re.compile(r'''^(
- \(\?P\<[a-zA-Z0-9_-]+\> | # Start of named group.
- \(\?#[^)]*\) | # Comment
- \(\?= | # Start of lookahead assertion
- \(\?! | # Start of negative lookahead assertion
- \(\?<= | # Positive lookbehind.
- \(\?< | # Start of negative lookbehind.
- \(\?: | # Start of non-capturing group.
- \( | # Start of group.
- \(\?[iLmsux] | # Flags.
- \(\?P=[a-zA-Z]+\) | # Back reference to named group
- \) | # End of group.
- \{[^{}]*\} | # Repetition
- \*\? | \+\? | \?\? | # Non greedy repetition.
- \* | \+ | \? | # Repetition
- \#.*\n | # Comment
- \\. |
- # Character group.
- \[
- ( [^\]\\] | \\.)*
- \] |
- [^(){}] |
- .
- )''', re.VERBOSE)
- tokens = []
- while input:
- m = p.match(input)
- if m:
- token, input = input[:m.end()], input[m.end():]
- if not token.isspace():
- tokens.append(token)
- else:
- raise Exception('Could not tokenize input regex.')
- return tokens
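- # Example (a sketch of what the tokenizer is expected to produce; the pattern is
- # made up for illustration):
- #
- #     tokenize_regex('(?P<var>[a-z]+)  [0-9]+')
- #     # -> ['(?P<var>', '[a-z]', '+', ')', '[0-9]', '+']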
- def parse_regex(regex_tokens):
- """
- Takes a list of tokens from the tokenizer, and returns a parse tree.
- """
- # Push a sentinel ')' onto the bottom of the (reversed) token stack; popping it
- # terminates the outermost _parse() call, just like the end of a nested group.
- tokens = [')'] + regex_tokens[::-1]
- def wrap(lst):
- """ Turn list into sequence when it contains several items. """
- if len(lst) == 1:
- return lst[0]
- else:
- return Sequence(lst)
- def _parse():
- or_list = []
- result = []
- def wrapped_result():
- if or_list == []:
- return wrap(result)
- else:
- or_list.append(result)
- return Any([wrap(i) for i in or_list])
- while tokens:
- t = tokens.pop()
- if t.startswith('(?P<'):
- variable = Variable(_parse(), varname=t[4:-1])
- result.append(variable)
- elif t in ('*', '*?'):
- greedy = (t == '*')
- result[-1] = Repeat(result[-1], greedy=greedy)
- elif t in ('+', '+?'):
- greedy = (t == '+')
- result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy)
- elif t in ('?', '??'):
- if result == []:
- raise Exception('Nothing to repeat: ' + repr(tokens))
- else:
- greedy = (t == '?')
- result[-1] = Repeat(result[-1], min_repeat=0, max_repeat=1, greedy=greedy)
- elif t == '|':
- or_list.append(result)
- result = []
- elif t in ('(', '(?:'):
- result.append(_parse())
- elif t == '(?!':
- result.append(Lookahead(_parse(), negative=True))
- elif t == '(?=':
- result.append(Lookahead(_parse(), negative=False))
- elif t == ')':
- return wrapped_result()
- elif t.startswith('#'):
- pass
- elif t.startswith('{'):
- # TODO: implement!
- raise Exception('{...}-style repetition not yet supported: %r' % t)
- elif t.startswith('(?'):
- raise Exception('%r not supported' % t)
- elif t.isspace():
- pass
- else:
- result.append(Regex(t))
- raise Exception("Expecting ')' token")
- result = _parse()
- if len(tokens) != 0:
- raise Exception("Unmatched parantheses.")
- else:
- return result
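- # Minimal end-to-end sketch (illustrative only; mirrors the docstring usage above):
- #
- #     tokens = tokenize_regex('(hello|world)')
- #     root_node = parse_regex(tokens)
- #     # root_node is roughly:
- #     #     Any([Sequence([Regex(/h/), ..., Regex(/o/)]),
- #     #          Sequence([Regex(/w/), ..., Regex(/d/)])])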