- """
- This module contains a tokenizer for Excel formulae.
- The tokenizer is based on the Javascript tokenizer found at
- http://ewbi.blogs.com/develops/2004/12/excel_formula_p.html written by Eric
- Bachtal
- """
- import re
- class TokenizerError(Exception):
- """Base class for all Tokenizer errors."""
- class Tokenizer(object):
- """
- A tokenizer for Excel worksheet formulae.
- Converts a unicode string representing an Excel formula (in A1 notation)
- into a sequence of `Token` objects.
- `formula`: The unicode string to tokenize
- Tokenizer defines a method `._parse()` to parse the formula into tokens,
- which can then be accessed through the `.items` attribute.
- """

    SN_RE = re.compile(r"^[1-9](\.[0-9]+)?[Ee]$")  # Scientific notation
    WSPACE_RE = re.compile(r"[ \n]+")
    STRING_REGEXES = {
        # Inside a string, all characters are treated as literals, except
        # for the quote character used to start the string. That character,
        # when doubled, is treated as a single character in the string. If
        # an unmatched quote appears, the string is terminated.
        '"': re.compile('"(?:[^"]*"")*[^"]*"(?!")'),
        "'": re.compile("'(?:[^']*'')*[^']*'(?!')"),
    }
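
    # For example (an illustrative check, not part of the class):
    #
    #     STRING_REGEXES['"'].match('"abc""def"').group(0) == '"abc""def"'
    #
    # i.e. the doubled quote inside the literal does not end the match.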

    ERROR_CODES = ("#NULL!", "#DIV/0!", "#VALUE!", "#REF!", "#NAME?",
                   "#NUM!", "#N/A", "#GETTING_DATA")
    TOKEN_ENDERS = ',;}) +-*/^&=><%'  # Each of these characters marks the
                                      # end of an operand token

    def __init__(self, formula):
        self.formula = formula
        self.items = []
        self.token_stack = []  # Used to keep track of arrays, functions,
                               # and parentheses
        self.offset = 0  # How many chars have we read
        self.token = []  # Used to build up token values char by char
        self._parse()

    def _parse(self):
        """Populate self.items with the tokens from the formula."""
        if self.offset:
            return  # Already parsed!
        if not self.formula:
            return
        elif self.formula[0] == '=':
            self.offset += 1
        else:
            self.items.append(Token(self.formula, Token.LITERAL))
            return
        consumers = (
            ('"\'', self._parse_string),
            ('[', self._parse_brackets),
            ('#', self._parse_error),
            (' ', self._parse_whitespace),
            ('\n', self._parse_whitespace),
            ('+-*/^&=><%', self._parse_operator),
            ('{(', self._parse_opener),
            (')}', self._parse_closer),
            (';,', self._parse_separator),
        )
        dispatcher = {}  # maps chars to the specific parsing function
        for chars, consumer in consumers:
            dispatcher.update(dict.fromkeys(chars, consumer))
        while self.offset < len(self.formula):
            if self.check_scientific_notation():  # May consume one character
                continue
            curr_char = self.formula[self.offset]
            if curr_char in self.TOKEN_ENDERS:
                self.save_token()
            if curr_char in dispatcher:
                self.offset += dispatcher[curr_char]()
            else:
                # TODO: this can probably be sped up using a regex to get to
                # the next interesting character
                self.token.append(curr_char)
                self.offset += 1
        self.save_token()
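
    # Note on the dispatch loop above: every trigger character maps to its
    # consumer (e.g. dispatcher['('] is self._parse_opener); any character
    # with no entry is simply accumulated into self.token until a
    # TOKEN_ENDERS character flushes it via save_token().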

    def _parse_string(self):
        """
        Parse a "-delimited string or '-delimited link.

        The offset must be pointing to either a single quote ("'") or double
        quote ('"') character. The strings are parsed according to Excel
        rules where to escape the delimiter you just double it up. E.g.,
        "abc""def" in Excel is parsed as 'abc"def' in Python.

        Returns the number of characters matched. (Does not update
        self.offset)
        """
        self.assert_empty_token(can_follow=':')
        delim = self.formula[self.offset]
        assert delim in ('"', "'")
        regex = self.STRING_REGEXES[delim]
        match = regex.match(self.formula[self.offset:])
        if match is None:
            subtype = "string" if delim == '"' else "link"
            raise TokenizerError(
                "Reached end of formula while parsing %s in %s" %
                (subtype, self.formula))
        match = match.group(0)
        if delim == '"':
            self.items.append(Token.make_operand(match))
        else:
            self.token.append(match)
        return len(match)
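
    # Example: given '="abc""def"', the regex match is '"abc""def"' and a
    # single TEXT operand is emitted. A '-delimited match such as
    # 'Sheet 1' instead joins the token buffer, since it is typically part
    # of a longer range reference like 'Sheet 1'!A1.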

    def _parse_brackets(self):
        """
        Consume all the text between square brackets [].

        Returns the number of characters matched. (Does not update
        self.offset)
        """
        assert self.formula[self.offset] == '['
        lefts = [(t.start(), 1) for t in
                 re.finditer(r"\[", self.formula[self.offset:])]
        rights = [(t.start(), -1) for t in
                  re.finditer(r"\]", self.formula[self.offset:])]
        open_count = 0
        for idx, open_close in sorted(lefts + rights):
            open_count += open_close
            if open_count == 0:
                outer_right = idx + 1
                self.token.append(
                    self.formula[self.offset:self.offset + outer_right])
                return outer_right
        raise TokenizerError("Encountered unmatched '[' in %s" % self.formula)

    def _parse_error(self):
        """
        Consume the text following a '#' as an error.

        Looks for a match in self.ERROR_CODES and returns the number of
        characters matched. (Does not update self.offset)
        """
        self.assert_empty_token(can_follow='!')
        assert self.formula[self.offset] == '#'
        subformula = self.formula[self.offset:]
        for err in self.ERROR_CODES:
            if subformula.startswith(err):
                self.items.append(Token.make_operand(
                    ''.join(self.token) + err))
                del self.token[:]
                return len(err)
        raise TokenizerError(
            "Invalid error code at position %d in '%s'" %
            (self.offset, self.formula))
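
    # Example: parsing "=#DIV/0!+1" emits the ERROR operand "#DIV/0!" and
    # then continues with '+' as an infix operator; an unrecognised code
    # such as "#BOGUS!" raises TokenizerError instead.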

    def _parse_whitespace(self):
        """
        Consume a run of consecutive spaces or newlines.

        Returns the number of whitespace characters found. (Does not update
        self.offset)
        """
        assert self.formula[self.offset] in (' ', '\n')
        self.items.append(Token(self.formula[self.offset], Token.WSPACE))
        return self.WSPACE_RE.match(self.formula[self.offset:]).end()

    def _parse_operator(self):
        """
        Consume the characters constituting an operator.

        Returns the number of characters consumed. (Does not update
        self.offset)
        """
        if self.formula[self.offset:self.offset + 2] in ('>=', '<=', '<>'):
            self.items.append(Token(
                self.formula[self.offset:self.offset + 2],
                Token.OP_IN
            ))
            return 2
        curr_char = self.formula[self.offset]  # guaranteed to be 1 char
        assert curr_char in '%*/^&=><+-'
        if curr_char == '%':
            token = Token('%', Token.OP_POST)
        elif curr_char in "*/^&=><":
            token = Token(curr_char, Token.OP_IN)
        # From here on, curr_char is guaranteed to be in '+-'
        elif not self.items:
            token = Token(curr_char, Token.OP_PRE)
        else:
            prev = next((i for i in reversed(self.items)
                         if i.type != Token.WSPACE), None)
            is_infix = prev and (
                prev.subtype == Token.CLOSE
                or prev.type == Token.OP_POST
                or prev.type == Token.OPERAND
            )
            if is_infix:
                token = Token(curr_char, Token.OP_IN)
            else:
                token = Token(curr_char, Token.OP_PRE)
        self.items.append(token)
        return 1
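
    # Example: in "=-A1+3" the leading '-' is OP_PRE because no operand
    # precedes it, while the '+' after the operand "A1" is OP_IN; a
    # trailing '%' as in "=5%" is the postfix percent operator (OP_POST).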

    def _parse_opener(self):
        """
        Consume a ( or { character.

        Returns the number of characters consumed. (Does not update
        self.offset)
        """
        assert self.formula[self.offset] in ('(', '{')
        if self.formula[self.offset] == '{':
            self.assert_empty_token()
            token = Token.make_subexp("{")
        elif self.token:
            token_value = "".join(self.token) + '('
            del self.token[:]
            token = Token.make_subexp(token_value)
        else:
            token = Token.make_subexp("(")
        self.items.append(token)
        self.token_stack.append(token)
        return 1
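
    # Example: if the buffer holds "SUM" when '(' is reached, the buffer
    # is flushed into a FUNC OPEN token "SUM(" and pushed on token_stack,
    # so the matching ')' can later be classified as a function close.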

    def _parse_closer(self):
        """
        Consume a } or ) character.

        Returns the number of characters consumed. (Does not update
        self.offset)
        """
        assert self.formula[self.offset] in (')', '}')
        token = self.token_stack.pop().get_closer()
        if token.value != self.formula[self.offset]:
            raise TokenizerError(
                "Mismatched ( and { pair in '%s'" % self.formula)
        self.items.append(token)
        return 1

    def _parse_separator(self):
        """
        Consume a ; or , character.

        Returns the number of characters consumed. (Does not update
        self.offset)
        """
        curr_char = self.formula[self.offset]
        assert curr_char in (';', ',')
        if curr_char == ';':
            token = Token.make_separator(";")
        else:
            try:
                top_type = self.token_stack[-1].type
            except IndexError:
                token = Token(",", Token.OP_IN)  # Range union operator
            else:
                if top_type == Token.PAREN:
                    token = Token(",", Token.OP_IN)  # Range union operator
                else:
                    token = Token.make_separator(",")
        self.items.append(token)
        return 1
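
    # Example: in "=SUM((A1,B2))" the comma sits inside a bare
    # parenthetical, so it is the range-union operator (OP_IN); in
    # "=SUM(A1,B2)" the enclosing subexpression is a function, so the
    # same comma becomes a SEP/ARG argument separator.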

    def check_scientific_notation(self):
        """
        Consume a + or - character if it is part of a number in scientific
        notation.

        Returns True if the character was consumed and self.offset was
        updated, False otherwise.
        """
        curr_char = self.formula[self.offset]
        if (curr_char in '+-'
                and len(self.token) >= 1
                and self.SN_RE.match("".join(self.token))):
            self.token.append(curr_char)
            self.offset += 1
            return True
        return False
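
    # Illustration: while parsing "=3.1E+5", the buffer holds "3.1E" when
    # '+' is reached; SN_RE matches it, so the '+' joins the number
    # instead of being tokenized as an operator.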

    def assert_empty_token(self, can_follow=()):
        """
        Ensure that there's no token currently being parsed, or that any
        in-progress token ends with a character in can_follow.

        If there are unconsumed token contents, it means we hit an
        unexpected token transition. In this case, we raise a
        TokenizerError.
        """
        if self.token and self.token[-1] not in can_follow:
            raise TokenizerError(
                "Unexpected character at position %d in '%s'" %
                (self.offset, self.formula))

    def save_token(self):
        """If there's a token being parsed, add it to the item list."""
        if self.token:
            self.items.append(Token.make_operand("".join(self.token)))
            del self.token[:]

    def render(self):
        """Convert the parsed tokens back to a string."""
        if not self.items:
            return ""
        elif self.items[0].type == Token.LITERAL:
            return self.items[0].value
        return "=" + "".join(token.value for token in self.items)


class Token(object):
    """
    A token in an Excel formula.

    Tokens have three attributes:

    * `value`: The string value parsed that led to this token
    * `type`: A string identifying the type of token
    * `subtype`: A string identifying the subtype of the token (optional,
      and defaults to "")
    """

    __slots__ = ['value', 'type', 'subtype']

    LITERAL = "LITERAL"
    OPERAND = "OPERAND"
    FUNC = "FUNC"
    ARRAY = "ARRAY"
    PAREN = "PAREN"
    SEP = "SEP"
    OP_PRE = "OPERATOR-PREFIX"
    OP_IN = "OPERATOR-INFIX"
    OP_POST = "OPERATOR-POSTFIX"
    WSPACE = "WHITE-SPACE"

    def __init__(self, value, type_, subtype=""):
        self.value = value
        self.type = type_
        self.subtype = subtype

    # Literal operands:
    #
    # Literal operands are always of type 'OPERAND' and can be of subtype
    # 'TEXT' (for text strings), 'NUMBER' (for all numeric types), 'LOGICAL'
    # (for TRUE and FALSE), 'ERROR' (for literal error values), or 'RANGE'
    # (for all range references).

    TEXT = 'TEXT'
    NUMBER = 'NUMBER'
    LOGICAL = 'LOGICAL'
    ERROR = 'ERROR'
    RANGE = 'RANGE'

    def __repr__(self):
        return u"{0} {1} {2}:".format(self.type, self.subtype, self.value)

    @classmethod
    def make_operand(cls, value):
        """Create an operand token."""
        if value.startswith('"'):
            subtype = cls.TEXT
        elif value.startswith('#'):
            subtype = cls.ERROR
        elif value in ('TRUE', 'FALSE'):
            subtype = cls.LOGICAL
        else:
            try:
                float(value)
                subtype = cls.NUMBER
            except ValueError:
                subtype = cls.RANGE
        return cls(value, cls.OPERAND, subtype)
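
    # Examples of subtype classification:
    #
    #     Token.make_operand('3.5').subtype   == Token.NUMBER
    #     Token.make_operand('"hi"').subtype  == Token.TEXT
    #     Token.make_operand('TRUE').subtype  == Token.LOGICAL
    #     Token.make_operand('#REF!').subtype == Token.ERROR
    #     Token.make_operand('A1:B2').subtype == Token.RANGE  # fallback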

    # Subexpressions
    #
    # There are 3 types of `Subexpressions`: functions, array literals, and
    # parentheticals. Subexpressions have 'OPEN' and 'CLOSE' tokens. 'OPEN'
    # is used when parsing the initial expression token (i.e., '(' or '{')
    # and 'CLOSE' is used when parsing the closing expression token ('}' or
    # ')').

    OPEN = "OPEN"
    CLOSE = "CLOSE"

    @classmethod
    def make_subexp(cls, value, func=False):
        """
        Create a subexpression token.

        `value`: The value of the token
        `func`: If True, force the token to be of type FUNC
        """
        assert value[-1] in ('{', '}', '(', ')')
        if func:
            assert re.match(r'.+\(|\)', value)
            type_ = Token.FUNC
        elif value in '{}':
            type_ = Token.ARRAY
        elif value in '()':
            type_ = Token.PAREN
        else:
            type_ = Token.FUNC
        subtype = cls.CLOSE if value in ')}' else cls.OPEN
        return cls(value, type_, subtype)

    def get_closer(self):
        """Return a closing token that matches this token's type."""
        assert self.type in (self.FUNC, self.ARRAY, self.PAREN)
        assert self.subtype == self.OPEN
        value = "}" if self.type == self.ARRAY else ")"
        return self.make_subexp(value, func=self.type == self.FUNC)
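
    # Example: Token.make_subexp("SUM(") is a FUNC OPEN token, and calling
    # .get_closer() on it returns a FUNC CLOSE token with value ")";
    # make_subexp("{") is ARRAY OPEN and closes with "}".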

    # Separator tokens
    #
    # Argument separators always have type 'SEP' and can have one of two
    # subtypes: 'ARG', 'ROW'. 'ARG' is used for the ',' token, when used to
    # delimit either function arguments or array elements. 'ROW' is used
    # for the ';' token, which is always used to delimit rows in an array
    # literal.

    ARG = "ARG"
    ROW = "ROW"

    @classmethod
    def make_separator(cls, value):
        """Create a separator token."""
        assert value in (',', ';')
        subtype = cls.ARG if value == ',' else cls.ROW
        return cls(value, cls.SEP, subtype)
|