123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171 |
- from __future__ import unicode_literals, print_function, absolute_import, division, generators, nested_scopes
- import sys
- import logging
- import ply.lex
- logger = logging.getLogger(__name__)
class JsonPathLexerError(Exception):
    '''
    Error raised by the JsonPath lexer.
    '''
class JsonPathLexer(object):
    '''
    A lexical analyzer for JsonPath, built on PLY (``ply.lex``).

    The ``t_*`` attributes and methods below are the PLY lexer specification.
    PLY reads each rule's regular expression from the rule function's
    docstring, so those docstrings must contain the pattern and nothing else;
    explanatory notes for rules are therefore written as ``#`` comments.
    '''

    def __init__(self, debug=False):
        '''
        Create a lexer.

        :param debug: forwarded as the ``debug`` argument of ``ply.lex.lex``
            each time `tokenize` is called.
        :raises JsonPathLexerError: if this class has no docstring, which is
            what happens under ``PYTHONOPTIMIZE=2`` / ``python -OO``.
        '''
        self.debug = debug

        # The class docstring acts as a canary: if it has been stripped, the
        # rule docstrings (i.e. the regexes) have been stripped as well.
        if self.__doc__ is None:
            raise JsonPathLexerError('Docstrings have been removed! By design of PLY, jsonpath-rw requires docstrings. You must not use PYTHONOPTIMIZE=2 or python -OO.')

    def tokenize(self, string):
        '''
        Maps a string to an iterator over tokens. In other words: [char] -> [token]

        A new PLY lexer is built on every call.  Each yielded token carries an
        extra ``col`` attribute: its ``lexpos`` minus the position of the most
        recent newline matched by ``t_newline`` (0 until one has been seen).

        Because this is a generator, errors surface while iterating.

        :param string: the JsonPath text to tokenize.
        :raises JsonPathLexerError: on an unexpected character, or when the
            input ends while a quoted section is still open.
        '''
        new_lexer = ply.lex.lex(module=self, debug=self.debug, errorlog=logger)
        new_lexer.latest_newline = 0
        new_lexer.string_value = None  # non-None only while inside quotes
        new_lexer.input(string)

        while True:
            t = new_lexer.token()
            if t is None:
                break
            t.col = t.lexpos - new_lexer.latest_newline
            yield t

        # A quoted section that was opened but never closed leaves its
        # accumulator behind.
        if new_lexer.string_value is not None:
            raise JsonPathLexerError('Unexpected EOF in string literal or identifier')

    # ============== PLY Lexer specification ==================
    #
    # This probably should be private but:
    #   - the parser requires access to `tokens` (perhaps they should be
    #     defined in a third, shared dependency)
    #   - things like `literals` might be a legitimate part of the public
    #     interface.
    #
    # Anyhow, it is pythonic to give some rope to hang oneself with :-)

    literals = ['*', '.', '[', ']', '(', ')', '$', ',', ':', '|', '&']

    reserved_words = { 'where': 'WHERE' }

    tokens = ['DOUBLEDOT', 'NUMBER', 'ID', 'NAMED_OPERATOR'] + list(reserved_words.values())

    states = [ ('singlequote', 'exclusive'),
               ('doublequote', 'exclusive'),
               ('backquote', 'exclusive') ]

    # ---- Shared helpers for the three quoted states ----
    # (Names do not start with `t_`, so they are not rule definitions.)

    def _open_quoted(self, t, state):
        # Start accumulating a quoted value and switch the lexer to `state`.
        t.lexer.string_start = t.lexer.lexpos
        t.lexer.string_value = ''
        t.lexer.push_state(state)

    def _close_quoted(self, t, token_type):
        # Turn the closing-quote token into the finished `token_type` token
        # carrying the accumulated text, then return to the previous state.
        t.value = t.lexer.string_value
        t.type = token_type
        t.lexer.string_value = None
        t.lexer.pop_state()
        return t

    def _raise_unexpected_char(self, t, template):
        # `template` takes three %s fields: line, column, offending character.
        raise JsonPathLexerError(template % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))

    # Normal lexing, rather easy
    t_DOUBLEDOT = r'\.\.'
    t_ignore = ' \t'

    # Identifiers; a reserved word is re-typed to its own token type.
    def t_ID(self, t):
        r'[a-zA-Z_@][a-zA-Z0-9_@\-]*'
        t.type = self.reserved_words.get(t.value, 'ID')
        return t

    # Optionally negative integers; the token value becomes an int.
    def t_NUMBER(self, t):
        r'-?\d+'
        t.value = int(t.value)
        return t

    # Single-quoted strings (emitted as ID tokens)
    #
    # NOTE(review): the quoted states are exclusive and their content patterns
    # also match '\n', so newlines inside quotes do not go through t_newline
    # and are not reflected in lineno / latest_newline.
    t_singlequote_ignore = ''

    def t_singlequote(self, t):
        r"'"
        self._open_quoted(t, 'singlequote')

    def t_singlequote_content(self, t):
        r"[^'\\]+"
        t.lexer.string_value += t.value

    # Backslash escape: keep only the escaped character.
    def t_singlequote_escape(self, t):
        r'\\.'
        t.lexer.string_value += t.value[1]

    def t_singlequote_end(self, t):
        r"'"
        return self._close_quoted(t, 'ID')

    def t_singlequote_error(self, t):
        self._raise_unexpected_char(t, 'Error on line %s, col %s while lexing singlequoted field: Unexpected character: %s ')

    # Double-quoted strings (emitted as ID tokens)
    t_doublequote_ignore = ''

    def t_doublequote(self, t):
        r'"'
        self._open_quoted(t, 'doublequote')

    def t_doublequote_content(self, t):
        r'[^"\\]+'
        t.lexer.string_value += t.value

    # Backslash escape: keep only the escaped character.
    def t_doublequote_escape(self, t):
        r'\\.'
        t.lexer.string_value += t.value[1]

    def t_doublequote_end(self, t):
        r'"'
        return self._close_quoted(t, 'ID')

    def t_doublequote_error(self, t):
        self._raise_unexpected_char(t, 'Error on line %s, col %s while lexing doublequoted field: Unexpected character: %s ')

    # Back-quoted "magic" operators (emitted as NAMED_OPERATOR tokens)
    t_backquote_ignore = ''

    def t_backquote(self, t):
        r'`'
        self._open_quoted(t, 'backquote')

    # Backslash escape: keep only the escaped character.
    def t_backquote_escape(self, t):
        r'\\.'
        t.lexer.string_value += t.value[1]

    def t_backquote_content(self, t):
        r"[^`\\]+"
        t.lexer.string_value += t.value

    def t_backquote_end(self, t):
        r'`'
        return self._close_quoted(t, 'NAMED_OPERATOR')

    def t_backquote_error(self, t):
        self._raise_unexpected_char(t, 'Error on line %s, col %s while lexing backquoted operator: Unexpected character: %s ')

    # Counting lines, handling errors
    def t_newline(self, t):
        r'\n'
        t.lexer.lineno += 1
        t.lexer.latest_newline = t.lexpos  # basis for the `col` computation

    def t_error(self, t):
        self._raise_unexpected_char(t, 'Error on line %s, col %s: Unexpected character: %s ')
if __name__ == '__main__':
    # Debugging aid: tokenize everything on standard input and print one
    # line per token, with the value left-aligned in a 20-character column
    # followed by the token type.
    logging.basicConfig()
    lexer = JsonPathLexer(debug=True)
    source_text = sys.stdin.read()
    for token in lexer.tokenize(source_text):
        row = (token.value, token.type)
        print('%-20s%s' % row)
|