# lexer.py — JsonPath lexical analyzer (PLY-based)
  1. from __future__ import unicode_literals, print_function, absolute_import, division, generators, nested_scopes
  2. import sys
  3. import logging
  4. import ply.lex
  5. logger = logging.getLogger(__name__)
  6. class JsonPathLexerError(Exception):
  7. pass
  8. class JsonPathLexer(object):
  9. '''
  10. A Lexical analyzer for JsonPath.
  11. '''
  12. def __init__(self, debug=False):
  13. self.debug = debug
  14. if self.__doc__ == None:
  15. raise JsonPathLexerError('Docstrings have been removed! By design of PLY, jsonpath-rw requires docstrings. You must not use PYTHONOPTIMIZE=2 or python -OO.')
  16. def tokenize(self, string):
  17. '''
  18. Maps a string to an iterator over tokens. In other words: [char] -> [token]
  19. '''
  20. new_lexer = ply.lex.lex(module=self, debug=self.debug, errorlog=logger)
  21. new_lexer.latest_newline = 0
  22. new_lexer.string_value = None
  23. new_lexer.input(string)
  24. while True:
  25. t = new_lexer.token()
  26. if t is None: break
  27. t.col = t.lexpos - new_lexer.latest_newline
  28. yield t
  29. if new_lexer.string_value is not None:
  30. raise JsonPathLexerError('Unexpected EOF in string literal or identifier')
  31. # ============== PLY Lexer specification ==================
  32. #
  33. # This probably should be private but:
  34. # - the parser requires access to `tokens` (perhaps they should be defined in a third, shared dependency)
  35. # - things like `literals` might be a legitimate part of the public interface.
  36. #
  37. # Anyhow, it is pythonic to give some rope to hang oneself with :-)
  38. literals = ['*', '.', '[', ']', '(', ')', '$', ',', ':', '|', '&']
  39. reserved_words = { 'where': 'WHERE' }
  40. tokens = ['DOUBLEDOT', 'NUMBER', 'ID', 'NAMED_OPERATOR'] + list(reserved_words.values())
  41. states = [ ('singlequote', 'exclusive'),
  42. ('doublequote', 'exclusive'),
  43. ('backquote', 'exclusive') ]
  44. # Normal lexing, rather easy
  45. t_DOUBLEDOT = r'\.\.'
  46. t_ignore = ' \t'
  47. def t_ID(self, t):
  48. r'[a-zA-Z_@][a-zA-Z0-9_@\-]*'
  49. t.type = self.reserved_words.get(t.value, 'ID')
  50. return t
  51. def t_NUMBER(self, t):
  52. r'-?\d+'
  53. t.value = int(t.value)
  54. return t
  55. # Single-quoted strings
  56. t_singlequote_ignore = ''
  57. def t_singlequote(self, t):
  58. r"'"
  59. t.lexer.string_start = t.lexer.lexpos
  60. t.lexer.string_value = ''
  61. t.lexer.push_state('singlequote')
  62. def t_singlequote_content(self, t):
  63. r"[^'\\]+"
  64. t.lexer.string_value += t.value
  65. def t_singlequote_escape(self, t):
  66. r'\\.'
  67. t.lexer.string_value += t.value[1]
  68. def t_singlequote_end(self, t):
  69. r"'"
  70. t.value = t.lexer.string_value
  71. t.type = 'ID'
  72. t.lexer.string_value = None
  73. t.lexer.pop_state()
  74. return t
  75. def t_singlequote_error(self, t):
  76. raise JsonPathLexerError('Error on line %s, col %s while lexing singlequoted field: Unexpected character: %s ' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))
  77. # Double-quoted strings
  78. t_doublequote_ignore = ''
  79. def t_doublequote(self, t):
  80. r'"'
  81. t.lexer.string_start = t.lexer.lexpos
  82. t.lexer.string_value = ''
  83. t.lexer.push_state('doublequote')
  84. def t_doublequote_content(self, t):
  85. r'[^"\\]+'
  86. t.lexer.string_value += t.value
  87. def t_doublequote_escape(self, t):
  88. r'\\.'
  89. t.lexer.string_value += t.value[1]
  90. def t_doublequote_end(self, t):
  91. r'"'
  92. t.value = t.lexer.string_value
  93. t.type = 'ID'
  94. t.lexer.string_value = None
  95. t.lexer.pop_state()
  96. return t
  97. def t_doublequote_error(self, t):
  98. raise JsonPathLexerError('Error on line %s, col %s while lexing doublequoted field: Unexpected character: %s ' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))
  99. # Back-quoted "magic" operators
  100. t_backquote_ignore = ''
  101. def t_backquote(self, t):
  102. r'`'
  103. t.lexer.string_start = t.lexer.lexpos
  104. t.lexer.string_value = ''
  105. t.lexer.push_state('backquote')
  106. def t_backquote_escape(self, t):
  107. r'\\.'
  108. t.lexer.string_value += t.value[1]
  109. def t_backquote_content(self, t):
  110. r"[^`\\]+"
  111. t.lexer.string_value += t.value
  112. def t_backquote_end(self, t):
  113. r'`'
  114. t.value = t.lexer.string_value
  115. t.type = 'NAMED_OPERATOR'
  116. t.lexer.string_value = None
  117. t.lexer.pop_state()
  118. return t
  119. def t_backquote_error(self, t):
  120. raise JsonPathLexerError('Error on line %s, col %s while lexing backquoted operator: Unexpected character: %s ' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))
  121. # Counting lines, handling errors
  122. def t_newline(self, t):
  123. r'\n'
  124. t.lexer.lineno += 1
  125. t.lexer.latest_newline = t.lexpos
  126. def t_error(self, t):
  127. raise JsonPathLexerError('Error on line %s, col %s: Unexpected character: %s ' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))
  128. if __name__ == '__main__':
  129. logging.basicConfig()
  130. lexer = JsonPathLexer(debug=True)
  131. for token in lexer.tokenize(sys.stdin.read()):
  132. print('%-20s%s' % (token.value, token.type))