Scanners.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. #=======================================================================
  2. #
  3. # Python Lexical Analyser
  4. #
  5. #
  6. # Scanning an input stream
  7. #
  8. #=======================================================================
  9. from __future__ import absolute_import
  10. import cython
  11. cython.declare(BOL=object, EOL=object, EOF=object, NOT_FOUND=object)
  12. from . import Errors
  13. from .Regexps import BOL, EOL, EOF
  14. NOT_FOUND = object()
class Scanner(object):
    """
    A Scanner is used to read tokens from a stream of characters
    using the token set specified by a Plex.Lexicon.

    Constructor:

      Scanner(lexicon, stream, name = '')

      See the docstring of the __init__ method for details.

    Methods:

      See the docstrings of the individual methods for more
      information.

      read() --> (value, text)
        Reads the next lexical token from the stream.

      position() --> (name, line, col)
        Returns the position of the last token read using the
        read() method.

      begin(state_name)
        Causes scanner to change state.

      produce(value [, text])
        Causes return of a token value to the caller of the
        Scanner.
    """
    # Instance attributes (all assigned in __init__):
    # lexicon = None        # Lexicon supplying the token machine
    # stream = None         # file-like object
    # name = ''             # identifying string (e.g. file name)
    # buffer = ''           # chunk of input currently held in memory
    # buf_start_pos = 0     # position in input of start of buffer
    # next_pos = 0          # position in input of next char to read
    # cur_pos = 0           # position in input of current char
    # cur_line = 1          # line number of current char
    # cur_line_start = 0    # position in input of start of current line
    # start_pos = 0         # position in input of start of token
    # start_line = 0        # line number of start of token
    # start_col = 0         # position in line of start of token
    # text = None           # text of last token read
    # initial_state = None  # Node: DFA start state for the current scanner state
    # state_name = ''       # Name of initial state
    # queue = None          # list of tokens queued by produce()
    # trace = 0             # nonzero enables debug printing
  53. def __init__(self, lexicon, stream, name='', initial_pos=None):
  54. """
  55. Scanner(lexicon, stream, name = '')
  56. |lexicon| is a Plex.Lexicon instance specifying the lexical tokens
  57. to be recognised.
  58. |stream| can be a file object or anything which implements a
  59. compatible read() method.
  60. |name| is optional, and may be the name of the file being
  61. scanned or any other identifying string.
  62. """
  63. self.trace = 0
  64. self.buffer = u''
  65. self.buf_start_pos = 0
  66. self.next_pos = 0
  67. self.cur_pos = 0
  68. self.cur_line = 1
  69. self.start_pos = 0
  70. self.start_line = 0
  71. self.start_col = 0
  72. self.text = None
  73. self.state_name = None
  74. self.lexicon = lexicon
  75. self.stream = stream
  76. self.name = name
  77. self.queue = []
  78. self.initial_state = None
  79. self.begin('')
  80. self.next_pos = 0
  81. self.cur_pos = 0
  82. self.cur_line_start = 0
  83. self.cur_char = BOL
  84. self.input_state = 1
  85. if initial_pos is not None:
  86. self.cur_line, self.cur_line_start = initial_pos[1], -initial_pos[2]
  87. def read(self):
  88. """
  89. Read the next lexical token from the stream and return a
  90. tuple (value, text), where |value| is the value associated with
  91. the token as specified by the Lexicon, and |text| is the actual
  92. string read from the stream. Returns (None, '') on end of file.
  93. """
  94. queue = self.queue
  95. while not queue:
  96. self.text, action = self.scan_a_token()
  97. if action is None:
  98. self.produce(None)
  99. self.eof()
  100. else:
  101. value = action.perform(self, self.text)
  102. if value is not None:
  103. self.produce(value)
  104. result = queue[0]
  105. del queue[0]
  106. return result
    def scan_a_token(self):
        """
        Read the next input sequence recognised by the machine
        and return (text, action). Returns (u'', None) on end of
        file.

        Raises Errors.UnrecognizedInput if the machine blocks without
        having matched anything at the current position.
        """
        self.start_pos = self.cur_pos
        self.start_line = self.cur_line
        # Column relative to the current line start (cur_line_start may be
        # negative when the scanner was created with an initial_pos offset).
        self.start_col = self.cur_pos - self.cur_line_start
        action = self.run_machine_inlined()
        if action is not None:
            if self.trace:
                print("Scanner: read: Performing %s %d:%d" % (
                    action, self.start_pos, self.cur_pos))
            # Slice the matched text out of the in-memory buffer; input
            # positions are converted to buffer indices via buf_start_pos.
            text = self.buffer[
                self.start_pos - self.buf_start_pos:
                self.cur_pos - self.buf_start_pos]
            return (text, action)
        else:
            if self.cur_pos == self.start_pos:
                # Nothing was consumed: step past a sentinel EOL first,
                # then check whether we are genuinely at end of file.
                if self.cur_char is EOL:
                    self.next_char()
                if self.cur_char is None or self.cur_char is EOF:
                    return (u'', None)
            raise Errors.UnrecognizedInput(self, self.state_name)
    def run_machine_inlined(self):
        """
        Inlined version of run_machine for speed.

        Drives the DFA from self.initial_state over the input, remembering
        the most recent accepting state (the b_* "backup" snapshot) so the
        longest match can be restored when the machine blocks. Returns the
        matched action, or None if nothing matched; the scanner's position
        state (cur_pos, cur_line, cur_char, ...) is updated in place.
        """
        state = self.initial_state
        # Copy scanner state into locals: the inner loop is hot and local
        # lookups are much cheaper than attribute access.
        cur_pos = self.cur_pos
        cur_line = self.cur_line
        cur_line_start = self.cur_line_start
        cur_char = self.cur_char
        input_state = self.input_state
        next_pos = self.next_pos
        buffer = self.buffer
        buf_start_pos = self.buf_start_pos
        buf_len = len(buffer)
        # Backup snapshot of the last accepting state seen (for back_up()).
        b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
            None, 0, 0, 0, u'', 0, 0
        trace = self.trace
        while 1:
            if trace:  #TRACE#
                print("State %d, %d/%d:%s -->" % (  #TRACE#
                    state['number'], input_state, cur_pos, repr(cur_char)))  #TRACE#
            # Begin inlined self.save_for_backup()
            #action = state.action #@slow
            action = state['action']  #@fast
            if action is not None:
                # Accepting state: snapshot everything needed to rewind here.
                b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
                    action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos
            # End inlined self.save_for_backup()
            c = cur_char
            #new_state = state.new_state(c) #@slow
            new_state = state.get(c, NOT_FOUND)  #@fast
            if new_state is NOT_FOUND:  #@fast
                # No explicit transition: fall back to the 'else' edge
                # (but never for the empty string / EOF marker).
                new_state = c and state.get('else')  #@fast
            if new_state:
                if trace:  #TRACE#
                    print("State %d" % new_state['number'])  #TRACE#
                state = new_state
                # Begin inlined: self.next_char()
                if input_state == 1:
                    cur_pos = next_pos
                    # Begin inlined: c = self.read_char()
                    buf_index = next_pos - buf_start_pos
                    if buf_index < buf_len:
                        c = buffer[buf_index]
                        next_pos += 1
                    else:
                        # Buffer exhausted: drop already-consumed text
                        # (everything before the current token) and refill.
                        discard = self.start_pos - buf_start_pos
                        data = self.stream.read(0x1000)
                        buffer = self.buffer[discard:] + data
                        self.buffer = buffer
                        buf_start_pos += discard
                        self.buf_start_pos = buf_start_pos
                        buf_len = len(buffer)
                        buf_index -= discard
                        if data:
                            c = buffer[buf_index]
                            next_pos += 1
                        else:
                            c = u''
                    # End inlined: c = self.read_char()
                    if c == u'\n':
                        cur_char = EOL
                        input_state = 2
                    elif not c:
                        cur_char = EOL
                        input_state = 4
                    else:
                        cur_char = c
                elif input_state == 2:
                    # A newline is delivered as two logical chars: EOL then '\n'.
                    cur_char = u'\n'
                    input_state = 3
                elif input_state == 3:
                    cur_line += 1
                    cur_line_start = cur_pos = next_pos
                    cur_char = BOL
                    input_state = 1
                elif input_state == 4:
                    # End of input: deliver EOF once, then '' forever.
                    cur_char = EOF
                    input_state = 5
                else:  # input_state = 5
                    cur_char = u''
                # End inlined self.next_char()
            else:  # not new_state
                if trace:  #TRACE#
                    print("blocked")  #TRACE#
                # Begin inlined: action = self.back_up()
                if b_action is not None:
                    # Rewind to the last accepting state (longest match wins).
                    (action, cur_pos, cur_line, cur_line_start,
                     cur_char, input_state, next_pos) = \
                        (b_action, b_cur_pos, b_cur_line, b_cur_line_start,
                         b_cur_char, b_input_state, b_next_pos)
                else:
                    action = None
                break  # while 1
                # End inlined: action = self.back_up()
        # Write the final machine state back onto the scanner.
        self.cur_pos = cur_pos
        self.cur_line = cur_line
        self.cur_line_start = cur_line_start
        self.cur_char = cur_char
        self.input_state = input_state
        self.next_pos = next_pos
        if trace:  #TRACE#
            if action is not None:  #TRACE#
                print("Doing %s" % action)  #TRACE#
        return action
  237. def next_char(self):
  238. input_state = self.input_state
  239. if self.trace:
  240. print("Scanner: next: %s [%d] %d" % (" " * 20, input_state, self.cur_pos))
  241. if input_state == 1:
  242. self.cur_pos = self.next_pos
  243. c = self.read_char()
  244. if c == u'\n':
  245. self.cur_char = EOL
  246. self.input_state = 2
  247. elif not c:
  248. self.cur_char = EOL
  249. self.input_state = 4
  250. else:
  251. self.cur_char = c
  252. elif input_state == 2:
  253. self.cur_char = u'\n'
  254. self.input_state = 3
  255. elif input_state == 3:
  256. self.cur_line += 1
  257. self.cur_line_start = self.cur_pos = self.next_pos
  258. self.cur_char = BOL
  259. self.input_state = 1
  260. elif input_state == 4:
  261. self.cur_char = EOF
  262. self.input_state = 5
  263. else: # input_state = 5
  264. self.cur_char = u''
  265. if self.trace:
  266. print("--> [%d] %d %s" % (input_state, self.cur_pos, repr(self.cur_char)))
  267. def position(self):
  268. """
  269. Return a tuple (name, line, col) representing the location of
  270. the last token read using the read() method. |name| is the
  271. name that was provided to the Scanner constructor; |line|
  272. is the line number in the stream (1-based); |col| is the
  273. position within the line of the first character of the token
  274. (0-based).
  275. """
  276. return (self.name, self.start_line, self.start_col)
  277. def get_position(self):
  278. """Python accessible wrapper around position(), only for error reporting.
  279. """
  280. return self.position()
  281. def begin(self, state_name):
  282. """Set the current state of the scanner to the named state."""
  283. self.initial_state = (
  284. self.lexicon.get_initial_state(state_name))
  285. self.state_name = state_name
  286. def produce(self, value, text=None):
  287. """
  288. Called from an action procedure, causes |value| to be returned
  289. as the token value from read(). If |text| is supplied, it is
  290. returned in place of the scanned text.
  291. produce() can be called more than once during a single call to an action
  292. procedure, in which case the tokens are queued up and returned one
  293. at a time by subsequent calls to read(), until the queue is empty,
  294. whereupon scanning resumes.
  295. """
  296. if text is None:
  297. text = self.text
  298. self.queue.append((value, text))
    def eof(self):
        """
        Hook invoked by read() when end of file is reached; the default
        implementation does nothing. Override this method if you want
        something to be done at end of file.
        """