peg.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442
  1. """Macro to easily define recursive-descent PEG parsers"""
  2. import re
  3. from macropy.core.macros import *
  4. from macropy.core.hquotes import macros, hq, u
  5. from macropy.quick_lambda import macros, f
  6. from macropy.case_classes import macros, case
  7. from collections import defaultdict
  8. """
  9. PEG Parser Atoms
  10. ================
  11. Sequence: e1 e2 , Seq
  12. Ordered choice: e1 / e2 | Or
  13. Zero-or-more: e* .rep Rep
  14. One-or-more: e+ .rep1
  15. Optional: e? .opt
  16. And-predicate: &e & And
  17. Not-predicate: !e - Not
  18. """
  19. macros = Macros()
  20. @macros.block
  21. def peg(tree, gen_sym, **kw):
  22. """Macro to easily define recursive-descent PEG parsers"""
  23. potential_targets = [
  24. target.id for stmt in tree
  25. if type(stmt) is Assign
  26. for target in stmt.targets
  27. ]
  28. for statement in tree:
  29. if type(statement) is Assign:
  30. new_tree = process(statement.value, potential_targets, gen_sym)
  31. statement.value = hq[
  32. Parser.Named(lambda: ast[new_tree], [u[statement.targets[0].id]])
  33. ]
  34. return tree
  35. @macros.expr
  36. def peg(tree, gen_sym, **kw):
  37. """Macro to easily define recursive-descent PEG parsers"""
  38. return process(tree, [], gen_sym)
  39. def process(tree, potential_targets, gen_sym):
  40. @Walker
  41. def PegWalker(tree, stop, collect, **kw):
  42. if type(tree) is Str:
  43. stop()
  44. return hq[Parser.Raw(ast[tree])]
  45. if type(tree) is Name and tree.id in potential_targets:
  46. collect(tree.id)
  47. if type(tree) is BinOp and type(tree.op) is RShift:
  48. tree.left, b_left = PegWalker.recurse_collect(tree.left)
  49. tree.right = hq[lambda bindings: ast[tree.right]]
  50. names = distinct(flatten(b_left))
  51. tree.right.args.args = map(f[Name(id = _)], names)
  52. tree.right.args.defaults = [hq[[]]] * len(names)
  53. tree.right.args.kwarg = gen_sym("kw")
  54. stop()
  55. return tree
  56. if type(tree) is BinOp and type(tree.op) is FloorDiv:
  57. tree.left, b_left = PegWalker.recurse_collect(tree.left)
  58. stop()
  59. collect(b_left)
  60. return tree
  61. if type(tree) is Tuple:
  62. result = hq[Parser.Seq([])]
  63. result.args[0].elts = tree.elts
  64. all_bindings = []
  65. for i, elt in enumerate(tree.elts):
  66. result.args[0].elts[i], bindings = PegWalker.recurse_collect(tree.elts[i])
  67. all_bindings.append(bindings)
  68. stop()
  69. collect(all_bindings)
  70. return result
  71. if type(tree) is Compare and type(tree.ops[0]) is Is:
  72. left_tree, bindings = PegWalker.recurse_collect(tree.left)
  73. new_tree = hq[ast[left_tree].bind_to(u[tree.comparators[0].id])]
  74. stop()
  75. collect(bindings + [tree.comparators[0].id])
  76. return new_tree
  77. new_tree = PegWalker.recurse(tree)
  78. return new_tree
  79. def cut():
  80. """Used within a Seq parser (p1, p2, p3,...) to commit the Seq to that
  81. alternative.
  82. After the Seq passes the `cut`, any failure becomes fatal, and backtracking
  83. is not performed. This improves performance and improves the quality
  84. of the error messages."""
  85. @case
  86. class Input(string, index):
  87. pass
  88. @case
  89. class Success(output, bindings, remaining):
  90. """
  91. output: the final value that was parsed
  92. bindings: named value bindings, created by the `is` keyword
  93. remaining: an Input representing the unread portion of the input
  94. """
  95. pass
  96. @case
  97. class Failure(remaining, failed, fatal | False):
  98. """
  99. remaining: an Input representing the unread portion of the input
  100. failed: a List[Parser], containing the stack of parsers in
  101. effect at time of failure
  102. fatal: whether a parent parser which receives this result from a child
  103. should continue backtracking
  104. """
  105. @property
  106. def index(self):
  107. return self.remaining.index
  108. @property
  109. def trace(self):
  110. return [x for f in self.failed for x in f.trace_name]
  111. @property
  112. def msg(self):
  113. """A pretty human-readable error message representing this Failure"""
  114. line_length = 60
  115. string = self.remaining.string
  116. index = self.index
  117. line_start = string.rfind('\n', 0, index+1)
  118. line_end = string.find('\n', index+1, len(string))
  119. if line_end == -1:
  120. line_end = len(string)
  121. line_num = string.count('\n', 0, index)
  122. offset = min(index - line_start , 40)
  123. msg = "index: " + str(self.index) + ", line: " + str(line_num + 1) + ", col: " + str(index - line_start) + "\n" + \
  124. " / ".join(self.trace) + "\n" + \
  125. string[line_start+1:line_end][index - offset - line_start:index+line_length-offset - line_start] + "\n" + \
  126. (offset-1) * " " + "^" + "\n" +\
  127. "expected: " + self.failed[-1].short_str()
  128. return msg
  129. class ParseError(Exception):
  130. """An exception that wraps a Failure"""
  131. def __init__(self, failure):
  132. self.failure = failure
  133. Exception.__init__(self, failure.msg)
  134. @case
  135. class Parser:
  136. def parse(self, string):
  137. """String -> value; throws ParseError in case of failure"""
  138. res = Parser.Full(self).parse_input(Input(string, 0))
  139. if type(res) is Success:
  140. return res.output
  141. else:
  142. raise ParseError(res)
  143. def parse_partial(self, string):
  144. """String -> Success | Failure"""
  145. return self.parse_input(Input(string, 0))
  146. def parse_string(self, string):
  147. """String -> Success | Failure"""
  148. return Parser.Full(self).parse_input(Input(string, 0))
  149. def parse_input(self, input):
  150. """Input -> Success | Failure"""
  151. @property
  152. def trace_name(self):
  153. return []
  154. def bind_to(self, string):
  155. return Parser.Named(lambda: self, [string])
  156. def __and__(self, other): return Parser.And([self, other])
  157. def __or__(self, other): return Parser.Or([self, other])
  158. def __neg__(self): return Parser.Not(self)
  159. @property
  160. def join(self):
  161. return self // "".join
  162. @property
  163. def rep1(self):
  164. return Parser.And([Parser.Rep(self), self])
  165. @property
  166. def rep(self):
  167. return Parser.Rep(self)
  168. def rep1_with(self, other):
  169. return Parser.Seq([self, Parser.Seq([other, self]).rep]) // (lambda x: [x[0]] + [y[1] for y in x[1]])
  170. def rep_with(self, other):
  171. return self.rep1_with(other) | Parser.Succeed([])
  172. @property
  173. def opt(self):
  174. return Parser.Or([self, Parser.Raw("")])
  175. @property
  176. def r(self):
  177. """Creates a regex-matching parser from the given raw parser"""
  178. return Parser.Regex(self.string)
  179. def __mul__(self, n): return Parser.RepN(self, n)
  180. def __floordiv__(self, other): return Parser.Transform(self, other)
  181. def __pow__(self, other): return Parser.Transform(self, lambda x: other(*x))
  182. def __rshift__(self, other): return Parser.TransformBound(self, other)
  183. class Full(parser):
  184. def parse_input(self, input):
  185. res = self.parser.parse_input(input)
  186. if type(res) is Success and res.remaining.index < len(input.string):
  187. return Failure(res.remaining, [self])
  188. else:
  189. return res
  190. def short_str(self):
  191. return self.parser.short_str()
  192. class Raw(string):
  193. def parse_input(self, input):
  194. if input.string[input.index:].startswith(self.string):
  195. return Success(self.string, {}, input.copy(index = input.index + len(self.string)))
  196. else:
  197. return Failure(input, [self])
  198. def short_str(self):
  199. return repr(self.string)
  200. class Regex(regex_string):
  201. def parse_input(self, input):
  202. match = re.match(self.regex_string, input.string[input.index:])
  203. if match:
  204. group = match.group()
  205. return Success(group, {}, input.copy(index = input.index + len(group)))
  206. else:
  207. return Failure(input, [self])
  208. def short_str(self):
  209. return repr(self.regex_string) + ".r"
  210. class Seq(children):
  211. def parse_input(self, input):
  212. current_input = input
  213. results = []
  214. result_dict = defaultdict(lambda: [])
  215. committed = False
  216. for child in self.children:
  217. if child is cut:
  218. committed = True
  219. else:
  220. res = child.parse_input(current_input)
  221. if type(res) is Failure:
  222. if committed or res.fatal:
  223. return Failure(res.remaining, [self] + res.failed, True)
  224. else:
  225. return res
  226. current_input = res.remaining
  227. results.append(res.output)
  228. for k, v in res.bindings.items():
  229. result_dict[k] = v
  230. return Success(results, result_dict, current_input)
  231. def short_str(self):
  232. return "(" + ", ".join(map(lambda x: x.short_str(), self.children)) + ")"
  233. class Or(children):
  234. def parse_input(self, input):
  235. for child in self.children:
  236. res = child.parse_input(input)
  237. if type(res) is Success:
  238. return res
  239. elif res.fatal:
  240. res.failed = [self] + res.failed
  241. return res
  242. return Failure(input, [self])
  243. def __or__(self, other): return Parser.Or(self.children + [other])
  244. def short_str(self):
  245. return "(" + " | ".join(map(lambda x: x.short_str(), self.children)) + ")"
  246. class And(children):
  247. def parse_input(self, input):
  248. results = [child.parse_input(input) for child in self.children]
  249. failures = [res for res in results if type(res) is Failure]
  250. if failures == []:
  251. return results[0]
  252. else:
  253. failures[0].failed = [self] + failures[0].failed
  254. return failures[0]
  255. def __and__(self, other): return Parser.And(self.children + [other])
  256. def short_str(self):
  257. return "(" + " & ".join(map(lambda x: x.short_str(), self.children)) + ")"
  258. class Not(parser):
  259. def parse_input(self, input):
  260. if type(self.parser.parse_input(input)) is Success:
  261. return Failure(input, [self])
  262. else:
  263. return Success(None, {}, input)
  264. def short_str(self):
  265. return "-" + self.parser.short_str()
  266. class Rep(parser):
  267. def parse_input(self, input):
  268. current_input = input
  269. results = []
  270. result_dict = defaultdict(lambda: [])
  271. while True:
  272. res = self.parser.parse_input(current_input)
  273. if type(res) is Failure:
  274. if res.fatal:
  275. res.failed = [self] + res.failed
  276. return res
  277. else:
  278. return Success(results, result_dict, current_input)
  279. current_input = res.remaining
  280. for k, v in res.bindings.items():
  281. result_dict[k] = result_dict[k] + [v]
  282. results.append(res.output)
  283. class RepN(parser, n):
  284. def parse_input(self, input):
  285. current_input = input
  286. results = []
  287. result_dict = defaultdict(lambda: [])
  288. for i in range(self.n):
  289. res = self.parser.parse_input(current_input)
  290. if type(res) is Failure:
  291. res.failed = [self] + res.failed
  292. return res
  293. current_input = res.remaining
  294. for k, v in res.bindings.items():
  295. result_dict[k] = result_dict[k] + [v]
  296. results.append(res.output)
  297. return Success(results, result_dict, current_input)
  298. def short_str(self):
  299. return self.parser.short_str() + "*" + n
  300. class Transform(parser, func):
  301. def parse_input(self, input):
  302. res = self.parser.parse_input(input)
  303. if type(res) is Success:
  304. res.output = self.func(res.output)
  305. else:
  306. res.failed = [self] + res.failed
  307. return res
  308. def short_str(self):
  309. return self.parser.short_str()
  310. class TransformBound(parser, func):
  311. def parse_input(self, input):
  312. res = self.parser.parse_input(input)
  313. if type(res) is Success:
  314. res.output = self.func(**res.bindings)
  315. res.bindings = {}
  316. else:
  317. res.failed = [self] + res.failed
  318. return res
  319. def short_str(self):
  320. return self.parser.short_str()
  321. class Named(parser_thunk, trace_name):
  322. self.stored_parser = None
  323. @property
  324. def parser(self):
  325. if not self.stored_parser:
  326. self.stored_parser = self.parser_thunk()
  327. return self.stored_parser
  328. def parse_input(self, input):
  329. res = self.parser.parse_input(input)
  330. if type(res) is Success:
  331. res.bindings = {self.trace_name[0]: res.output}
  332. else:
  333. res.failed = [self] + res.failed
  334. return res
  335. def short_str(self):
  336. return self.trace_name[0]
  337. class Succeed(string):
  338. def parse_input(self, input):
  339. return Success(self.string, {}, input)
  340. class Fail():
  341. def parse_input(self, input):
  342. return Failure(input, [self])
  343. def short_str(self):
  344. return "fail"