#
# Secret Labs' Regular Expression Engine
#
# convert re-style regular expression to sre pattern
#
# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

# XXX: show string offset and offending character for all errors
  12. import sys
  13. from sre_constants import *
# Characters with special meaning in a pattern.  _parse() treats any token
# whose first character is not in this string as a plain literal.
SPECIAL_CHARS = ".\\[{()*+?^$|"
# Characters that start a repeat operator applied to the previous item.
REPEAT_CHARS = "*+?{"

# Character sets used for membership tests while scanning escapes,
# repeat counts and group references.
DIGITS = set("0123456789")

OCTDIGITS = set("01234567")
HEXDIGITS = set("0123456789abcdefABCDEF")
# Consulted only when deciding whether to emit a "bad escape" warning.
ASCIILETTERS = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")

# Characters skipped by _parse() when SRE_FLAG_VERBOSE is set.
WHITESPACE = set(" \t\n\r\v\f")

# Escape tokens that stand for a single literal character; each value is
# an (opcode, character code) pair.
ESCAPES = {
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
    r"\\": (LITERAL, ord("\\"))
}

# Escape tokens that stand for a position assertion (AT) or a character
# category wrapped in a one-element IN set.  Note that r"\b" appears in both
# tables: _escape() consults CATEGORIES first (word boundary), while
# _class_escape() consults ESCAPES first (backspace).
CATEGORIES = {
    r"\A": (AT, AT_BEGINNING_STRING), # start of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END_STRING), # end of string
}

# Inline flag letters accepted in a "(?...)" flags group, mapped to the
# flag bits that _parse() ORs into the pattern state.
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}
  54. class Pattern:
  55. # master pattern object. keeps track of global attributes
  56. def __init__(self):
  57. self.flags = 0
  58. self.open = []
  59. self.groups = 1
  60. self.groupdict = {}
  61. self.lookbehind = 0
  62. def opengroup(self, name=None):
  63. gid = self.groups
  64. self.groups = gid + 1
  65. if name is not None:
  66. ogid = self.groupdict.get(name, None)
  67. if ogid is not None:
  68. raise error, ("redefinition of group name %s as group %d; "
  69. "was group %d" % (repr(name), gid, ogid))
  70. self.groupdict[name] = gid
  71. self.open.append(gid)
  72. return gid
  73. def closegroup(self, gid):
  74. self.open.remove(gid)
  75. def checkgroup(self, gid):
  76. return gid < self.groups and gid not in self.open
  77. class SubPattern:
  78. # a subpattern, in intermediate form
  79. def __init__(self, pattern, data=None):
  80. self.pattern = pattern
  81. if data is None:
  82. data = []
  83. self.data = data
  84. self.width = None
  85. def dump(self, level=0):
  86. seqtypes = (tuple, list)
  87. for op, av in self.data:
  88. print level*" " + op,
  89. if op == IN:
  90. # member sublanguage
  91. print
  92. for op, a in av:
  93. print (level+1)*" " + op, a
  94. elif op == BRANCH:
  95. print
  96. for i, a in enumerate(av[1]):
  97. if i:
  98. print level*" " + "or"
  99. a.dump(level+1)
  100. elif op == GROUPREF_EXISTS:
  101. condgroup, item_yes, item_no = av
  102. print condgroup
  103. item_yes.dump(level+1)
  104. if item_no:
  105. print level*" " + "else"
  106. item_no.dump(level+1)
  107. elif isinstance(av, seqtypes):
  108. nl = 0
  109. for a in av:
  110. if isinstance(a, SubPattern):
  111. if not nl:
  112. print
  113. a.dump(level+1)
  114. nl = 1
  115. else:
  116. print a,
  117. nl = 0
  118. if not nl:
  119. print
  120. else:
  121. print av
  122. def __repr__(self):
  123. return repr(self.data)
  124. def __len__(self):
  125. return len(self.data)
  126. def __delitem__(self, index):
  127. del self.data[index]
  128. def __getitem__(self, index):
  129. if isinstance(index, slice):
  130. return SubPattern(self.pattern, self.data[index])
  131. return self.data[index]
  132. def __setitem__(self, index, code):
  133. self.data[index] = code
  134. def insert(self, index, code):
  135. self.data.insert(index, code)
  136. def append(self, code):
  137. self.data.append(code)
  138. def getwidth(self):
  139. # determine the width (min, max) for this subpattern
  140. if self.width:
  141. return self.width
  142. lo = hi = 0
  143. UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
  144. REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
  145. for op, av in self.data:
  146. if op is BRANCH:
  147. i = MAXREPEAT - 1
  148. j = 0
  149. for av in av[1]:
  150. l, h = av.getwidth()
  151. i = min(i, l)
  152. j = max(j, h)
  153. lo = lo + i
  154. hi = hi + j
  155. elif op is CALL:
  156. i, j = av.getwidth()
  157. lo = lo + i
  158. hi = hi + j
  159. elif op is SUBPATTERN:
  160. i, j = av[1].getwidth()
  161. lo = lo + i
  162. hi = hi + j
  163. elif op in REPEATCODES:
  164. i, j = av[2].getwidth()
  165. lo = lo + i * av[0]
  166. hi = hi + j * av[1]
  167. elif op in UNITCODES:
  168. lo = lo + 1
  169. hi = hi + 1
  170. elif op == SUCCESS:
  171. break
  172. self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
  173. return self.width
  174. class Tokenizer:
  175. def __init__(self, string):
  176. self.string = string
  177. self.index = 0
  178. self.__next()
  179. def __next(self):
  180. if self.index >= len(self.string):
  181. self.next = None
  182. return
  183. char = self.string[self.index]
  184. if char[0] == "\\":
  185. try:
  186. c = self.string[self.index + 1]
  187. except IndexError:
  188. raise error, "bogus escape (end of line)"
  189. char = char + c
  190. self.index = self.index + len(char)
  191. self.next = char
  192. def match(self, char, skip=1):
  193. if char == self.next:
  194. if skip:
  195. self.__next()
  196. return 1
  197. return 0
  198. def get(self):
  199. this = self.next
  200. self.__next()
  201. return this
  202. def tell(self):
  203. return self.index, self.next
  204. def seek(self, index):
  205. self.index, self.next = index
  206. def isident(char):
  207. return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
  208. def isdigit(char):
  209. return "0" <= char <= "9"
  210. def isname(name):
  211. # check that group name is a valid string
  212. if not isident(name[0]):
  213. return False
  214. for char in name[1:]:
  215. if not isident(char) and not isdigit(char):
  216. return False
  217. return True
  218. def _class_escape(source, escape, nested):
  219. # handle escape code inside character class
  220. code = ESCAPES.get(escape)
  221. if code:
  222. return code
  223. code = CATEGORIES.get(escape)
  224. if code and code[0] == IN:
  225. return code
  226. try:
  227. c = escape[1:2]
  228. if c == "x":
  229. # hexadecimal escape (exactly two digits)
  230. while source.next in HEXDIGITS and len(escape) < 4:
  231. escape = escape + source.get()
  232. escape = escape[2:]
  233. if len(escape) != 2:
  234. raise error, "bogus escape: %s" % repr("\\" + escape)
  235. return LITERAL, int(escape, 16) & 0xff
  236. elif c in OCTDIGITS:
  237. # octal escape (up to three digits)
  238. while source.next in OCTDIGITS and len(escape) < 4:
  239. escape = escape + source.get()
  240. escape = escape[1:]
  241. return LITERAL, int(escape, 8) & 0xff
  242. elif c in DIGITS:
  243. raise error, "bogus escape: %s" % repr(escape)
  244. if len(escape) == 2:
  245. if sys.py3kwarning and c in ASCIILETTERS:
  246. import warnings
  247. if c in 'Uu':
  248. warnings.warn('bad escape %s; Unicode escapes are '
  249. 'supported only since Python 3.3' % escape,
  250. FutureWarning, stacklevel=nested + 6)
  251. else:
  252. warnings.warnpy3k('bad escape %s' % escape,
  253. DeprecationWarning, stacklevel=nested + 6)
  254. return LITERAL, ord(escape[1])
  255. except ValueError:
  256. pass
  257. raise error, "bogus escape: %s" % repr(escape)
  258. def _escape(source, escape, state, nested):
  259. # handle escape code in expression
  260. code = CATEGORIES.get(escape)
  261. if code:
  262. return code
  263. code = ESCAPES.get(escape)
  264. if code:
  265. return code
  266. try:
  267. c = escape[1:2]
  268. if c == "x":
  269. # hexadecimal escape
  270. while source.next in HEXDIGITS and len(escape) < 4:
  271. escape = escape + source.get()
  272. if len(escape) != 4:
  273. raise ValueError
  274. return LITERAL, int(escape[2:], 16) & 0xff
  275. elif c == "0":
  276. # octal escape
  277. while source.next in OCTDIGITS and len(escape) < 4:
  278. escape = escape + source.get()
  279. return LITERAL, int(escape[1:], 8) & 0xff
  280. elif c in DIGITS:
  281. # octal escape *or* decimal group reference (sigh)
  282. if source.next in DIGITS:
  283. escape = escape + source.get()
  284. if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
  285. source.next in OCTDIGITS):
  286. # got three octal digits; this is an octal escape
  287. escape = escape + source.get()
  288. return LITERAL, int(escape[1:], 8) & 0xff
  289. # not an octal escape, so this is a group reference
  290. group = int(escape[1:])
  291. if group < state.groups:
  292. if not state.checkgroup(group):
  293. raise error, "cannot refer to open group"
  294. if state.lookbehind:
  295. import warnings
  296. warnings.warn('group references in lookbehind '
  297. 'assertions are not supported',
  298. RuntimeWarning, stacklevel=nested + 6)
  299. return GROUPREF, group
  300. raise ValueError
  301. if len(escape) == 2:
  302. if sys.py3kwarning and c in ASCIILETTERS:
  303. import warnings
  304. if c in 'Uu':
  305. warnings.warn('bad escape %s; Unicode escapes are '
  306. 'supported only since Python 3.3' % escape,
  307. FutureWarning, stacklevel=nested + 6)
  308. else:
  309. warnings.warnpy3k('bad escape %s' % escape,
  310. DeprecationWarning, stacklevel=nested + 6)
  311. return LITERAL, ord(escape[1])
  312. except ValueError:
  313. pass
  314. raise error, "bogus escape: %s" % repr(escape)
  315. def _parse_sub(source, state, nested):
  316. # parse an alternation: a|b|c
  317. items = []
  318. itemsappend = items.append
  319. sourcematch = source.match
  320. while 1:
  321. itemsappend(_parse(source, state, nested + 1))
  322. if sourcematch("|"):
  323. continue
  324. if not nested:
  325. break
  326. if not source.next or sourcematch(")", 0):
  327. break
  328. else:
  329. raise error, "pattern not properly closed"
  330. if len(items) == 1:
  331. return items[0]
  332. subpattern = SubPattern(state)
  333. subpatternappend = subpattern.append
  334. # check if all items share a common prefix
  335. while 1:
  336. prefix = None
  337. for item in items:
  338. if not item:
  339. break
  340. if prefix is None:
  341. prefix = item[0]
  342. elif item[0] != prefix:
  343. break
  344. else:
  345. # all subitems start with a common "prefix".
  346. # move it out of the branch
  347. for item in items:
  348. del item[0]
  349. subpatternappend(prefix)
  350. continue # check next one
  351. break
  352. # check if the branch can be replaced by a character set
  353. for item in items:
  354. if len(item) != 1 or item[0][0] != LITERAL:
  355. break
  356. else:
  357. # we can store this as a character set instead of a
  358. # branch (the compiler may optimize this even more)
  359. set = []
  360. setappend = set.append
  361. for item in items:
  362. setappend(item[0])
  363. subpatternappend((IN, set))
  364. return subpattern
  365. subpattern.append((BRANCH, (None, items)))
  366. return subpattern
  367. def _parse_sub_cond(source, state, condgroup, nested):
  368. item_yes = _parse(source, state, nested + 1)
  369. if source.match("|"):
  370. item_no = _parse(source, state, nested + 1)
  371. if source.match("|"):
  372. raise error, "conditional backref with more than two branches"
  373. else:
  374. item_no = None
  375. if source.next and not source.match(")", 0):
  376. raise error, "pattern not properly closed"
  377. subpattern = SubPattern(state)
  378. subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
  379. return subpattern
  380. _PATTERNENDERS = set("|)")
  381. _ASSERTCHARS = set("=!<")
  382. _LOOKBEHINDASSERTCHARS = set("=!")
  383. _REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
def _parse(source, state, nested):
    # parse a simple pattern
    #
    # Consumes tokens from `source` until "|", ")" or end of input and
    # returns them as a SubPattern of (opcode, argument) items.
    #
    # source: Tokenizer over the pattern string.
    # state:  Pattern object holding flags, group numbering, group names
    #         and the lookbehind depth; it is updated while parsing.
    # nested: nesting depth; passed on (incremented) to recursive calls
    #         and used to compute the stacklevel of warnings.
    #
    # Raises error for malformed patterns and OverflowError for repeat
    # counts that reach MAXREPEAT.
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    _len = len
    PATTERNENDERS = _PATTERNENDERS
    ASSERTCHARS = _ASSERTCHARS
    LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
    REPEATCODES = _REPEATCODES

    while 1:

        # "|" and ")" are left unconsumed for the caller to inspect
        if source.next in PATTERNENDERS:
            break # end of subpattern
        this = sourceget()
        if this is None:
            break # end of pattern

        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                # discard everything up to the newline or end of input
                while 1:
                    this = sourceget()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
            subpatternappend((LITERAL, ord(this)))

        elif this == "[":
            # character set
            set = []
            setappend = set.append
##          if sourcematch(":"):
##              pass # handle character classes
            if sourcematch("^"):
                setappend((NEGATE, None))
            # check remaining characters
            # `start` is the set as it looks before any member is added;
            # while set == start a "]" is taken as a literal member
            # rather than as the end of the set
            start = set[:]
            while 1:
                this = sourceget()
                if this == "]" and set != start:
                    break
                elif this and this[0] == "\\":
                    code1 = _class_escape(source, this, nested + 1)
                elif this:
                    code1 = LITERAL, ord(this)
                else:
                    raise error, "unexpected end of regular expression"
                if sourcematch("-"):
                    # potential range
                    this = sourceget()
                    if this == "]":
                        # "-" directly before "]" is a literal hyphen.
                        # A category escape arrives wrapped as
                        # (IN, [item]); store the inner item.
                        if code1[0] is IN:
                            code1 = code1[1][0]
                        setappend(code1)
                        setappend((LITERAL, ord("-")))
                        break
                    elif this:
                        if this[0] == "\\":
                            code2 = _class_escape(source, this, nested + 1)
                        else:
                            code2 = LITERAL, ord(this)
                        # both range endpoints must be single literals
                        if code1[0] != LITERAL or code2[0] != LITERAL:
                            raise error, "bad character range"
                        lo = code1[1]
                        hi = code2[1]
                        if hi < lo:
                            raise error, "bad character range"
                        setappend((RANGE, (lo, hi)))
                    else:
                        raise error, "unexpected end of regular expression"
                else:
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    setappend(code1)

            # XXX: <fl> should move set optimization to compiler!
            if _len(set)==1 and set[0][0] is LITERAL:
                subpatternappend(set[0]) # optimization
            elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
                subpatternappend((NOT_LITERAL, set[1][1])) # optimization
            else:
                # XXX: <fl> should add charmap optimization here
                subpatternappend((IN, set))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                min, max = 0, 1
            elif this == "*":
                min, max = 0, MAXREPEAT

            elif this == "+":
                min, max = 1, MAXREPEAT
            elif this == "{":
                # "{}" is a literal brace
                if source.next == "}":
                    subpatternappend((LITERAL, ord(this)))
                    continue
                # remember the position so the tokenizer can be rewound
                # if this turns out not to be a "{m,n}" repeat
                here = source.tell()
                min, max = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo = lo + source.get()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi = hi + sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # no closing brace: treat "{" as a literal and
                    # re-read what followed it as ordinary pattern text
                    subpatternappend((LITERAL, ord(this)))
                    source.seek(here)
                    continue
                if lo:
                    min = int(lo)
                    if min >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                if hi:
                    max = int(hi)
                    if max >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                    if max < min:
                        raise error("bad repeat interval")
            else:
                raise error, "not supported"
            # figure out which item to repeat
            if subpattern:
                # one-element SubPattern slice holding the last item
                item = subpattern[-1:]
            else:
                item = None
            # a position assertion (AT) cannot be repeated
            if not item or (_len(item) == 1 and item[0][0] == AT):
                raise error, "nothing to repeat"
            if item[0][0] in REPEATCODES:
                raise error, "multiple repeat"
            # a trailing "?" selects the MIN_REPEAT form
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (min, max, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (min, max, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            # `group` selects what happens after the "(?..." prefix has
            # been read: 1 = numbered (possibly named) group,
            # 2 = group without a number, 0 = no group contents to parse
            group = 1
            name = None
            condgroup = None
            if sourcematch("?"):
                group = 0
                # options
                if sourcematch("P"):
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error, "unterminated name"
                            if char == ">":
                                break
                            name = name + char
                        group = 1
                        if not name:
                            raise error("missing group name")
                        if not isname(name):
                            raise error("bad character in group name %r" %
                                        name)
                    elif sourcematch("="):
                        # named backreference
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error, "unterminated name"
                            if char == ")":
                                break
                            name = name + char
                        if not name:
                            raise error("missing group name")
                        if not isname(name):
                            raise error("bad character in backref group name "
                                        "%r" % name)
                        gid = state.groupdict.get(name)
                        if gid is None:
                            msg = "unknown group name: {0!r}".format(name)
                            raise error(msg)
                        if state.lookbehind:
                            import warnings
                            warnings.warn('group references in lookbehind '
                                          'assertions are not supported',
                                          RuntimeWarning, stacklevel=nested + 6)
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
                            raise error, "unexpected end of pattern"
                        raise error, "unknown specifier: ?P%s" % char
                elif sourcematch(":"):
                    # non-capturing group
                    group = 2
                elif sourcematch("#"):
                    # comment
                    while 1:
                        if source.next is None or source.next == ")":
                            break
                        sourceget()
                    if not sourcematch(")"):
                        raise error, "unbalanced parenthesis"
                    continue
                elif source.next in ASSERTCHARS:
                    # lookahead assertions
                    char = sourceget()
                    dir = 1
                    if char == "<":
                        if source.next not in LOOKBEHINDASSERTCHARS:
                            raise error, "syntax error"
                        dir = -1 # lookbehind
                        char = sourceget()
                        # counted so group references parsed inside the
                        # assertion body can be warned about
                        state.lookbehind += 1
                    p = _parse_sub(source, state, nested + 1)
                    if dir < 0:
                        state.lookbehind -= 1
                    if not sourcematch(")"):
                        raise error, "unbalanced parenthesis"
                    if char == "=":
                        subpatternappend((ASSERT, (dir, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (dir, p)))
                    continue
                elif sourcematch("("):
                    # conditional backreference group
                    condname = ""
                    while 1:
                        char = sourceget()
                        if char is None:
                            raise error, "unterminated name"
                        if char == ")":
                            break
                        condname = condname + char
                    group = 2
                    if not condname:
                        raise error("missing group name")
                    if isname(condname):
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            msg = "unknown group name: {0!r}".format(condname)
                            raise error(msg)
                    else:
                        # not a valid name: must be a group number
                        try:
                            condgroup = int(condname)
                        except ValueError:
                            raise error, "bad character in group name"
                    if state.lookbehind:
                        import warnings
                        warnings.warn('group references in lookbehind '
                                      'assertions are not supported',
                                      RuntimeWarning, stacklevel=nested + 6)
                else:
                    # flags
                    if not source.next in FLAGS:
                        raise error, "unexpected end of pattern"
                    while source.next in FLAGS:
                        state.flags = state.flags | FLAGS[sourceget()]
            if group:
                # parse group contents
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.opengroup(name)
                # NOTE(review): a condition group number of 0 is falsy
                # here, so such a group is parsed by _parse_sub instead
                # of _parse_sub_cond.
                if condgroup:
                    p = _parse_sub_cond(source, state, condgroup, nested + 1)
                else:
                    p = _parse_sub(source, state, nested + 1)
                if not sourcematch(")"):
                    raise error, "unbalanced parenthesis"
                if group is not None:
                    state.closegroup(group)
                subpatternappend((SUBPATTERN, (group, p)))
            else:
                # flags-only group: the next token must be the closing
                # ")"; anything else is rejected
                while 1:
                    char = sourceget()
                    if char is None:
                        raise error, "unexpected end of pattern"
                    if char == ")":
                        break
                    raise error, "unknown extension"

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpattern.append((AT, AT_END))

        elif this and this[0] == "\\":
            code = _escape(source, this, state, nested + 1)
            subpatternappend(code)

        else:
            raise error, "parser error"

    return subpattern
  678. def parse(str, flags=0, pattern=None):
  679. # parse 're' pattern into list of (opcode, argument) tuples
  680. source = Tokenizer(str)
  681. if pattern is None:
  682. pattern = Pattern()
  683. pattern.flags = flags
  684. pattern.str = str
  685. p = _parse_sub(source, pattern, 0)
  686. if (sys.py3kwarning and
  687. (p.pattern.flags & SRE_FLAG_LOCALE) and
  688. (p.pattern.flags & SRE_FLAG_UNICODE)):
  689. import warnings
  690. warnings.warnpy3k("LOCALE and UNICODE flags are incompatible",
  691. DeprecationWarning, stacklevel=5)
  692. tail = source.get()
  693. if tail == ")":
  694. raise error, "unbalanced parenthesis"
  695. elif tail:
  696. raise error, "bogus characters at end of regular expression"
  697. if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
  698. # the VERBOSE flag was switched on inside the pattern. to be
  699. # on the safe side, we'll parse the whole thing again...
  700. return parse(str, p.pattern.flags)
  701. if flags & SRE_FLAG_DEBUG:
  702. p.dump()
  703. return p
  704. def parse_template(source, pattern):
  705. # parse 're' replacement string into list of literals and
  706. # group references
  707. s = Tokenizer(source)
  708. sget = s.get
  709. p = []
  710. a = p.append
  711. def literal(literal, p=p, pappend=a):
  712. if p and p[-1][0] is LITERAL:
  713. p[-1] = LITERAL, p[-1][1] + literal
  714. else:
  715. pappend((LITERAL, literal))
  716. sep = source[:0]
  717. if type(sep) is type(""):
  718. makechar = chr
  719. else:
  720. makechar = unichr
  721. while 1:
  722. this = sget()
  723. if this is None:
  724. break # end of replacement string
  725. if this and this[0] == "\\":
  726. # group
  727. c = this[1:2]
  728. if c == "g":
  729. name = ""
  730. if s.match("<"):
  731. while 1:
  732. char = sget()
  733. if char is None:
  734. raise error, "unterminated group name"
  735. if char == ">":
  736. break
  737. name = name + char
  738. if not name:
  739. raise error, "missing group name"
  740. try:
  741. index = int(name)
  742. if index < 0:
  743. raise error, "negative group number"
  744. except ValueError:
  745. if not isname(name):
  746. raise error, "bad character in group name"
  747. try:
  748. index = pattern.groupindex[name]
  749. except KeyError:
  750. msg = "unknown group name: {0!r}".format(name)
  751. raise IndexError(msg)
  752. a((MARK, index))
  753. elif c == "0":
  754. if s.next in OCTDIGITS:
  755. this = this + sget()
  756. if s.next in OCTDIGITS:
  757. this = this + sget()
  758. literal(makechar(int(this[1:], 8) & 0xff))
  759. elif c in DIGITS:
  760. isoctal = False
  761. if s.next in DIGITS:
  762. this = this + sget()
  763. if (c in OCTDIGITS and this[2] in OCTDIGITS and
  764. s.next in OCTDIGITS):
  765. this = this + sget()
  766. isoctal = True
  767. literal(makechar(int(this[1:], 8) & 0xff))
  768. if not isoctal:
  769. a((MARK, int(this[1:])))
  770. else:
  771. try:
  772. this = makechar(ESCAPES[this][1])
  773. except KeyError:
  774. if sys.py3kwarning and c in ASCIILETTERS:
  775. import warnings
  776. warnings.warnpy3k('bad escape %s' % this,
  777. DeprecationWarning, stacklevel=4)
  778. literal(this)
  779. else:
  780. literal(this)
  781. # convert template to groups and literals lists
  782. i = 0
  783. groups = []
  784. groupsappend = groups.append
  785. literals = [None] * len(p)
  786. for c, s in p:
  787. if c is MARK:
  788. groupsappend((i, s))
  789. # literal[i] is already None
  790. else:
  791. literals[i] = s
  792. i = i + 1
  793. return groups, literals
  794. def expand_template(template, match):
  795. g = match.group
  796. sep = match.string[:0]
  797. groups, literals = template
  798. literals = literals[:]
  799. try:
  800. for index, group in groups:
  801. literals[index] = s = g(group)
  802. if s is None:
  803. raise error, "unmatched group"
  804. except IndexError:
  805. raise error, "invalid group reference"
  806. return sep.join(literals)