regex.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499
  1. # coding=utf-8
  2. #
  3. # This file is part of Hypothesis, which may be found at
  4. # https://github.com/HypothesisWorks/hypothesis-python
  5. #
  6. # Most of this work is copyright (C) 2013-2018 David R. MacIver
  7. # (david@drmaciver.com), but it contains contributions by others. See
  8. # CONTRIBUTING.rst for a full list of people who may hold copyright, and
  9. # consult the git log if you need to determine who owns an individual
  10. # contribution.
  11. #
  12. # This Source Code Form is subject to the terms of the Mozilla Public License,
  13. # v. 2.0. If a copy of the MPL was not distributed with this file, You can
  14. # obtain one at http://mozilla.org/MPL/2.0/.
  15. #
  16. # END HEADER
  17. from __future__ import division, print_function, absolute_import
  18. import re
  19. import sys
  20. import operator
  21. import sre_parse
  22. import sre_constants as sre
  23. import hypothesis.strategies as st
  24. from hypothesis import reject
  25. from hypothesis.internal.compat import PY3, hrange, hunichr, text_type, \
  26. int_to_byte
  27. HAS_SUBPATTERN_FLAGS = sys.version_info[:2] >= (3, 6)
  28. UNICODE_CATEGORIES = set([
  29. 'Cf', 'Cn', 'Co', 'LC', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
  30. 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe',
  31. 'Pf', 'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl',
  32. 'Zp', 'Zs',
  33. ])
  34. SPACE_CHARS = set(u' \t\n\r\f\v')
  35. UNICODE_SPACE_CHARS = SPACE_CHARS | set(u'\x1c\x1d\x1e\x1f\x85')
  36. UNICODE_DIGIT_CATEGORIES = set(['Nd'])
  37. UNICODE_SPACE_CATEGORIES = set(['Zs', 'Zl', 'Zp'])
  38. UNICODE_LETTER_CATEGORIES = set(['LC', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu'])
  39. UNICODE_WORD_CATEGORIES = UNICODE_LETTER_CATEGORIES | set(['Nd', 'Nl', 'No'])
  40. # This is verbose, but correct on all versions of Python
  41. BYTES_ALL = set(int_to_byte(i) for i in range(256))
  42. BYTES_DIGIT = set(b for b in BYTES_ALL if re.match(b'\\d', b))
  43. BYTES_SPACE = set(b for b in BYTES_ALL if re.match(b'\\s', b))
  44. BYTES_WORD = set(b for b in BYTES_ALL if re.match(b'\\w', b))
  45. BYTES_LOOKUP = {
  46. sre.CATEGORY_DIGIT: BYTES_DIGIT,
  47. sre.CATEGORY_SPACE: BYTES_SPACE,
  48. sre.CATEGORY_WORD: BYTES_WORD,
  49. sre.CATEGORY_NOT_DIGIT: BYTES_ALL - BYTES_DIGIT,
  50. sre.CATEGORY_NOT_SPACE: BYTES_ALL - BYTES_SPACE,
  51. sre.CATEGORY_NOT_WORD: BYTES_ALL - BYTES_WORD,
  52. }
  53. # On Python < 3.4 (including 2.7), the following unicode chars are weird.
  54. # They are matched by the \W, meaning 'not word', but unicodedata.category(c)
  55. # returns one of the word categories above. There's special handling below.
  56. HAS_WEIRD_WORD_CHARS = sys.version_info[:2] < (3, 4)
  57. UNICODE_WEIRD_NONWORD_CHARS = set(u'\U00012432\U00012433\U00012456\U00012457')
  58. GROUP_CACHE_STRATEGY = st.shared(
  59. st.builds(dict), key='hypothesis.regex.group_cache'
  60. )
  61. @st.composite
  62. def update_group(draw, group_name, strategy):
  63. cache = draw(GROUP_CACHE_STRATEGY)
  64. result = draw(strategy)
  65. cache[group_name] = result
  66. return result
  67. @st.composite
  68. def reuse_group(draw, group_name):
  69. cache = draw(GROUP_CACHE_STRATEGY)
  70. try:
  71. return cache[group_name]
  72. except KeyError:
  73. reject()
  74. @st.composite
  75. def group_conditional(draw, group_name, if_yes, if_no):
  76. cache = draw(GROUP_CACHE_STRATEGY)
  77. if group_name in cache:
  78. return draw(if_yes)
  79. else:
  80. return draw(if_no)
  81. @st.composite
  82. def clear_cache_after_draw(draw, base_strategy):
  83. cache = draw(GROUP_CACHE_STRATEGY)
  84. result = draw(base_strategy)
  85. cache.clear()
  86. return result
  87. class Context(object):
  88. __slots__ = ['flags']
  89. def __init__(self, groups=None, flags=0):
  90. self.flags = flags
  91. class CharactersBuilder(object):
  92. """Helper object that allows to configure `characters` strategy with
  93. various unicode categories and characters. Also allows negation of
  94. configured set.
  95. :param negate: If True, configure :func:`hypothesis.strategies.characters`
  96. to match anything other than configured character set
  97. :param flags: Regex flags. They affect how and which characters are matched
  98. """
  99. def __init__(self, negate=False, flags=0):
  100. self._categories = set()
  101. self._whitelist_chars = set()
  102. self._blacklist_chars = set()
  103. self._negate = negate
  104. self._ignorecase = flags & re.IGNORECASE
  105. self._unicode = not bool(flags & re.ASCII) \
  106. if PY3 else bool(flags & re.UNICODE)
  107. self.code_to_char = hunichr
  108. @property
  109. def strategy(self):
  110. """Returns resulting strategy that generates configured char set."""
  111. max_codepoint = None if self._unicode else 127
  112. if self._negate:
  113. black_chars = self._blacklist_chars - self._whitelist_chars
  114. return st.characters(
  115. blacklist_categories=self._categories | {'Cc', 'Cs'},
  116. blacklist_characters=self._whitelist_chars,
  117. whitelist_characters=black_chars,
  118. max_codepoint=max_codepoint,
  119. )
  120. white_chars = self._whitelist_chars - self._blacklist_chars
  121. return st.characters(
  122. whitelist_categories=self._categories,
  123. blacklist_characters=self._blacklist_chars,
  124. whitelist_characters=white_chars,
  125. max_codepoint=max_codepoint,
  126. )
  127. def add_category(self, category):
  128. """Update unicode state to match sre_parse object ``category``."""
  129. if category == sre.CATEGORY_DIGIT:
  130. self._categories |= UNICODE_DIGIT_CATEGORIES
  131. elif category == sre.CATEGORY_NOT_DIGIT:
  132. self._categories |= UNICODE_CATEGORIES - UNICODE_DIGIT_CATEGORIES
  133. elif category == sre.CATEGORY_SPACE:
  134. self._categories |= UNICODE_SPACE_CATEGORIES
  135. self._whitelist_chars |= UNICODE_SPACE_CHARS \
  136. if self._unicode else SPACE_CHARS
  137. elif category == sre.CATEGORY_NOT_SPACE:
  138. self._categories |= UNICODE_CATEGORIES - UNICODE_SPACE_CATEGORIES
  139. self._blacklist_chars |= UNICODE_SPACE_CHARS \
  140. if self._unicode else SPACE_CHARS
  141. elif category == sre.CATEGORY_WORD:
  142. self._categories |= UNICODE_WORD_CATEGORIES
  143. self._whitelist_chars.add(u'_')
  144. if HAS_WEIRD_WORD_CHARS and self._unicode: # pragma: no cover
  145. # This code is workaround of weird behavior in
  146. # specific Python versions and run only on those versions
  147. self._blacklist_chars |= UNICODE_WEIRD_NONWORD_CHARS
  148. elif category == sre.CATEGORY_NOT_WORD:
  149. self._categories |= UNICODE_CATEGORIES - UNICODE_WORD_CATEGORIES
  150. self._blacklist_chars.add(u'_')
  151. if HAS_WEIRD_WORD_CHARS and self._unicode: # pragma: no cover
  152. # This code is workaround of weird behavior in
  153. # specific Python versions and run only on those versions
  154. self._whitelist_chars |= UNICODE_WEIRD_NONWORD_CHARS
  155. else: # pragma: no cover
  156. raise AssertionError('Unknown character category: %s' % category)
  157. def add_char(self, char):
  158. """Add given char to the whitelist."""
  159. c = self.code_to_char(char)
  160. self._whitelist_chars.add(c)
  161. if self._ignorecase and \
  162. re.match(c, c.swapcase(), re.IGNORECASE) is not None:
  163. self._whitelist_chars.add(c.swapcase())
  164. class BytesBuilder(CharactersBuilder):
  165. def __init__(self, negate=False, flags=0):
  166. self._whitelist_chars = set()
  167. self._blacklist_chars = set()
  168. self._negate = negate
  169. self._ignorecase = flags & re.IGNORECASE
  170. self.code_to_char = int_to_byte
  171. @property
  172. def strategy(self):
  173. """Returns resulting strategy that generates configured char set."""
  174. allowed = self._whitelist_chars
  175. if self._negate:
  176. allowed = BYTES_ALL - allowed
  177. return st.sampled_from(sorted(allowed))
  178. def add_category(self, category):
  179. """Update characters state to match sre_parse object ``category``."""
  180. self._whitelist_chars |= BYTES_LOOKUP[category]
  181. @st.composite
  182. def maybe_pad(draw, regex, strategy, left_pad_strategy, right_pad_strategy):
  183. """Attempt to insert padding around the result of a regex draw while
  184. preserving the match."""
  185. result = draw(strategy)
  186. left_pad = draw(left_pad_strategy)
  187. if left_pad and regex.search(left_pad + result):
  188. result = left_pad + result
  189. right_pad = draw(right_pad_strategy)
  190. if right_pad and regex.search(result + right_pad):
  191. result += right_pad
  192. return result
  193. def base_regex_strategy(regex, parsed=None):
  194. if parsed is None:
  195. parsed = sre_parse.parse(regex.pattern, flags=regex.flags)
  196. return clear_cache_after_draw(_strategy(
  197. parsed,
  198. Context(flags=regex.flags),
  199. isinstance(regex.pattern, text_type)
  200. ))
  201. def regex_strategy(regex):
  202. if not hasattr(regex, 'pattern'):
  203. regex = re.compile(regex)
  204. is_unicode = isinstance(regex.pattern, text_type)
  205. parsed = sre_parse.parse(regex.pattern, flags=regex.flags)
  206. if not parsed:
  207. if is_unicode:
  208. return st.text()
  209. else:
  210. return st.binary()
  211. if is_unicode:
  212. base_padding_strategy = st.text(average_size=1)
  213. empty = st.just(u'')
  214. newline = st.just(u'\n')
  215. else:
  216. base_padding_strategy = st.binary(average_size=1)
  217. empty = st.just(b'')
  218. newline = st.just(b'\n')
  219. right_pad = base_padding_strategy
  220. left_pad = base_padding_strategy
  221. if parsed[-1][0] == sre.AT:
  222. if parsed[-1][1] == sre.AT_END_STRING:
  223. right_pad = empty
  224. elif parsed[-1][1] == sre.AT_END:
  225. if regex.flags & re.MULTILINE:
  226. right_pad = st.one_of(
  227. empty,
  228. st.builds(operator.add, newline, right_pad)
  229. )
  230. else:
  231. right_pad = st.one_of(empty, newline)
  232. if parsed[0][0] == sre.AT:
  233. if parsed[0][1] == sre.AT_BEGINNING_STRING:
  234. left_pad = empty
  235. elif parsed[0][1] == sre.AT_BEGINNING:
  236. if regex.flags & re.MULTILINE:
  237. left_pad = st.one_of(
  238. empty,
  239. st.builds(operator.add, left_pad, newline),
  240. )
  241. else:
  242. left_pad = empty
  243. base = base_regex_strategy(regex, parsed).filter(regex.search)
  244. return maybe_pad(regex, base, left_pad, right_pad)
  245. def _strategy(codes, context, is_unicode):
  246. """Convert SRE regex parse tree to strategy that generates strings matching
  247. that regex represented by that parse tree.
  248. `codes` is either a list of SRE regex elements representations or a
  249. particular element representation. Each element is a tuple of element code
  250. (as string) and parameters. E.g. regex 'ab[0-9]+' compiles to following
  251. elements:
  252. [
  253. (LITERAL, 97),
  254. (LITERAL, 98),
  255. (MAX_REPEAT, (1, 4294967295, [
  256. (IN, [
  257. (RANGE, (48, 57))
  258. ])
  259. ]))
  260. ]
  261. The function recursively traverses regex element tree and converts each
  262. element to strategy that generates strings that match that element.
  263. Context stores
  264. 1. List of groups (for backreferences)
  265. 2. Active regex flags (e.g. IGNORECASE, DOTALL, UNICODE, they affect
  266. behavior of various inner strategies)
  267. """
  268. def recurse(codes):
  269. return _strategy(codes, context, is_unicode)
  270. if is_unicode:
  271. empty = u''
  272. to_char = hunichr
  273. else:
  274. empty = b''
  275. to_char = int_to_byte
  276. binary_char = st.binary(min_size=1, max_size=1)
  277. if not isinstance(codes, tuple):
  278. # List of codes
  279. strategies = []
  280. i = 0
  281. while i < len(codes):
  282. if codes[i][0] == sre.LITERAL and \
  283. not context.flags & re.IGNORECASE:
  284. # Merge subsequent "literals" into one `just()` strategy
  285. # that generates corresponding text if no IGNORECASE
  286. j = i + 1
  287. while j < len(codes) and codes[j][0] == sre.LITERAL:
  288. j += 1
  289. if i + 1 < j:
  290. strategies.append(st.just(
  291. empty.join([to_char(charcode)
  292. for (_, charcode) in codes[i:j]])
  293. ))
  294. i = j
  295. continue
  296. strategies.append(recurse(codes[i]))
  297. i += 1
  298. # We handle this separately at the top level, but some regex can
  299. # contain empty lists internally, so we need to handle this here too.
  300. if not strategies:
  301. return st.just(empty)
  302. if len(strategies) == 1:
  303. return strategies[0]
  304. return st.tuples(*strategies).map(empty.join)
  305. else:
  306. # Single code
  307. code, value = codes
  308. if code == sre.LITERAL:
  309. # Regex 'a' (single char)
  310. c = to_char(value)
  311. if context.flags & re.IGNORECASE and \
  312. re.match(c, c.swapcase(), re.IGNORECASE) is not None:
  313. # We do the explicit check for swapped-case matching because
  314. # eg 'ß'.upper() == 'SS' and ignorecase doesn't match it.
  315. return st.sampled_from([c, c.swapcase()])
  316. return st.just(c)
  317. elif code == sre.NOT_LITERAL:
  318. # Regex '[^a]' (negation of a single char)
  319. c = to_char(value)
  320. blacklist = set(c)
  321. if context.flags & re.IGNORECASE and \
  322. re.match(c, c.swapcase(), re.IGNORECASE) is not None:
  323. blacklist |= set(c.swapcase())
  324. if is_unicode:
  325. return st.characters(blacklist_characters=blacklist)
  326. else:
  327. return binary_char.filter(lambda c: c not in blacklist)
  328. elif code == sre.IN:
  329. # Regex '[abc0-9]' (set of characters)
  330. negate = value[0][0] == sre.NEGATE
  331. if is_unicode:
  332. builder = CharactersBuilder(negate, context.flags)
  333. else:
  334. builder = BytesBuilder(negate, context.flags)
  335. for charset_code, charset_value in value:
  336. if charset_code == sre.NEGATE:
  337. # Regex '[^...]' (negation)
  338. # handled by builder = CharactersBuilder(...) above
  339. pass
  340. elif charset_code == sre.LITERAL:
  341. # Regex '[a]' (single char)
  342. builder.add_char(charset_value)
  343. elif charset_code == sre.RANGE:
  344. # Regex '[a-z]' (char range)
  345. low, high = charset_value
  346. for char_code in hrange(low, high + 1):
  347. builder.add_char(char_code)
  348. elif charset_code == sre.CATEGORY:
  349. # Regex '[\w]' (char category)
  350. builder.add_category(charset_value)
  351. else: # pragma: no cover
  352. # Currently there are no known code points other than
  353. # handled here. This code is just future proofing
  354. raise AssertionError('Unknown charset code: %s'
  355. % charset_code)
  356. return builder.strategy
  357. elif code == sre.ANY:
  358. # Regex '.' (any char)
  359. if is_unicode:
  360. if context.flags & re.DOTALL:
  361. return st.characters()
  362. return st.characters(blacklist_characters=u'\n')
  363. else:
  364. if context.flags & re.DOTALL:
  365. return binary_char
  366. return binary_char.filter(lambda c: c != b'\n')
  367. elif code == sre.AT:
  368. # Regexes like '^...', '...$', '\bfoo', '\Bfoo'
  369. # An empty string (or newline) will match the token itself, but
  370. # we don't and can't check the position (eg '%' at the end)
  371. return st.just(empty)
  372. elif code == sre.SUBPATTERN:
  373. # Various groups: '(...)', '(:...)' or '(?P<name>...)'
  374. old_flags = context.flags
  375. if HAS_SUBPATTERN_FLAGS: # pragma: no cover
  376. # This feature is available only in specific Python versions
  377. context.flags = (context.flags | value[1]) & ~value[2]
  378. strat = _strategy(value[-1], context, is_unicode)
  379. context.flags = old_flags
  380. if value[0]:
  381. strat = update_group(value[0], strat)
  382. return strat
  383. elif code == sre.GROUPREF:
  384. # Regex '\\1' or '(?P=name)' (group reference)
  385. return reuse_group(value)
  386. elif code == sre.ASSERT:
  387. # Regex '(?=...)' or '(?<=...)' (positive lookahead/lookbehind)
  388. return recurse(value[1])
  389. elif code == sre.ASSERT_NOT:
  390. # Regex '(?!...)' or '(?<!...)' (negative lookahead/lookbehind)
  391. return st.just(empty)
  392. elif code == sre.BRANCH:
  393. # Regex 'a|b|c' (branch)
  394. return st.one_of([recurse(branch) for branch in value[1]])
  395. elif code in [sre.MIN_REPEAT, sre.MAX_REPEAT]:
  396. # Regexes 'a?', 'a*', 'a+' and their non-greedy variants
  397. # (repeaters)
  398. at_least, at_most, subregex = value
  399. if at_most == sre.MAXREPEAT:
  400. at_most = None
  401. if at_least == 0 and at_most == 1:
  402. return st.just(empty) | recurse(subregex)
  403. return st.lists(recurse(subregex),
  404. min_size=at_least,
  405. max_size=at_most).map(empty.join)
  406. elif code == sre.GROUPREF_EXISTS:
  407. # Regex '(?(id/name)yes-pattern|no-pattern)'
  408. # (if group exists choice)
  409. return group_conditional(
  410. value[0],
  411. recurse(value[1]),
  412. recurse(value[2]) if value[2] else st.just(empty),
  413. )
  414. else: # pragma: no cover
  415. # Currently there are no known code points other than handled here.
  416. # This code is just future proofing
  417. raise AssertionError('Unknown code point: %s' % repr(code))