parse.py 47 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336
  1. r'''Parse strings using a specification based on the Python format() syntax.
  2. ``parse()`` is the opposite of ``format()``
  3. The module is set up to only export ``parse()``, ``search()``, ``findall()``,
  4. and ``with_pattern()`` when ``import \*`` is used:
  5. >>> from parse import *
  6. From there it's a simple thing to parse a string:
  7. >>> parse("It's {}, I love it!", "It's spam, I love it!")
  8. <Result ('spam',) {}>
  9. >>> _[0]
  10. 'spam'
  11. Or to search a string for some pattern:
  12. >>> search('Age: {:d}\n', 'Name: Rufus\nAge: 42\nColor: red\n')
  13. <Result (42,) {}>
  14. Or find all the occurrences of some pattern in a string:
  15. >>> ''.join(r.fixed[0] for r in findall(">{}<", "<p>the <b>bold</b> text</p>"))
  16. 'the bold text'
  17. If you're going to use the same pattern to match lots of strings you can
  18. compile it once:
  19. >>> from parse import compile
  20. >>> p = compile("It's {}, I love it!")
  21. >>> print(p)
  22. <Parser "It's {}, I love it!">
  23. >>> p.parse("It's spam, I love it!")
  24. <Result ('spam',) {}>
  25. ("compile" is not exported for ``import *`` usage as it would override the
  26. built-in ``compile()`` function)
  27. The default behaviour is to match strings case insensitively. You may match with
  28. case by specifying `case_sensitive=True`:
  29. >>> parse('SPAM', 'spam', case_sensitive=True) is None
  30. True
  31. Format Syntax
  32. -------------
  33. A basic version of the `Format String Syntax`_ is supported with anonymous
  34. (fixed-position), named and formatted fields::
  35. {[field name]:[format spec]}
  36. Field names must be valid Python identifiers, including dotted names;
  37. element indexes imply dictionaries (see below for example).
  38. Numbered fields are also not supported: the result of parsing will include
  39. the parsed fields in the order they are parsed.
  40. The conversion of fields to types other than strings is done based on the
  41. type in the format specification, which mirrors the ``format()`` behaviour.
  42. There are no "!" field conversions like ``format()`` has.
  43. Some simple parse() format string examples:
  44. >>> parse("Bring me a {}", "Bring me a shrubbery")
  45. <Result ('shrubbery',) {}>
  46. >>> r = parse("The {} who say {}", "The knights who say Ni!")
  47. >>> print(r)
  48. <Result ('knights', 'Ni!') {}>
  49. >>> print(r.fixed)
  50. ('knights', 'Ni!')
  51. >>> r = parse("Bring out the holy {item}", "Bring out the holy hand grenade")
  52. >>> print(r)
  53. <Result () {'item': 'hand grenade'}>
  54. >>> print(r.named)
  55. {'item': 'hand grenade'}
  56. >>> print(r['item'])
  57. hand grenade
  58. >>> 'item' in r
  59. True
  60. Note that `in` only works if you have named fields. Dotted names and indexes
  61. are possible though the application must make additional sense of the result:
  62. >>> r = parse("Mmm, {food.type}, I love it!", "Mmm, spam, I love it!")
  63. >>> print(r)
  64. <Result () {'food.type': 'spam'}>
  65. >>> print(r.named)
  66. {'food.type': 'spam'}
  67. >>> print(r['food.type'])
  68. spam
  69. >>> r = parse("My quest is {quest[name]}", "My quest is to seek the holy grail!")
  70. >>> print(r)
  71. <Result () {'quest': {'name': 'to seek the holy grail!'}}>
  72. >>> print(r['quest'])
  73. {'name': 'to seek the holy grail!'}
  74. >>> print(r['quest']['name'])
  75. to seek the holy grail!
  76. If the text you're matching has braces in it you can match those by including
  77. a double-brace ``{{`` or ``}}`` in your format string, just like format() does.
  78. Format Specification
  79. --------------------
  80. Most often a straight format-less ``{}`` will suffice where a more complex
  81. format specification might have been used.
  82. Most of `format()`'s `Format Specification Mini-Language`_ is supported:
  83. [[fill]align][0][width][.precision][type]
  84. The differences between `parse()` and `format()` are:
  85. - The align operators will cause spaces (or specified fill character) to be
  86. stripped from the parsed value. The width is not enforced; it just indicates
  87. there may be whitespace or "0"s to strip.
  88. - Numeric parsing will automatically handle a "0b", "0o" or "0x" prefix.
  89. That is, the "#" format character is handled automatically by d, b, o
  90. and x formats. For "d" any will be accepted, but for the others the correct
  91. prefix must be present if at all.
  92. - Numeric sign is handled automatically.
  93. - The thousands separator is handled automatically if the "n" type is used.
  94. - The types supported are a slightly different mix to the format() types. Some
  95. format() types come directly over: "d", "n", "%", "f", "e", "b", "o" and "x".
  96. In addition some regular expression character group types "D", "w", "W", "s"
  97. and "S" are also available.
  98. - The "e" and "g" types are case-insensitive so there is no need for
  99. the "E" or "G" types.
  100. ===== =========================================== ========
  101. Type Characters Matched Output
  102. ===== =========================================== ========
  103. l Letters (ASCII) str
  104. w Letters, numbers and underscore str
  105. W Not letters, numbers and underscore str
  106. s Whitespace str
  107. S Non-whitespace str
  108. d Digits (effectively integer numbers) int
  109. D Non-digit str
  110. n Numbers with thousands separators (, or .) int
  111. % Percentage (converted to value/100.0) float
  112. f Fixed-point numbers float
  113. F Decimal numbers Decimal
  114. e Floating-point numbers with exponent float
  115. e.g. 1.1e-10, NAN (all case insensitive)
  116. g General number format (either d, f or e) float
  117. b Binary numbers int
  118. o Octal numbers int
  119. x Hexadecimal numbers (lower and upper case) int
  120. ti ISO 8601 format date/time datetime
  121. e.g. 1972-01-20T10:21:36Z ("T" and "Z"
  122. optional)
  123. te RFC2822 e-mail format date/time datetime
  124. e.g. Mon, 20 Jan 1972 10:21:36 +1000
  125. tg Global (day/month) format date/time datetime
  126. e.g. 20/1/1972 10:21:36 AM +1:00
  127. ta US (month/day) format date/time datetime
  128. e.g. 1/20/1972 10:21:36 PM +10:30
  129. tc ctime() format date/time datetime
  130. e.g. Sun Sep 16 01:03:52 1973
  131. th HTTP log format date/time datetime
  132. e.g. 21/Nov/2011:00:07:11 +0000
  133. ts Linux system log format date/time datetime
  134. e.g. Nov 9 03:37:44
  135. tt Time time
  136. e.g. 10:21:36 PM -5:30
  137. ===== =========================================== ========
  138. Some examples of typed parsing with ``None`` returned if the typing
  139. does not match:
  140. >>> parse('Our {:d} {:w} are...', 'Our 3 weapons are...')
  141. <Result (3, 'weapons') {}>
  142. >>> parse('Our {:d} {:w} are...', 'Our three weapons are...')
  143. >>> parse('Meet at {:tg}', 'Meet at 1/2/2011 11:00 PM')
  144. <Result (datetime.datetime(2011, 2, 1, 23, 0),) {}>
  145. And messing about with alignment:
  146. >>> parse('with {:>} herring', 'with a herring')
  147. <Result ('a',) {}>
  148. >>> parse('spam {:^} spam', 'spam lovely spam')
  149. <Result ('lovely',) {}>
  150. Note that the "center" alignment does not test to make sure the value is
  151. centered - it just strips leading and trailing whitespace.
  152. Width and precision may be used to restrict the size of matched text
  153. from the input. Width specifies a minimum size and precision specifies
  154. a maximum. For example:
  155. >>> parse('{:.2}{:.2}', 'look') # specifying precision
  156. <Result ('lo', 'ok') {}>
  157. >>> parse('{:4}{:4}', 'look at that') # specifying width
  158. <Result ('look', 'at that') {}>
  159. >>> parse('{:4}{:.4}', 'look at that') # specifying both
  160. <Result ('look at ', 'that') {}>
  161. >>> parse('{:2d}{:2d}', '0440') # parsing two contiguous numbers
  162. <Result (4, 40) {}>
  163. Some notes for the date and time types:
  164. - the presence of the time part is optional (including ISO 8601, starting
  165. at the "T"). A full datetime object will always be returned; the time
  166. will be set to 00:00:00. You may also specify a time without seconds.
  167. - when a seconds amount is present in the input fractions will be parsed
  168. to give microseconds.
  169. - except in ISO 8601 the day and month digits may be 0-padded.
  170. - the date separator for the tg and ta formats may be "-" or "/".
  171. - named months (abbreviations or full names) may be used in the ta and tg
  172. formats in place of numeric months.
  173. - as per RFC 2822 the e-mail format may omit the day (and comma), and the
  174. seconds but nothing else.
  175. - hours greater than 12 will be happily accepted.
  176. - the AM/PM are optional, and if PM is found then 12 hours will be added
  177. to the datetime object's hours amount - even if the hour is greater
  178. than 12 (for consistency.)
  179. - in ISO 8601 the "Z" (UTC) timezone part may be a numeric offset
  180. - timezones are specified as "+HH:MM" or "-HH:MM". The hour may be one or two
  181. digits (0-padded is OK.) Also, the ":" is optional.
  182. - the timezone is optional in all except the e-mail format (it defaults to
  183. UTC.)
  184. - named timezones are not handled yet.
  185. Note: attempting to match too many datetime fields in a single parse() will
  186. currently result in a resource allocation issue. A TooManyFields exception
  187. will be raised in this instance. The current limit is about 15. It is hoped
  188. that this limit will be removed one day.
  189. .. _`Format String Syntax`:
  190. http://docs.python.org/library/string.html#format-string-syntax
  191. .. _`Format Specification Mini-Language`:
  192. http://docs.python.org/library/string.html#format-specification-mini-language
  193. Result and Match Objects
  194. ------------------------
  195. The result of a ``parse()`` and ``search()`` operation is either ``None`` (no match), a
  196. ``Result`` instance or a ``Match`` instance if ``evaluate_result`` is False.
  197. The ``Result`` instance has three attributes:
  198. fixed
  199. A tuple of the fixed-position, anonymous fields extracted from the input.
  200. named
  201. A dictionary of the named fields extracted from the input.
  202. spans
  203. A dictionary mapping the names and fixed position indices matched to a
  204. 2-tuple slice range of where the match occurred in the input.
  205. The span does not include any stripped padding (alignment or width).
  206. The ``Match`` instance has one method:
  207. evaluate_result()
  208. Generates and returns a ``Result`` instance for this ``Match`` object.
  209. Custom Type Conversions
  210. -----------------------
  211. If you wish to have matched fields automatically converted to your own type you
  212. may pass in a dictionary of type conversion information to ``parse()`` and
  213. ``compile()``.
  214. The converter will be passed the field string matched. Whatever it returns
  215. will be substituted in the ``Result`` instance for that field.
  216. Your custom type conversions may override the builtin types if you supply one
  217. with the same identifier.
  218. >>> def shouty(string):
  219. ... return string.upper()
  220. ...
  221. >>> parse('{:shouty} world', 'hello world', dict(shouty=shouty))
  222. <Result ('HELLO',) {}>
  223. If the type converter has the optional ``pattern`` attribute, it is used as
  224. regular expression for better pattern matching (instead of the default one).
  225. >>> def parse_number(text):
  226. ... return int(text)
  227. >>> parse_number.pattern = r'\d+'
  228. >>> parse('Answer: {number:Number}', 'Answer: 42', dict(Number=parse_number))
  229. <Result () {'number': 42}>
  230. >>> _ = parse('Answer: {:Number}', 'Answer: Alice', dict(Number=parse_number))
  231. >>> assert _ is None, "MISMATCH"
  232. You can also use the ``with_pattern(pattern)`` decorator to add this
  233. information to a type converter function:
  234. >>> from parse import with_pattern
  235. >>> @with_pattern(r'\d+')
  236. ... def parse_number(text):
  237. ... return int(text)
  238. >>> parse('Answer: {number:Number}', 'Answer: 42', dict(Number=parse_number))
  239. <Result () {'number': 42}>
  240. A more complete example of a custom type might be:
  241. >>> yesno_mapping = {
  242. ... "yes": True, "no": False,
  243. ... "on": True, "off": False,
  244. ... "true": True, "false": False,
  245. ... }
  246. >>> @with_pattern(r"|".join(yesno_mapping))
  247. ... def parse_yesno(text):
  248. ... return yesno_mapping[text.lower()]
  249. If the type converter ``pattern`` uses regex-grouping (with parenthesis),
  250. you should indicate this by using the optional ``regex_group_count`` parameter
  251. in the ``with_pattern()`` decorator:
  252. >>> @with_pattern(r'((\d+))', regex_group_count=2)
  253. ... def parse_number2(text):
  254. ... return int(text)
  255. >>> parse('Answer: {:Number2} {:Number2}', 'Answer: 42 43', dict(Number2=parse_number2))
  256. <Result (42, 43) {}>
  257. Otherwise, this may cause parsing problems with unnamed/fixed parameters.
  258. Potential Gotchas
  259. -----------------
  260. `parse()` will always match the shortest text necessary (from left to right)
  261. to fulfil the parse pattern, so for example:
  262. >>> pattern = '{dir1}/{dir2}'
  263. >>> data = 'root/parent/subdir'
  264. >>> sorted(parse(pattern, data).named.items())
  265. [('dir1', 'root'), ('dir2', 'parent/subdir')]
  266. So, even though `{'dir1': 'root/parent', 'dir2': 'subdir'}` would also fit
  267. the pattern, the actual match represents the shortest successful match for
  268. `dir1`.
  269. ----
  270. **Version history (in brief)**:
  271. - 1.12.0 Do not assume closing brace when an opening one is found (thanks @mattsep)
  272. - 1.11.1 Revert having unicode char in docstring, it breaks Bamboo builds(?!)
  273. - 1.11.0 Implement `__contains__` for Result instances.
  274. - 1.10.0 Introduce a "letters" matcher, since "w" matches numbers
  275. also.
  276. - 1.9.1 Fix deprecation warnings around backslashes in regex strings
  277. (thanks Mickael Schoentgen). Also fix some documentation formatting
  278. issues.
  279. - 1.9.0 We now honor precision and width specifiers when parsing numbers
  280. and strings, allowing parsing of concatenated elements of fixed width
  281. (thanks Julia Signell)
  282. - 1.8.4 Add LICENSE file at request of packagers.
  283. Correct handling of AM/PM to follow most common interpretation.
  284. Correct parsing of hexadecimal that looks like a binary prefix.
  285. Add ability to parse case sensitively.
  286. Add parsing of numbers to Decimal with "F" (thanks John Vandenberg)
  287. - 1.8.3 Add regex_group_count to with_pattern() decorator to support
  288. user-defined types that contain brackets/parenthesis (thanks Jens Engel)
  289. - 1.8.2 add documentation for including braces in format string
  290. - 1.8.1 ensure bare hexadecimal digits are not matched
  291. - 1.8.0 support manual control over result evaluation (thanks Timo Furrer)
  292. - 1.7.0 parse dict fields (thanks Mark Visser) and adapted to allow
  293. more than 100 re groups in Python 3.5+ (thanks David King)
  294. - 1.6.6 parse Linux system log dates (thanks Alex Cowan)
  295. - 1.6.5 handle precision in float format (thanks Levi Kilcher)
  296. - 1.6.4 handle pipe "|" characters in parse string (thanks Martijn Pieters)
  297. - 1.6.3 handle repeated instances of named fields, fix bug in PM time
  298. overflow
  299. - 1.6.2 fix logging to use local, not root logger (thanks Necku)
  300. - 1.6.1 be more flexible regarding matched ISO datetimes and timezones in
  301. general, fix bug in timezones without ":" and improve docs
  302. - 1.6.0 add support for optional ``pattern`` attribute in user-defined types
  303. (thanks Jens Engel)
  304. - 1.5.3 fix handling of question marks
  305. - 1.5.2 fix type conversion error with dotted names (thanks Sebastian Thiel)
  306. - 1.5.1 implement handling of named datetime fields
  307. - 1.5 add handling of dotted field names (thanks Sebastian Thiel)
  308. - 1.4.1 fix parsing of "0" in int conversion (thanks James Rowe)
  309. - 1.4 add __getitem__ convenience access on Result.
  310. - 1.3.3 fix Python 2.5 setup.py issue.
  311. - 1.3.2 fix Python 3.2 setup.py issue.
  312. - 1.3.1 fix a couple of Python 3.2 compatibility issues.
  313. - 1.3 added search() and findall(); removed compile() from ``import *``
  314. export as it overwrites builtin.
  315. - 1.2 added ability for custom and override type conversions to be
  316. provided; some cleanup
  317. - 1.1.9 to keep things simpler number sign is handled automatically;
  318. significant robustification in the face of edge-case input.
  319. - 1.1.8 allow "d" fields to have number base "0x" etc. prefixes;
  320. fix up some field type interactions after stress-testing the parser;
  321. implement "%" type.
  322. - 1.1.7 Python 3 compatibility tweaks (2.5 to 2.7 and 3.2 are supported).
  323. - 1.1.6 add "e" and "g" field types; removed redundant "h" and "X";
  324. removed need for explicit "#".
  325. - 1.1.5 accept textual dates in more places; Result now holds match span
  326. positions.
  327. - 1.1.4 fixes to some int type conversion; implemented "=" alignment; added
  328. date/time parsing with a variety of formats handled.
  329. - 1.1.3 type conversion is automatic based on specified field types. Also added
  330. "f" and "n" types.
  331. - 1.1.2 refactored, added compile() and limited ``from parse import *``
  332. - 1.1.1 documentation improvements
  333. - 1.1.0 implemented more of the `Format Specification Mini-Language`_
  334. and removed the restriction on mixing fixed-position and named fields
  335. - 1.0.0 initial release
  336. This code is copyright 2012-2019 Richard Jones <richard@python.org>
  337. See the end of the source file for the license of use.
  338. '''
  339. from __future__ import absolute_import
  340. __version__ = '1.12.0'
  341. # yes, I now have two problems
  342. import re
  343. import sys
  344. from datetime import datetime, time, tzinfo, timedelta
  345. from decimal import Decimal
  346. from functools import partial
  347. import logging
# Names exported by ``from parse import *``. compile() is deliberately left
# out so that a star-import does not shadow the builtin compile().
__all__ = 'parse search findall with_pattern'.split()

# Module-level logger, named after this module.
log = logging.getLogger(__name__)
  350. def with_pattern(pattern, regex_group_count=None):
  351. r"""Attach a regular expression pattern matcher to a custom type converter
  352. function.
  353. This annotates the type converter with the :attr:`pattern` attribute.
  354. EXAMPLE:
  355. >>> import parse
  356. >>> @parse.with_pattern(r"\d+")
  357. ... def parse_number(text):
  358. ... return int(text)
  359. is equivalent to:
  360. >>> def parse_number(text):
  361. ... return int(text)
  362. >>> parse_number.pattern = r"\d+"
  363. :param pattern: regular expression pattern (as text)
  364. :param regex_group_count: Indicates how many regex-groups are in pattern.
  365. :return: wrapped function
  366. """
  367. def decorator(func):
  368. func.pattern = pattern
  369. func.regex_group_count = regex_group_count
  370. return func
  371. return decorator
  372. def int_convert(base):
  373. '''Convert a string to an integer.
  374. The string may start with a sign.
  375. It may be of a base other than 10.
  376. If may start with a base indicator, 0#nnnn, which we assume should
  377. override the specified base.
  378. It may also have other non-numeric characters that we can ignore.
  379. '''
  380. CHARS = '0123456789abcdefghijklmnopqrstuvwxyz'
  381. def f(string, match, base=base):
  382. if string[0] == '-':
  383. sign = -1
  384. else:
  385. sign = 1
  386. if string[0] == '0' and len(string) > 2:
  387. if string[1] in 'bB':
  388. base = 2
  389. elif string[1] in 'oO':
  390. base = 8
  391. elif string[1] in 'xX':
  392. base = 16
  393. else:
  394. # just go with the base specifed
  395. pass
  396. chars = CHARS[:base]
  397. string = re.sub('[^%s]' % chars, '', string.lower())
  398. return sign * int(string, base)
  399. return f
  400. def percentage(string, match):
  401. return float(string[:-1]) / 100.
  402. class FixedTzOffset(tzinfo):
  403. """Fixed offset in minutes east from UTC.
  404. """
  405. ZERO = timedelta(0)
  406. def __init__(self, offset, name):
  407. self._offset = timedelta(minutes=offset)
  408. self._name = name
  409. def __repr__(self):
  410. return '<%s %s %s>' % (self.__class__.__name__, self._name,
  411. self._offset)
  412. def utcoffset(self, dt):
  413. return self._offset
  414. def tzname(self, dt):
  415. return self._name
  416. def dst(self, dt):
  417. return self.ZERO
  418. def __eq__(self, other):
  419. return self._name == other._name and self._offset == other._offset
  420. MONTHS_MAP = dict(
  421. Jan=1, January=1,
  422. Feb=2, February=2,
  423. Mar=3, March=3,
  424. Apr=4, April=4,
  425. May=5,
  426. Jun=6, June=6,
  427. Jul=7, July=7,
  428. Aug=8, August=8,
  429. Sep=9, September=9,
  430. Oct=10, October=10,
  431. Nov=11, November=11,
  432. Dec=12, December=12
  433. )
  434. DAYS_PAT = r'(Mon|Tue|Wed|Thu|Fri|Sat|Sun)'
  435. MONTHS_PAT = r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'
  436. ALL_MONTHS_PAT = r'(%s)' % '|'.join(MONTHS_MAP)
  437. TIME_PAT = r'(\d{1,2}:\d{1,2}(:\d{1,2}(\.\d+)?)?)'
  438. AM_PAT = r'(\s+[AP]M)'
  439. TZ_PAT = r'(\s+[-+]\d\d?:?\d\d)'
  440. def date_convert(string, match, ymd=None, mdy=None, dmy=None,
  441. d_m_y=None, hms=None, am=None, tz=None, mm=None, dd=None):
  442. '''Convert the incoming string containing some date / time info into a
  443. datetime instance.
  444. '''
  445. groups = match.groups()
  446. time_only = False
  447. if mm and dd:
  448. y=datetime.today().year
  449. m=groups[mm]
  450. d=groups[dd]
  451. elif ymd is not None:
  452. y, m, d = re.split(r'[-/\s]', groups[ymd])
  453. elif mdy is not None:
  454. m, d, y = re.split(r'[-/\s]', groups[mdy])
  455. elif dmy is not None:
  456. d, m, y = re.split(r'[-/\s]', groups[dmy])
  457. elif d_m_y is not None:
  458. d, m, y = d_m_y
  459. d = groups[d]
  460. m = groups[m]
  461. y = groups[y]
  462. else:
  463. time_only = True
  464. H = M = S = u = 0
  465. if hms is not None and groups[hms]:
  466. t = groups[hms].split(':')
  467. if len(t) == 2:
  468. H, M = t
  469. else:
  470. H, M, S = t
  471. if '.' in S:
  472. S, u = S.split('.')
  473. u = int(float('.' + u) * 1000000)
  474. S = int(S)
  475. H = int(H)
  476. M = int(M)
  477. if am is not None:
  478. am = groups[am]
  479. if am:
  480. am = am.strip()
  481. if am == 'AM' and H == 12:
  482. # correction for "12" hour functioning as "0" hour: 12:15 AM = 00:15 by 24 hr clock
  483. H -= 12
  484. elif am == 'PM' and H == 12:
  485. # no correction needed: 12PM is midday, 12:00 by 24 hour clock
  486. pass
  487. elif am == 'PM':
  488. H += 12
  489. if tz is not None:
  490. tz = groups[tz]
  491. if tz == 'Z':
  492. tz = FixedTzOffset(0, 'UTC')
  493. elif tz:
  494. tz = tz.strip()
  495. if tz.isupper():
  496. # TODO use the awesome python TZ module?
  497. pass
  498. else:
  499. sign = tz[0]
  500. if ':' in tz:
  501. tzh, tzm = tz[1:].split(':')
  502. elif len(tz) == 4: # 'snnn'
  503. tzh, tzm = tz[1], tz[2:4]
  504. else:
  505. tzh, tzm = tz[1:3], tz[3:5]
  506. offset = int(tzm) + int(tzh) * 60
  507. if sign == '-':
  508. offset = -offset
  509. tz = FixedTzOffset(offset, tz)
  510. if time_only:
  511. d = time(H, M, S, u, tzinfo=tz)
  512. else:
  513. y = int(y)
  514. if m.isdigit():
  515. m = int(m)
  516. else:
  517. m = MONTHS_MAP[m]
  518. d = int(d)
  519. d = datetime(y, m, d, H, M, S, u, tzinfo=tz)
  520. return d
  521. class TooManyFields(ValueError):
  522. pass
  523. class RepeatedNameError(ValueError):
  524. pass
# Matches any single character that is special in a regular expression so it
# can be escaped before literal text is placed into a generated pattern.
# note: {} are handled separately
# NOTE(review): an older note here said r'' was avoided (Sublime Text 2
# highlighting), but the pattern below IS a raw string. The doubled "\\\\"
# therefore lists the backslash twice inside the character class, which is
# redundant but harmless.
REGEX_SAFETY = re.compile(r'([?\\\\.[\]()*+\^$!\|])')

# allowed field types: the single-character types plus the two-character
# date/time types made of "t" followed by one of "ieahgcts"
ALLOWED_TYPES = set(list('nbox%fFegwWdDsSl') +
    ['t' + c for c in 'ieahgcts'])
  531. def extract_format(format, extra_types):
  532. '''Pull apart the format [[fill]align][0][width][.precision][type]
  533. '''
  534. fill = align = None
  535. if format[0] in '<>=^':
  536. align = format[0]
  537. format = format[1:]
  538. elif len(format) > 1 and format[1] in '<>=^':
  539. fill = format[0]
  540. align = format[1]
  541. format = format[2:]
  542. zero = False
  543. if format and format[0] == '0':
  544. zero = True
  545. format = format[1:]
  546. width = ''
  547. while format:
  548. if not format[0].isdigit():
  549. break
  550. width += format[0]
  551. format = format[1:]
  552. if format.startswith('.'):
  553. # Precision isn't needed but we need to capture it so that
  554. # the ValueError isn't raised.
  555. format = format[1:] # drop the '.'
  556. precision = ''
  557. while format:
  558. if not format[0].isdigit():
  559. break
  560. precision += format[0]
  561. format = format[1:]
  562. # the rest is the type, if present
  563. type = format
  564. if type and type not in ALLOWED_TYPES and type not in extra_types:
  565. raise ValueError('format spec %r not recognised' % type)
  566. return locals()
# Matches one format-string token: an escaped brace pair ("{{" or "}}") or a
# "{...}" field made of an optional name (word characters, optionally
# followed by ".attr" and/or "[index]" parts) and an optional ":spec" part.
PARSE_RE = re.compile(r"""({{|}}|{\w*(?:(?:\.\w+)|(?:\[[^\]]+\]))*(?::[^}]+)?})""")
  568. class Parser(object):
  569. '''Encapsulate a format string that may be used to parse other strings.
  570. '''
  571. def __init__(self, format, extra_types=None, case_sensitive=False):
  572. # a mapping of a name as in {hello.world} to a regex-group compatible
  573. # name, like hello__world Its used to prevent the transformation of
  574. # name-to-group and group to name to fail subtly, such as in:
  575. # hello_.world-> hello___world->hello._world
  576. self._group_to_name_map = {}
  577. # also store the original field name to group name mapping to allow
  578. # multiple instances of a name in the format string
  579. self._name_to_group_map = {}
  580. # and to sanity check the repeated instances store away the first
  581. # field type specification for the named field
  582. self._name_types = {}
  583. self._format = format
  584. if extra_types is None:
  585. extra_types = {}
  586. self._extra_types = extra_types
  587. if case_sensitive:
  588. self._re_flags = re.DOTALL
  589. else:
  590. self._re_flags = re.IGNORECASE | re.DOTALL
  591. self._fixed_fields = []
  592. self._named_fields = []
  593. self._group_index = 0
  594. self._type_conversions = {}
  595. self._expression = self._generate_expression()
  596. self.__search_re = None
  597. self.__match_re = None
  598. log.debug('format %r -> %r', format, self._expression)
  599. def __repr__(self):
  600. if len(self._format) > 20:
  601. return '<%s %r>' % (self.__class__.__name__,
  602. self._format[:17] + '...')
  603. return '<%s %r>' % (self.__class__.__name__, self._format)
  604. @property
  605. def _search_re(self):
  606. if self.__search_re is None:
  607. try:
  608. self.__search_re = re.compile(self._expression, self._re_flags)
  609. except AssertionError:
  610. # access error through sys to keep py3k and backward compat
  611. e = str(sys.exc_info()[1])
  612. if e.endswith('this version only supports 100 named groups'):
  613. raise TooManyFields('sorry, you are attempting to parse '
  614. 'too many complex fields')
  615. return self.__search_re
  @property
  def _match_re(self):
      '''Lazily compiled regex for matching a whole string.

      ``self._expression`` is wrapped in ``^`` and ``$``, compiled with
      ``self._re_flags`` on first access and cached for later accesses.

      Raises TooManyFields when compilation fails with the regex engine's
      "100 named groups" assertion message, and NotImplementedError when
      the regex engine rejects the expression with ``re.error``. Any other
      AssertionError raised during compilation is propagated unchanged.
      '''
      if self.__match_re is None:
          expression = r'^%s$' % self._expression
          try:
              self.__match_re = re.compile(expression, self._re_flags)
          except AssertionError:
              # access error through sys to keep py3k and backward compat
              e = str(sys.exc_info()[1])
              if e.endswith('this version only supports 100 named groups'):
                  raise TooManyFields('sorry, you are attempting to parse '
                      'too many complex fields')
              # an unrelated assertion failure must not be swallowed:
              # falling through would hand None back to the caller
              raise
          except re.error:
              raise NotImplementedError("Group names (e.g. (?P<name>) can "
                  "cause failure, as they are not escaped properly: '%s'" %
                  expression)
      return self.__match_re
  def parse(self, string, evaluate_result=True):
      '''Match my format against the whole of "string".

      Returns None when the string does not match. Otherwise returns the
      value of ``self.evaluate_result()`` for the match, or a Match
      instance wrapping the raw regex match when ``evaluate_result`` is
      false.
      '''
      matched = self._match_re.match(string)
      if matched is None:
          return None
      if not evaluate_result:
          return Match(self, matched)
      return self.evaluate_result(matched)
  def search(self, string, pos=0, endpos=None, evaluate_result=True):
      '''Search "string" for the first occurrence of my format.

      The search starts at character index "pos" and stops at "endpos"
      (the end of the string when "endpos" is None) - equivalent to
      search(string[:endpos]).

      Returns None when nothing is found. Otherwise returns the value of
      ``self.evaluate_result()`` for the match, or a Match instance
      wrapping the raw regex match when ``evaluate_result`` is false.
      '''
      stop = len(string) if endpos is None else endpos
      found = self._search_re.search(string, pos, stop)
      if found is None:
          return None
      if not evaluate_result:
          return Match(self, found)
      return self.evaluate_result(found)
  def findall(self, string, pos=0, endpos=None, extra_types=None, evaluate_result=True):
      '''Search "string" for all occurrences of my format.

      The search starts at character index "pos" and stops at "endpos"
      (the end of the string when "endpos" is None) - equivalent to
      search(string[:endpos]).

      The "extra_types" argument is accepted but not used by this method.

      Returns a ResultIterator built with the given ``evaluate_result``
      flag.
      '''
      limit = endpos if endpos is not None else len(string)
      return ResultIterator(self, string, pos, limit,
          evaluate_result=evaluate_result)
  673. def _expand_named_fields(self, named_fields):
  674. result = {}
  675. for field, value in named_fields.items():
  676. # split 'aaa[bbb][ccc]...' into 'aaa' and '[bbb][ccc]...'
  677. basename, subkeys = re.match(r'([^\[]+)(.*)', field).groups()
  678. # create nested dictionaries {'aaa': {'bbb': {'ccc': ...}}}
  679. d = result
  680. k = basename
  681. if subkeys:
  682. for subkey in re.findall(r'\[[^\]]+\]', subkeys):
  683. d = d.setdefault(k,{})
  684. k = subkey[1:-1]
  685. # assign the value to the last key
  686. d[k] = value
  687. return result
  def evaluate_result(self, m):
      '''Generate a Result instance for the given regex match object'''
      converters = self._type_conversions

      # positional groups: apply any converter in place, then keep only the
      # groups that belong to fixed fields
      values = list(m.groups())
      for idx in self._fixed_fields:
          if idx in converters:
              values[idx] = converters[idx](values[idx], m)
      fixed = tuple(values[idx] for idx in self._fixed_fields)

      # named groups: results are keyed by the original field name, while the
      # regex group name is remembered for the span lookup below
      captured = m.groupdict()
      named = {}
      group_for = {}
      for group in self._named_fields:
          original = self._group_to_name_map[group]
          group_for[original] = group
          raw = captured[group]
          if group in converters:
              named[original] = converters[group](raw, m)
          else:
              named[original] = raw

      # spans: named fields by name, fixed fields by position
      spans = {}
      for original in named:
          spans[original] = m.span(group_for[original])
      for position, idx in enumerate(self._fixed_fields):
          spans[position] = m.span(idx + 1)

      return Result(fixed, self._expand_named_fields(named), spans)
  714. def _regex_replace(self, match):
  715. return '\\' + match.group(1)
  def _generate_expression(self):
      '''Build the regex source for ``self._format`` and return it.

      The format is split with PARSE_RE. ``{{`` and ``}}`` become escaped
      literal braces, braces-delimited fields go through
      ``self._handle_field`` and any other text is passed through
      REGEX_SAFETY with ``self._regex_replace``.
      '''
      pieces = []
      for token in PARSE_RE.split(self._format):
          if not token:
              continue
          if token == '{{':
              piece = r'\{'
          elif token == '}}':
              piece = r'\}'
          elif token.startswith('{') and token.endswith('}'):
              # this will be a braces-delimited field to handle
              piece = self._handle_field(token)
          else:
              # just some text to match
              piece = REGEX_SAFETY.sub(self._regex_replace, token)
          pieces.append(piece)
      return ''.join(pieces)
  733. def _to_group_name(self, field):
  734. # return a version of field which can be used as capture group, even
  735. # though it might contain '.'
  736. group = field.replace('.', '_').replace('[', '_').replace(']', '_')
  737. # make sure we don't collide ("a.b" colliding with "a_b")
  738. n = 1
  739. while group in self._group_to_name_map:
  740. n += 1
  741. if '.' in field:
  742. group = field.replace('.', '_' * n)
  743. elif '_' in field:
  744. group = field.replace('_', '_' * n)
  745. else:
  746. raise KeyError('duplicated group name %r' % (field,))
  747. # save off the mapping
  748. self._group_to_name_map[group] = field
  749. self._name_to_group_map[field] = group
  750. return group
  def _handle_field(self, field):
      '''Convert one ``{...}`` field of the format into a regex fragment.

      "field" is the field text including its braces. The field is recorded
      in ``self._named_fields`` or ``self._fixed_fields``, any converter for
      its type is stored in ``self._type_conversions`` and
      ``self._group_index`` is advanced.

      A repeated use of an already seen field name yields a backreference
      to the first occurrence; RepeatedNameError is raised if its type
      specification differs from the one seen first.

      Returns the regex source for the field, including padding for any
      alignment that was specified.
      '''
      # first: lose the braces
      field = field[1:-1]

      # now figure whether this is an anonymous or named field, and whether
      # there's any format specification
      format = ''
      if field and field[0].isalpha():
          if ':' in field:
              # split on the first ':' only - the format specification may
              # itself contain ':' (for example as the fill character)
              name, format = field.split(':', 1)
          else:
              name = field
          if name in self._name_to_group_map:
              if self._name_types[name] != format:
                  raise RepeatedNameError('field type %r for field "%s" '
                      'does not match previous seen type %r' % (format,
                      name, self._name_types[name]))
              group = self._name_to_group_map[name]
              # match previously-seen value
              return r'(?P=%s)' % group
          else:
              group = self._to_group_name(name)
              self._name_types[name] = format
          self._named_fields.append(group)
          # this will become a group, which must not contain dots
          wrap = r'(?P<%s>%%s)' % group
      else:
          self._fixed_fields.append(self._group_index)
          wrap = r'(%s)'
          if ':' in field:
              format = field[1:]
          group = self._group_index

      # simplest case: no type specifier ({} or {name})
      if not format:
          self._group_index += 1
          return wrap % r'.+?'

      # decode the format specification
      format = extract_format(format, self._extra_types)

      # figure type conversions, if any
      type = format['type']
      # NOTE(review): 'h' is not a type handled below while 'x' is missing
      # from this list - confirm whether 'x' was intended
      is_numeric = type and type in 'n%fegdobh'
      if type in self._extra_types:
          type_converter = self._extra_types[type]
          s = getattr(type_converter, 'pattern', r'.+?')
          regex_group_count = getattr(type_converter, 'regex_group_count', 0)
          if regex_group_count is None:
              regex_group_count = 0
          self._group_index += regex_group_count

          def f(string, m):
              return type_converter(string)
          self._type_conversions[group] = f
      elif type == 'n':
          s = r'\d{1,3}([,.]\d{3})*'
          self._group_index += 1
          self._type_conversions[group] = int_convert(10)
      elif type == 'b':
          s = r'(0[bB])?[01]+'
          self._type_conversions[group] = int_convert(2)
          self._group_index += 1
      elif type == 'o':
          s = r'(0[oO])?[0-7]+'
          self._type_conversions[group] = int_convert(8)
          self._group_index += 1
      elif type == 'x':
          s = r'(0[xX])?[0-9a-fA-F]+'
          self._type_conversions[group] = int_convert(16)
          self._group_index += 1
      elif type == '%':
          s = r'\d+(\.\d+)?%'
          self._group_index += 1
          self._type_conversions[group] = percentage
      elif type == 'f':
          s = r'\d+\.\d+'
          self._type_conversions[group] = lambda s, m: float(s)
      elif type == 'F':
          s = r'\d+\.\d+'
          self._type_conversions[group] = lambda s, m: Decimal(s)
      elif type == 'e':
          s = r'\d+\.\d+[eE][-+]?\d+|nan|NAN|[-+]?inf|[-+]?INF'
          self._type_conversions[group] = lambda s, m: float(s)
      elif type == 'g':
          s = r'\d+(\.\d+)?([eE][-+]?\d+)?|nan|NAN|[-+]?inf|[-+]?INF'
          self._group_index += 2
          self._type_conversions[group] = lambda s, m: float(s)
      elif type == 'd':
          if format.get('width'):
              width = r'{1,%s}' % int(format['width'])
          else:
              width = '+'
          s = r'\d{w}|0[xX][0-9a-fA-F]{w}|0[bB][01]{w}|0[oO][0-7]{w}'.format(w=width)
          self._type_conversions[group] = int_convert(10)
      elif type == 'ti':
          # for the date/time types the group numbers handed to date_convert
          # are offsets from n, the group index when the field starts
          s = r'(\d{4}-\d\d-\d\d)((\s+|T)%s)?(Z|\s*[-+]\d\d:?\d\d)?' % \
              TIME_PAT
          n = self._group_index
          self._type_conversions[group] = partial(date_convert, ymd=n + 1,
              hms=n + 4, tz=n + 7)
          self._group_index += 7
      elif type == 'tg':
          s = r'(\d{1,2}[-/](\d{1,2}|%s)[-/]\d{4})(\s+%s)?%s?%s?' % (
              ALL_MONTHS_PAT, TIME_PAT, AM_PAT, TZ_PAT)
          n = self._group_index
          self._type_conversions[group] = partial(date_convert, dmy=n + 1,
              hms=n + 5, am=n + 8, tz=n + 9)
          self._group_index += 9
      elif type == 'ta':
          s = r'((\d{1,2}|%s)[-/]\d{1,2}[-/]\d{4})(\s+%s)?%s?%s?' % (
              ALL_MONTHS_PAT, TIME_PAT, AM_PAT, TZ_PAT)
          n = self._group_index
          self._type_conversions[group] = partial(date_convert, mdy=n + 1,
              hms=n + 5, am=n + 8, tz=n + 9)
          self._group_index += 9
      elif type == 'te':
          # this will allow microseconds through if they're present, but meh
          s = r'(%s,\s+)?(\d{1,2}\s+%s\s+\d{4})\s+%s%s' % (DAYS_PAT,
              MONTHS_PAT, TIME_PAT, TZ_PAT)
          n = self._group_index
          self._type_conversions[group] = partial(date_convert, dmy=n + 3,
              hms=n + 5, tz=n + 8)
          self._group_index += 8
      elif type == 'th':
          # slight flexibility here from the stock Apache format
          s = r'(\d{1,2}[-/]%s[-/]\d{4}):%s%s' % (MONTHS_PAT, TIME_PAT,
              TZ_PAT)
          n = self._group_index
          self._type_conversions[group] = partial(date_convert, dmy=n + 1,
              hms=n + 3, tz=n + 6)
          self._group_index += 6
      elif type == 'tc':
          s = r'(%s)\s+%s\s+(\d{1,2})\s+%s\s+(\d{4})' % (
              DAYS_PAT, MONTHS_PAT, TIME_PAT)
          n = self._group_index
          self._type_conversions[group] = partial(date_convert,
              d_m_y=(n + 4, n + 3, n + 8), hms=n + 5)
          self._group_index += 8
      elif type == 'tt':
          s = r'%s?%s?%s?' % (TIME_PAT, AM_PAT, TZ_PAT)
          n = self._group_index
          self._type_conversions[group] = partial(date_convert, hms=n + 1,
              am=n + 4, tz=n + 5)
          self._group_index += 5
      elif type == 'ts':
          s = r'%s(\s+)(\d+)(\s+)(\d{1,2}:\d{1,2}:\d{1,2})?' % MONTHS_PAT
          n = self._group_index
          self._type_conversions[group] = partial(date_convert, mm=n+1, dd=n+3,
              hms=n + 5)
          self._group_index += 5
      elif type == 'l':
          s = r'[A-Za-z]+'
      elif type:
          s = r'\%s+' % type
      elif format.get('precision'):
          if format.get('width'):
              s = r'.{%s,%s}?' % (format['width'], format['precision'])
          else:
              s = r'.{1,%s}?' % format['precision']
      elif format.get('width'):
          s = r'.{%s,}?' % format['width']
      else:
          s = r'.+?'

      align = format['align']
      fill = format['fill']

      # handle some numeric-specific things like fill and sign
      if is_numeric:
          # prefix with something (align "=" trumps zero)
          if align == '=':
              # special case - align "=" acts like the zero above but with
              # configurable fill defaulting to "0"
              if not fill:
                  fill = '0'
              s = r'%s*' % fill + s

          # allow numbers to be prefixed with a sign
          s = r'[-+ ]?' + s

      if not fill:
          fill = ' '

      # Place into a group now - this captures the value we want to keep.
      # Everything else from now is just padding to be stripped off
      if wrap:
          s = wrap % s
          self._group_index += 1

      if format['width']:
          # all we really care about is that if the format originally
          # specified a width then there will probably be padding - without
          # an explicit alignment that'll mean right alignment with spaces
          # padding
          if not align:
              align = '>'

      # the fill character is placed into the regex verbatim, so escape it
      # when it is a regex metacharacter
      if fill in r'.\+?*[](){}^$':
          fill = '\\' + fill

      # align "=" has been handled
      if align == '<':
          s = '%s%s*' % (s, fill)
      elif align == '>':
          s = '%s*%s' % (fill, s)
      elif align == '^':
          s = '%s*%s%s*' % (fill, s, fill)

      return s
  class Result(object):
      '''The result of a parse() or search().

      Fixed results may be looked up using `result[index]`; a slice such as
      `result[0:2]` returns the corresponding tuple of fixed results.
      Named results may be looked up using `result['name']`.
      Named results may be tested for existence using `'name' in result`.
      '''
      def __init__(self, fixed, named, spans):
          # fixed: the fixed-position values; named: the named values;
          # spans: the match span for each field
          self.fixed = fixed
          self.named = named
          self.spans = spans

      def __getitem__(self, item):
          '''Look up a fixed result by integer index or slice, or a named
          result by any other key.
          '''
          if isinstance(item, (int, slice)):
              return self.fixed[item]
          return self.named[item]

      def __repr__(self):
          return '<%s %r %r>' % (self.__class__.__name__, self.fixed,
              self.named)

      def __contains__(self, name):
          '''Return True when "name" is one of the named results.'''
          return name in self.named
  class Match(object):
      '''Returned by parse() or search() in place of a Result when result
      evaluation was not requested.

      It pairs the raw regex match object with the parser that produced it,
      so that the Result can be generated later through evaluate_result().
      '''
      def __init__(self, parser, match):
          self.parser, self.match = parser, match

      def evaluate_result(self):
          '''Generate results for this Match'''
          owner = self.parser
          return owner.evaluate_result(self.match)
  977. class ResultIterator(object):
  978. '''The result of a findall() operation.
  979. Each element is a Result instance.
  980. '''
  981. def __init__(self, parser, string, pos, endpos, evaluate_result=True):
  982. self.parser = parser
  983. self.string = string
  984. self.pos = pos
  985. self.endpos = endpos
  986. self.evaluate_result = evaluate_result
  987. def __iter__(self):
  988. return self
  989. def __next__(self):
  990. m = self.parser._search_re.search(self.string, self.pos, self.endpos)
  991. if m is None:
  992. raise StopIteration()
  993. self.pos = m.end()
  994. if self.evaluate_result:
  995. return self.parser.evaluate_result(m)
  996. else:
  997. return Match(self.parser, m)
  998. # pre-py3k compat
  999. next = __next__
  def parse(format, string, extra_types=None, evaluate_result=True, case_sensitive=False):
      '''Using "format" attempt to pull values from "string".

      The format must match the string contents exactly. If the value
      you're looking for is instead just a part of the string use
      search().

      If ``evaluate_result`` is True the return value will be a Result instance
      with two attributes:

       .fixed - tuple of fixed-position values from the string
       .named - dict of named values from the string

      If ``evaluate_result`` is False the return value will be a Match instance
      with one method:

       .evaluate_result() - This will return a Result instance like you would get
                            with ``evaluate_result`` set to True

      The default behaviour is to match strings case insensitively. You may match
      with case by specifying case_sensitive=True.

      If the format is invalid a ValueError will be raised.

      See the module documentation for the use of "extra_types".

      In the case there is no match parse() will return None.
      '''
      return Parser(
          format, extra_types=extra_types, case_sensitive=case_sensitive
      ).parse(string, evaluate_result=evaluate_result)
  def search(format, string, pos=0, endpos=None, extra_types=None, evaluate_result=True,
          case_sensitive=False):
      '''Search "string" for the first occurrence of "format".

      The format may occur anywhere within the string. If
      instead you wish for the format to exactly match the string
      use parse().

      Optionally start the search at "pos" character index and limit the search
      to a maximum index of endpos - equivalent to search(string[:endpos]).

      If ``evaluate_result`` is True the return value will be a Result instance
      with two attributes:

       .fixed - tuple of fixed-position values from the string
       .named - dict of named values from the string

      If ``evaluate_result`` is False the return value will be a Match instance
      with one method:

       .evaluate_result() - This will return a Result instance like you would get
                            with ``evaluate_result`` set to True

      The default behaviour is to match strings case insensitively. You may match
      with case by specifying case_sensitive=True.

      If the format is invalid a ValueError will be raised.

      See the module documentation for the use of "extra_types".

      In the case there is no match search() will return None.
      '''
      return Parser(
          format, extra_types=extra_types, case_sensitive=case_sensitive
      ).search(string, pos, endpos, evaluate_result=evaluate_result)
  def findall(format, string, pos=0, endpos=None, extra_types=None, evaluate_result=True,
          case_sensitive=False):
      '''Search "string" for all occurrences of "format".

      You will be returned an iterator that holds Result instances
      for each format match found.

      Optionally start the search at "pos" character index and limit the search
      to a maximum index of endpos - equivalent to search(string[:endpos]).

      If ``evaluate_result`` is True each returned Result instance has two attributes:

       .fixed - tuple of fixed-position values from the string
       .named - dict of named values from the string

      If ``evaluate_result`` is False each returned value is a Match instance
      with one method:

       .evaluate_result() - This will return a Result instance like you would get
                            with ``evaluate_result`` set to True

      The default behaviour is to match strings case insensitively. You may match
      with case by specifying case_sensitive=True.

      If the format is invalid a ValueError will be raised.

      See the module documentation for the use of "extra_types".
      '''
      # use the parser that was built with case_sensitive - building a second
      # one without it silently ignored the caller's setting
      p = Parser(format, extra_types=extra_types, case_sensitive=case_sensitive)
      return p.findall(string, pos, endpos, evaluate_result=evaluate_result)
  def compile(format, extra_types=None, case_sensitive=False):
      '''Create a Parser instance to parse "format".

      The resultant Parser has a method .parse(string) which
      behaves in the same manner as parse(format, string).

      The default behaviour is to match strings case insensitively. You may match
      with case by specifying case_sensitive=True.

      Use this function if you intend to parse many strings
      with the same format.

      See the module documentation for the use of "extra_types".

      Returns a Parser instance.
      '''
      # case_sensitive has to be forwarded, otherwise the argument has no effect
      return Parser(format, extra_types=extra_types, case_sensitive=case_sensitive)
  1073. # Copyright (c) 2012-2019 Richard Jones <richard@python.org>
  1074. #
  1075. # Permission is hereby granted, free of charge, to any person obtaining a copy
  1076. # of this software and associated documentation files (the "Software"), to deal
  1077. # in the Software without restriction, including without limitation the rights
  1078. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  1079. # copies of the Software, and to permit persons to whom the Software is
  1080. # furnished to do so, subject to the following conditions:
  1081. #
  1082. # The above copyright notice and this permission notice shall be included in
  1083. # all copies or substantial portions of the Software.
  1084. #
  1085. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  1086. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  1087. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  1088. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  1089. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  1090. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  1091. # SOFTWARE.
  1092. # vim: set filetype=python ts=4 sw=4 et si tw=75