parser.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569
  1. # -*- coding: utf-8 -*-
  2. from __future__ import absolute_import, unicode_literals
  3. import re
  4. from datetime import datetime, timedelta
  5. from dateutil import tz
  6. from arrow import locales
  7. from arrow.constants import MAX_TIMESTAMP, MAX_TIMESTAMP_MS, MAX_TIMESTAMP_US
  8. from arrow.util import iso_to_gregorian
  9. try:
  10. from functools import lru_cache
  11. except ImportError: # pragma: no cover
  12. from backports.functools_lru_cache import lru_cache # pragma: no cover
  class ParserError(ValueError):
      """Base error raised by the parsers in this module.

      It subclasses ``ValueError`` so callers that catch ``ValueError`` also
      catch parsing failures.
      """
  class ParserMatchError(ParserError):
      """Signals that an input string did not match a format pattern.

      Keeping match failures in their own subclass lets _parse_multiformat()
      swallow only those while trying candidate formats. A plain ParserError
      raised from _build_datetime() (for example when day_of_year errors
      occur) is therefore propagated, and its message reaches the user
      instead of being hidden by the try/except in _parse_multiformat().
      """
  22. class DateTimeParser(object):
  23. _FORMAT_RE = re.compile(
  24. r"(YYY?Y?|MM?M?M?|Do|DD?D?D?|d?d?d?d|HH?|hh?|mm?|ss?|S+|ZZ?Z?|a|A|x|X|W)"
  25. )
  26. _ESCAPE_RE = re.compile(r"\[[^\[\]]*\]")
  27. _ONE_OR_TWO_DIGIT_RE = re.compile(r"\d{1,2}")
  28. _ONE_OR_TWO_OR_THREE_DIGIT_RE = re.compile(r"\d{1,3}")
  29. _ONE_OR_MORE_DIGIT_RE = re.compile(r"\d+")
  30. _TWO_DIGIT_RE = re.compile(r"\d{2}")
  31. _THREE_DIGIT_RE = re.compile(r"\d{3}")
  32. _FOUR_DIGIT_RE = re.compile(r"\d{4}")
  33. _TZ_Z_RE = re.compile(r"([\+\-])(\d{2})(?:(\d{2}))?|Z")
  34. _TZ_ZZ_RE = re.compile(r"([\+\-])(\d{2})(?:\:(\d{2}))?|Z")
  35. _TZ_NAME_RE = re.compile(r"\w[\w+\-/]+")
  36. # NOTE: timestamps cannot be parsed from natural language strings (by removing the ^...$) because it will
  37. # break cases like "15 Jul 2000" and a format list (see issue #447)
  38. _TIMESTAMP_RE = re.compile(r"^\-?\d+\.?\d+$")
  39. _TIMESTAMP_EXPANDED_RE = re.compile(r"^\-?\d+$")
  40. _TIME_RE = re.compile(r"^(\d{2})(?:\:?(\d{2}))?(?:\:?(\d{2}))?(?:([\.\,])(\d+))?$")
  41. _WEEK_DATE_RE = re.compile(r"(?P<year>\d{4})[\-]?W(?P<week>\d{2})[\-]?(?P<day>\d)?")
  42. _BASE_INPUT_RE_MAP = {
  43. "YYYY": _FOUR_DIGIT_RE,
  44. "YY": _TWO_DIGIT_RE,
  45. "MM": _TWO_DIGIT_RE,
  46. "M": _ONE_OR_TWO_DIGIT_RE,
  47. "DDDD": _THREE_DIGIT_RE,
  48. "DDD": _ONE_OR_TWO_OR_THREE_DIGIT_RE,
  49. "DD": _TWO_DIGIT_RE,
  50. "D": _ONE_OR_TWO_DIGIT_RE,
  51. "HH": _TWO_DIGIT_RE,
  52. "H": _ONE_OR_TWO_DIGIT_RE,
  53. "hh": _TWO_DIGIT_RE,
  54. "h": _ONE_OR_TWO_DIGIT_RE,
  55. "mm": _TWO_DIGIT_RE,
  56. "m": _ONE_OR_TWO_DIGIT_RE,
  57. "ss": _TWO_DIGIT_RE,
  58. "s": _ONE_OR_TWO_DIGIT_RE,
  59. "X": _TIMESTAMP_RE,
  60. "x": _TIMESTAMP_EXPANDED_RE,
  61. "ZZZ": _TZ_NAME_RE,
  62. "ZZ": _TZ_ZZ_RE,
  63. "Z": _TZ_Z_RE,
  64. "S": _ONE_OR_MORE_DIGIT_RE,
  65. "W": _WEEK_DATE_RE,
  66. }
  67. SEPARATORS = ["-", "/", "."]
  def __init__(self, locale="en_us", cache_size=0):
      """Prepare the token-to-regex table for the requested locale.

      :param locale: locale name handed to ``locales.get_locale``.
      :param cache_size: when greater than zero, ``_generate_pattern_re`` is
          wrapped (per instance) in an ``lru_cache`` of that maximum size.
      """
      self.locale = locales.get_locale(locale)
      loc = self.locale
      choice_re = self._generate_choice_re

      # Start from the locale-independent patterns and layer the
      # locale-dependent ones on top of a private copy.
      token_patterns = self._BASE_INPUT_RE_MAP.copy()
      token_patterns["MMMM"] = choice_re(loc.month_names[1:], re.IGNORECASE)
      token_patterns["MMM"] = choice_re(loc.month_abbreviations[1:], re.IGNORECASE)
      token_patterns["Do"] = re.compile(loc.ordinal_day_re)
      token_patterns["dddd"] = choice_re(loc.day_names[1:], re.IGNORECASE)
      token_patterns["ddd"] = choice_re(loc.day_abbreviations[1:], re.IGNORECASE)
      token_patterns["d"] = re.compile(r"[1-7]")
      token_patterns["a"] = choice_re((loc.meridians["am"], loc.meridians["pm"]))
      # note: 'A' token accepts both 'am/pm' and 'AM/PM' formats to
      # ensure backwards compatibility of this token
      token_patterns["A"] = choice_re(loc.meridians.values())
      self._input_re_map = token_patterns

      if cache_size > 0:
          cached = lru_cache(maxsize=cache_size)
          self._generate_pattern_re = cached(self._generate_pattern_re)
  99. # TODO: since we support more than ISO 8601, we should rename this function
  100. # IDEA: break into multiple functions
  101. def parse_iso(self, datetime_string):
  102. # TODO: add a flag to normalize whitespace (useful in logs, ref issue #421)
  103. has_space_divider = " " in datetime_string
  104. has_t_divider = "T" in datetime_string
  105. num_spaces = datetime_string.count(" ")
  106. if has_space_divider and num_spaces != 1 or has_t_divider and num_spaces > 0:
  107. raise ParserError(
  108. "Expected an ISO 8601-like string, but was given '{}'. Try passing in a format string to resolve this.".format(
  109. datetime_string
  110. )
  111. )
  112. has_time = has_space_divider or has_t_divider
  113. has_tz = False
  114. # date formats (ISO 8601 and others) to test against
  115. # NOTE: YYYYMM is omitted to avoid confusion with YYMMDD (no longer part of ISO 8601, but is still often used)
  116. formats = [
  117. "YYYY-MM-DD",
  118. "YYYY-M-DD",
  119. "YYYY-M-D",
  120. "YYYY/MM/DD",
  121. "YYYY/M/DD",
  122. "YYYY/M/D",
  123. "YYYY.MM.DD",
  124. "YYYY.M.DD",
  125. "YYYY.M.D",
  126. "YYYYMMDD",
  127. "YYYY-DDDD",
  128. "YYYYDDDD",
  129. "YYYY-MM",
  130. "YYYY/MM",
  131. "YYYY.MM",
  132. "YYYY",
  133. "W",
  134. ]
  135. if has_time:
  136. if has_space_divider:
  137. date_string, time_string = datetime_string.split(" ", 1)
  138. else:
  139. date_string, time_string = datetime_string.split("T", 1)
  140. time_parts = re.split(r"[\+\-Z]", time_string, 1, re.IGNORECASE)
  141. time_components = self._TIME_RE.match(time_parts[0])
  142. if time_components is None:
  143. raise ParserError(
  144. "Invalid time component provided. Please specify a format or provide a valid time component in the basic or extended ISO 8601 time format."
  145. )
  146. (
  147. hours,
  148. minutes,
  149. seconds,
  150. subseconds_sep,
  151. subseconds,
  152. ) = time_components.groups()
  153. has_tz = len(time_parts) == 2
  154. has_minutes = minutes is not None
  155. has_seconds = seconds is not None
  156. has_subseconds = subseconds is not None
  157. is_basic_time_format = ":" not in time_parts[0]
  158. tz_format = "Z"
  159. # use 'ZZ' token instead since tz offset is present in non-basic format
  160. if has_tz and ":" in time_parts[1]:
  161. tz_format = "ZZ"
  162. time_sep = "" if is_basic_time_format else ":"
  163. if has_subseconds:
  164. time_string = "HH{time_sep}mm{time_sep}ss{subseconds_sep}S".format(
  165. time_sep=time_sep, subseconds_sep=subseconds_sep
  166. )
  167. elif has_seconds:
  168. time_string = "HH{time_sep}mm{time_sep}ss".format(time_sep=time_sep)
  169. elif has_minutes:
  170. time_string = "HH{time_sep}mm".format(time_sep=time_sep)
  171. else:
  172. time_string = "HH"
  173. if has_space_divider:
  174. formats = ["{} {}".format(f, time_string) for f in formats]
  175. else:
  176. formats = ["{}T{}".format(f, time_string) for f in formats]
  177. if has_time and has_tz:
  178. # Add "Z" or "ZZ" to the format strings to indicate to
  179. # _parse_token() that a timezone needs to be parsed
  180. formats = ["{}{}".format(f, tz_format) for f in formats]
  181. return self._parse_multiformat(datetime_string, formats)
  182. def parse(self, datetime_string, fmt):
  183. if isinstance(fmt, list):
  184. return self._parse_multiformat(datetime_string, fmt)
  185. fmt_tokens, fmt_pattern_re = self._generate_pattern_re(fmt)
  186. match = fmt_pattern_re.search(datetime_string)
  187. if match is None:
  188. raise ParserMatchError(
  189. "Failed to match '{}' when parsing '{}'".format(fmt, datetime_string)
  190. )
  191. parts = {}
  192. for token in fmt_tokens:
  193. if token == "Do":
  194. value = match.group("value")
  195. elif token == "W":
  196. value = (match.group("year"), match.group("week"), match.group("day"))
  197. else:
  198. value = match.group(token)
  199. self._parse_token(token, value, parts)
  200. return self._build_datetime(parts)
  201. def _generate_pattern_re(self, fmt):
  202. # fmt is a string of tokens like 'YYYY-MM-DD'
  203. # we construct a new string by replacing each
  204. # token by its pattern:
  205. # 'YYYY-MM-DD' -> '(?P<YYYY>\d{4})-(?P<MM>\d{2})-(?P<DD>\d{2})'
  206. tokens = []
  207. offset = 0
  208. # Escape all special RegEx chars
  209. escaped_fmt = re.escape(fmt)
  210. # Extract the bracketed expressions to be reinserted later.
  211. escaped_fmt = re.sub(self._ESCAPE_RE, "#", escaped_fmt)
  212. # Any number of S is the same as one.
  213. # TODO: allow users to specify the number of digits to parse
  214. escaped_fmt = re.sub(r"S+", "S", escaped_fmt)
  215. escaped_data = re.findall(self._ESCAPE_RE, fmt)
  216. fmt_pattern = escaped_fmt
  217. for m in self._FORMAT_RE.finditer(escaped_fmt):
  218. token = m.group(0)
  219. try:
  220. input_re = self._input_re_map[token]
  221. except KeyError:
  222. raise ParserError("Unrecognized token '{}'".format(token))
  223. input_pattern = "(?P<{}>{})".format(token, input_re.pattern)
  224. tokens.append(token)
  225. # a pattern doesn't have the same length as the token
  226. # it replaces! We keep the difference in the offset variable.
  227. # This works because the string is scanned left-to-right and matches
  228. # are returned in the order found by finditer.
  229. fmt_pattern = (
  230. fmt_pattern[: m.start() + offset]
  231. + input_pattern
  232. + fmt_pattern[m.end() + offset :]
  233. )
  234. offset += len(input_pattern) - (m.end() - m.start())
  235. final_fmt_pattern = ""
  236. split_fmt = fmt_pattern.split(r"\#")
  237. # Due to the way Python splits, 'split_fmt' will always be longer
  238. for i in range(len(split_fmt)):
  239. final_fmt_pattern += split_fmt[i]
  240. if i < len(escaped_data):
  241. final_fmt_pattern += escaped_data[i][1:-1]
  242. # Wrap final_fmt_pattern in a custom word boundary to strictly
  243. # match the formatting pattern and filter out date and time formats
  244. # that include junk such as: blah1998-09-12 blah, blah 1998-09-12blah,
  245. # blah1998-09-12blah. The custom word boundary matches every character
  246. # that is not a whitespace character to allow for searching for a date
  247. # and time string in a natural language sentence. Therefore, searching
  248. # for a string of the form YYYY-MM-DD in "blah 1998-09-12 blah" will
  249. # work properly.
  250. # Certain punctuation before or after the target pattern such as
  251. # "1998-09-12," is permitted. For the full list of valid punctuation,
  252. # see the documentation.
  253. starting_word_boundary = (
  254. r"(?<!\S\S)" # Don't have two consecutive non-whitespace characters. This ensures that we allow cases like .11.25.2019 but not 1.11.25.2019 (for pattern MM.DD.YYYY)
  255. r"(?<![^\,\.\;\:\?\!\"\'\`\[\]\{\}\(\)<>\s])" # This is the list of punctuation that is ok before the pattern (i.e. "It can't not be these characters before the pattern")
  256. r"(\b|^)" # The \b is to block cases like 1201912 but allow 201912 for pattern YYYYMM. The ^ was necessary to allow a negative number through i.e. before epoch numbers
  257. )
  258. ending_word_boundary = (
  259. r"(?=[\,\.\;\:\?\!\"\'\`\[\]\{\}\(\)\<\>]?" # Positive lookahead stating that these punctuation marks can appear after the pattern at most 1 time
  260. r"(?!\S))" # Don't allow any non-whitespace character after the punctuation
  261. )
  262. bounded_fmt_pattern = r"{}{}{}".format(
  263. starting_word_boundary, final_fmt_pattern, ending_word_boundary
  264. )
  265. return tokens, re.compile(bounded_fmt_pattern, flags=re.IGNORECASE)
  266. def _parse_token(self, token, value, parts):
  267. if token == "YYYY":
  268. parts["year"] = int(value)
  269. elif token == "YY":
  270. value = int(value)
  271. parts["year"] = 1900 + value if value > 68 else 2000 + value
  272. elif token in ["MMMM", "MMM"]:
  273. parts["month"] = self.locale.month_number(value.lower())
  274. elif token in ["MM", "M"]:
  275. parts["month"] = int(value)
  276. elif token in ["DDDD", "DDD"]:
  277. parts["day_of_year"] = int(value)
  278. elif token in ["DD", "D"]:
  279. parts["day"] = int(value)
  280. elif token in ["Do"]:
  281. parts["day"] = int(value)
  282. elif token.upper() in ["HH", "H"]:
  283. parts["hour"] = int(value)
  284. elif token in ["mm", "m"]:
  285. parts["minute"] = int(value)
  286. elif token in ["ss", "s"]:
  287. parts["second"] = int(value)
  288. elif token == "S":
  289. # We have the *most significant* digits of an arbitrary-precision integer.
  290. # We want the six most significant digits as an integer, rounded.
  291. # IDEA: add nanosecond support somehow? Need datetime support for it first.
  292. value = value.ljust(7, str("0"))
  293. # floating-point (IEEE-754) defaults to half-to-even rounding
  294. seventh_digit = int(value[6])
  295. if seventh_digit == 5:
  296. rounding = int(value[5]) % 2
  297. elif seventh_digit > 5:
  298. rounding = 1
  299. else:
  300. rounding = 0
  301. parts["microsecond"] = int(value[:6]) + rounding
  302. elif token == "X":
  303. parts["timestamp"] = float(value)
  304. elif token == "x":
  305. parts["expanded_timestamp"] = int(value)
  306. elif token in ["ZZZ", "ZZ", "Z"]:
  307. parts["tzinfo"] = TzinfoParser.parse(value)
  308. elif token in ["a", "A"]:
  309. if value in (self.locale.meridians["am"], self.locale.meridians["AM"]):
  310. parts["am_pm"] = "am"
  311. elif value in (self.locale.meridians["pm"], self.locale.meridians["PM"]):
  312. parts["am_pm"] = "pm"
  313. elif token == "W":
  314. parts["weekdate"] = value
  315. @staticmethod
  316. def _build_datetime(parts):
  317. weekdate = parts.get("weekdate")
  318. if weekdate is not None:
  319. # we can use strptime (%G, %V, %u) in python 3.6 but these tokens aren't available before that
  320. year, week = int(weekdate[0]), int(weekdate[1])
  321. if weekdate[2] is not None:
  322. day = int(weekdate[2])
  323. else:
  324. # day not given, default to 1
  325. day = 1
  326. dt = iso_to_gregorian(year, week, day)
  327. parts["year"] = dt.year
  328. parts["month"] = dt.month
  329. parts["day"] = dt.day
  330. timestamp = parts.get("timestamp")
  331. if timestamp is not None:
  332. return datetime.fromtimestamp(timestamp, tz=tz.tzutc())
  333. expanded_timestamp = parts.get("expanded_timestamp")
  334. if expanded_timestamp is not None:
  335. if expanded_timestamp > MAX_TIMESTAMP:
  336. if expanded_timestamp < MAX_TIMESTAMP_MS:
  337. expanded_timestamp /= 1000.0
  338. elif expanded_timestamp < MAX_TIMESTAMP_US:
  339. expanded_timestamp /= 1000000.0
  340. else:
  341. raise ValueError(
  342. "The specified timestamp '{}' is too large.".format(
  343. expanded_timestamp
  344. )
  345. )
  346. return datetime.fromtimestamp(expanded_timestamp, tz=tz.tzutc())
  347. day_of_year = parts.get("day_of_year")
  348. if day_of_year is not None:
  349. year = parts.get("year")
  350. month = parts.get("month")
  351. if year is None:
  352. raise ParserError(
  353. "Year component is required with the DDD and DDDD tokens."
  354. )
  355. if month is not None:
  356. raise ParserError(
  357. "Month component is not allowed with the DDD and DDDD tokens."
  358. )
  359. date_string = "{}-{}".format(year, day_of_year)
  360. try:
  361. dt = datetime.strptime(date_string, "%Y-%j")
  362. except ValueError:
  363. raise ParserError(
  364. "The provided day of year '{}' is invalid.".format(day_of_year)
  365. )
  366. parts["year"] = dt.year
  367. parts["month"] = dt.month
  368. parts["day"] = dt.day
  369. am_pm = parts.get("am_pm")
  370. hour = parts.get("hour", 0)
  371. if am_pm == "pm" and hour < 12:
  372. hour += 12
  373. elif am_pm == "am" and hour == 12:
  374. hour = 0
  375. # Support for midnight at the end of day
  376. if hour == 24:
  377. if parts.get("minute", 0) != 0:
  378. raise ParserError("Midnight at the end of day must not contain minutes")
  379. if parts.get("second", 0) != 0:
  380. raise ParserError("Midnight at the end of day must not contain seconds")
  381. if parts.get("microsecond", 0) != 0:
  382. raise ParserError(
  383. "Midnight at the end of day must not contain microseconds"
  384. )
  385. hour = 0
  386. day_increment = 1
  387. else:
  388. day_increment = 0
  389. # account for rounding up to 1000000
  390. microsecond = parts.get("microsecond", 0)
  391. if microsecond == 1000000:
  392. microsecond = 0
  393. second_increment = 1
  394. else:
  395. second_increment = 0
  396. increment = timedelta(days=day_increment, seconds=second_increment)
  397. return (
  398. datetime(
  399. year=parts.get("year", 1),
  400. month=parts.get("month", 1),
  401. day=parts.get("day", 1),
  402. hour=hour,
  403. minute=parts.get("minute", 0),
  404. second=parts.get("second", 0),
  405. microsecond=microsecond,
  406. tzinfo=parts.get("tzinfo"),
  407. )
  408. + increment
  409. )
  410. def _parse_multiformat(self, string, formats):
  411. _datetime = None
  412. for fmt in formats:
  413. try:
  414. _datetime = self.parse(string, fmt)
  415. break
  416. except ParserMatchError:
  417. pass
  418. if _datetime is None:
  419. raise ParserError(
  420. "Could not match input '{}' to any of the following formats: {}".format(
  421. string, ", ".join(formats)
  422. )
  423. )
  424. return _datetime
  425. # generates a capture group of choices separated by an OR operator
  426. @staticmethod
  427. def _generate_choice_re(choices, flags=0):
  428. return re.compile(r"({})".format("|".join(choices)), flags=flags)
  class TzinfoParser(object):
      """Turns a timezone expression into a ``tzinfo`` object."""

      # optional sign, two-digit hours, optional two-digit minutes
      # (with or without a colon in between)
      _TZINFO_RE = re.compile(r"^([\+\-])?(\d{2})(?:\:?(\d{2}))?$")

      @classmethod
      def parse(cls, tzinfo_string):
          """Resolve ``tzinfo_string`` to a timezone.

          Accepted forms, checked in this order: ``"local"``; ``"utc"``,
          ``"UTC"`` or ``"Z"``; a numeric offset such as ``+01:00``,
          ``-0130`` or ``02``; anything else is looked up with
          ``tz.gettz``.

          :raises ParserError: if the lookup yields no timezone.
          """
          if tzinfo_string == "local":
              return tz.tzlocal()

          if tzinfo_string in ("utc", "UTC", "Z"):
              return tz.tzutc()

          offset_match = cls._TZINFO_RE.match(tzinfo_string)
          if offset_match:
              sign, hours, minutes = offset_match.groups()
              # the minutes group is None when only hours were given
              offset_seconds = int(hours) * 3600 + int(minutes or 0) * 60
              if sign == "-":
                  offset_seconds = -offset_seconds
              return tz.tzoffset(None, offset_seconds)

          named_zone = tz.gettz(tzinfo_string)
          if named_zone is None:
              raise ParserError(
                  'Could not parse timezone expression "{}"'.format(tzinfo_string)
              )
          return named_zone