test_parse_dates.py 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tests date parsing functionality for all of the
  4. parsers defined in parsers.py
  5. """
  6. from datetime import date, datetime
  7. import numpy as np
  8. import pytest
  9. import pytz
  10. from pandas._libs.tslib import Timestamp
  11. from pandas._libs.tslibs import parsing
  12. from pandas.compat import StringIO, lrange, parse_date
  13. from pandas.compat.numpy import np_array_datetime64_compat
  14. import pandas as pd
  15. from pandas import DataFrame, DatetimeIndex, Index, MultiIndex
  16. from pandas.core.indexes.datetimes import date_range
  17. import pandas.util.testing as tm
  18. import pandas.io.date_converters as conv
  19. import pandas.io.parsers as parsers
  20. def test_separator_date_conflict(all_parsers):
  21. # Regression test for gh-4678
  22. #
  23. # Make sure thousands separator and
  24. # date parsing do not conflict.
  25. parser = all_parsers
  26. data = "06-02-2013;13:00;1-000.215"
  27. expected = DataFrame([[datetime(2013, 6, 2, 13, 0, 0), 1000.215]],
  28. columns=["Date", 2])
  29. df = parser.read_csv(StringIO(data), sep=";", thousands="-",
  30. parse_dates={"Date": [0, 1]}, header=None)
  31. tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("keep_date_col", [True, False])
def test_multiple_date_col_custom(all_parsers, keep_date_col):
    # Merge raw columns [1, 2] and [1, 3] into the "actual" and
    # "nominal" date columns via a user-supplied ``date_parser``,
    # optionally keeping the original raw columns.
    data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    parser = all_parsers

    def date_parser(*date_cols):
        """
        Test date parser.

        Parameters
        ----------
        date_cols : args
            The list of data columns to parse.

        Returns
        -------
        parsed : Series
        """
        return parsing.try_parse_dates(parsers._concat_date_cols(date_cols))

    result = parser.read_csv(StringIO(data), header=None,
                             date_parser=date_parser, prefix="X",
                             parse_dates={"actual": [1, 2],
                                          "nominal": [1, 3]},
                             keep_date_col=keep_date_col)
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
         "KORD", "19990127", " 19:00:00", " 18:56:00",
         0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
         "KORD", "19990127", " 20:00:00", " 19:56:00",
         0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
         "KORD", "19990127", " 21:00:00", " 20:56:00",
         -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
         "KORD", "19990127", " 21:00:00", " 21:18:00",
         -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
         "KORD", "19990127", " 22:00:00", " 21:56:00",
         -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
         "KORD", "19990127", " 23:00:00", " 22:56:00",
         -0.59, 1.71, 4.6, 0.0, 280.0],
    ], columns=["actual", "nominal", "X0", "X1", "X2",
                "X3", "X4", "X5", "X6", "X7", "X8"])

    if not keep_date_col:
        expected = expected.drop(["X1", "X2", "X3"], axis=1)
    elif parser.engine == "python":
        # The Python engine reads the raw date column back as int64.
        expected["X1"] = expected["X1"].astype(np.int64)

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("keep_date_col", [True, False])
def test_multiple_date_col(all_parsers, keep_date_col):
    # Positional column pairs [1, 2] and [1, 3] merge into the
    # auto-named "X1_X2" and "X1_X3" date columns (default parser).
    data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), header=None,
                             prefix="X", parse_dates=[[1, 2], [1, 3]],
                             keep_date_col=keep_date_col)
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
         "KORD", "19990127", " 19:00:00", " 18:56:00",
         0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
         "KORD", "19990127", " 20:00:00", " 19:56:00",
         0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
         "KORD", "19990127", " 21:00:00", " 20:56:00",
         -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
         "KORD", "19990127", " 21:00:00", " 21:18:00",
         -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
         "KORD", "19990127", " 22:00:00", " 21:56:00",
         -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
         "KORD", "19990127", " 23:00:00", " 22:56:00",
         -0.59, 1.71, 4.6, 0.0, 280.0],
    ], columns=["X1_X2", "X1_X3", "X0", "X1", "X2",
                "X3", "X4", "X5", "X6", "X7", "X8"])

    if not keep_date_col:
        expected = expected.drop(["X1", "X2", "X3"], axis=1)
    elif parser.engine == "python":
        # The Python engine reads the raw date column back as int64.
        expected["X1"] = expected["X1"].astype(np.int64)

    tm.assert_frame_equal(result, expected)
def test_date_col_as_index_col(all_parsers):
    # A parsed date column may double as the index column.
    data = """\
KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), header=None, prefix="X",
                             parse_dates=[1], index_col=1)

    index = Index([datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 20, 0),
                   datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 0),
                   datetime(1999, 1, 27, 22, 0)], name="X1")
    expected = DataFrame([
        ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0],
        ["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0],
        ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0],
        ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0],
        ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0],
    ], columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"], index=index)
    tm.assert_frame_equal(result, expected)
def test_multiple_date_cols_int_cast(all_parsers):
    # ``conv.parse_date_time`` must cope with the date column being
    # read back as an integer (e.g. 19990127).
    data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
            "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
            "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
            "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
            "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
            "KORD,19990127, 23:00:00, 22:56:00, -0.5900")
    parse_dates = {"actual": [1, 2], "nominal": [1, 3]}
    parser = all_parsers

    result = parser.read_csv(StringIO(data), header=None,
                             date_parser=conv.parse_date_time,
                             parse_dates=parse_dates, prefix="X")
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
         "KORD", 0.81],
        [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
         "KORD", 0.01],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
         "KORD", -0.59],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
         "KORD", -0.99],
        [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
         "KORD", -0.59],
        [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
         "KORD", -0.59],
    ], columns=["actual", "nominal", "X0", "X4"])

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)
  183. def test_multiple_date_col_timestamp_parse(all_parsers):
  184. parser = all_parsers
  185. data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
  186. 05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25"""
  187. result = parser.read_csv(StringIO(data), parse_dates=[[0, 1]],
  188. header=None, date_parser=Timestamp)
  189. expected = DataFrame([
  190. [Timestamp("05/31/2012, 15:30:00.029"),
  191. 1306.25, 1, "E", 0, np.nan, 1306.25],
  192. [Timestamp("05/31/2012, 15:30:00.029"),
  193. 1306.25, 8, "E", 0, np.nan, 1306.25]
  194. ], columns=["0_1", 2, 3, 4, 5, 6, 7])
  195. tm.assert_frame_equal(result, expected)
def test_multiple_date_cols_with_header(all_parsers):
    # Header columns merged by position into "nominal"; the raw date
    # and time columns are dropped by default.
    parser = all_parsers
    data = """\
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""

    result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]})
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00",
         0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00",
         0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00",
         -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00",
         -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00",
         -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00",
         -0.59, 1.71, 4.6, 0.0, 280.0],
    ], columns=["nominal", "ID", "ActualTime", "TDew",
                "TAir", "Windspeed", "Precip", "WindDir"])
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("data,parse_dates,msg", [
    ("""\
date_NominalTime,date,NominalTime
KORD1,19990127, 19:00:00
KORD2,19990127, 20:00:00""", [[1, 2]], ("New date column already "
                                        "in dict date_NominalTime")),
    ("""\
ID,date,nominalTime
KORD,19990127, 19:00:00
KORD,19990127, 20:00:00""", dict(ID=[1, 2]), "Date column ID already in dict")
])
def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg):
    # Merging date columns must fail loudly when the target name
    # collides with a column that already exists.
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), parse_dates=parse_dates)
  238. def test_date_parser_int_bug(all_parsers):
  239. # see gh-3071
  240. parser = all_parsers
  241. data = ("posix_timestamp,elapsed,sys,user,queries,query_time,rows,"
  242. "accountid,userid,contactid,level,silo,method\n"
  243. "1343103150,0.062353,0,4,6,0.01690,3,"
  244. "12345,1,-1,3,invoice_InvoiceResource,search\n")
  245. result = parser.read_csv(
  246. StringIO(data), index_col=0, parse_dates=[0],
  247. date_parser=lambda x: datetime.utcfromtimestamp(int(x)))
  248. expected = DataFrame([[0.062353, 0, 4, 6, 0.01690, 3, 12345, 1, -1,
  249. 3, "invoice_InvoiceResource", "search"]],
  250. columns=["elapsed", "sys", "user", "queries",
  251. "query_time", "rows", "accountid",
  252. "userid", "contactid", "level",
  253. "silo", "method"],
  254. index=Index([Timestamp("2012-07-24 04:12:30")],
  255. name="posix_timestamp"))
  256. tm.assert_frame_equal(result, expected)
  257. def test_nat_parse(all_parsers):
  258. # see gh-3062
  259. parser = all_parsers
  260. df = DataFrame(dict({"A": np.asarray(lrange(10), dtype="float64"),
  261. "B": pd.Timestamp("20010101")}))
  262. df.iloc[3:6, :] = np.nan
  263. with tm.ensure_clean("__nat_parse_.csv") as path:
  264. df.to_csv(path)
  265. result = parser.read_csv(path, index_col=0, parse_dates=["B"])
  266. tm.assert_frame_equal(result, df)
  267. def test_csv_custom_parser(all_parsers):
  268. data = """A,B,C
  269. 20090101,a,1,2
  270. 20090102,b,3,4
  271. 20090103,c,4,5
  272. """
  273. parser = all_parsers
  274. result = parser.read_csv(
  275. StringIO(data),
  276. date_parser=lambda x: datetime.strptime(x, "%Y%m%d"))
  277. expected = parser.read_csv(StringIO(data), parse_dates=True)
  278. tm.assert_frame_equal(result, expected)
  279. def test_parse_dates_implicit_first_col(all_parsers):
  280. data = """A,B,C
  281. 20090101,a,1,2
  282. 20090102,b,3,4
  283. 20090103,c,4,5
  284. """
  285. parser = all_parsers
  286. result = parser.read_csv(StringIO(data), parse_dates=True)
  287. expected = parser.read_csv(StringIO(data), index_col=0,
  288. parse_dates=True)
  289. tm.assert_frame_equal(result, expected)
  290. def test_parse_dates_string(all_parsers):
  291. data = """date,A,B,C
  292. 20090101,a,1,2
  293. 20090102,b,3,4
  294. 20090103,c,4,5
  295. """
  296. parser = all_parsers
  297. result = parser.read_csv(StringIO(data), index_col="date",
  298. parse_dates=["date"])
  299. index = date_range("1/1/2009", periods=3)
  300. index.name = "date"
  301. expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4],
  302. "C": [2, 4, 5]}, index=index)
  303. tm.assert_frame_equal(result, expected)
# Bug in https://github.com/dateutil/dateutil/issues/217
# has been addressed, but we just don't pass in the `yearfirst`
@pytest.mark.xfail(reason="yearfirst is not surfaced in read_*")
@pytest.mark.parametrize("parse_dates", [
    [["date", "time"]],
    [[0, 1]]
])
def test_yy_format_with_year_first(all_parsers, parse_dates):
    # Two-digit years like "090131" should parse year-first once
    # ``yearfirst`` is exposed through read_csv.
    data = """date,time,B,C
090131,0010,1,2
090228,1020,3,4
090331,0830,5,6
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=0,
                             parse_dates=parse_dates)
    index = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
                           datetime(2009, 2, 28, 10, 20, 0),
                           datetime(2009, 3, 31, 8, 30, 0)],
                          dtype=object, name="date_time")
    expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index)
    tm.assert_frame_equal(result, expected)
  326. @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]])
  327. def test_parse_dates_column_list(all_parsers, parse_dates):
  328. data = "a,b,c\n01/01/2010,1,15/02/2010"
  329. parser = all_parsers
  330. expected = DataFrame({"a": [datetime(2010, 1, 1)], "b": [1],
  331. "c": [datetime(2010, 2, 15)]})
  332. expected = expected.set_index(["a", "b"])
  333. result = parser.read_csv(StringIO(data), index_col=[0, 1],
  334. parse_dates=parse_dates, dayfirst=True)
  335. tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_parse_dates(all_parsers, index_col):
    # Date parsing applies to the datetime level of a MultiIndex
    # regardless of the order in which the index columns are given.
    data = """index1,index2,A,B,C
20090101,one,a,1,2
20090101,two,b,3,4
20090101,three,c,4,5
20090102,one,a,1,2
20090102,two,b,3,4
20090102,three,c,4,5
20090103,one,a,1,2
20090103,two,b,3,4
20090103,three,c,4,5
"""
    parser = all_parsers
    index = MultiIndex.from_product([
        (datetime(2009, 1, 1), datetime(2009, 1, 2),
         datetime(2009, 1, 3)), ("one", "two", "three")],
        names=["index1", "index2"])

    # Out of order.
    if index_col == [1, 0]:
        index = index.swaplevel(0, 1)

    expected = DataFrame([["a", 1, 2], ["b", 3, 4], ["c", 4, 5],
                          ["a", 1, 2], ["b", 3, 4], ["c", 4, 5],
                          ["a", 1, 2], ["b", 3, 4], ["c", 4, 5]],
                         columns=["A", "B", "C"], index=index)
    result = parser.read_csv(StringIO(data), index_col=index_col,
                             parse_dates=True)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("kwargs", [
    dict(dayfirst=True), dict(day_first=True)
])
def test_parse_dates_custom_euro_format(all_parsers, kwargs):
    # ``parse_date(..., dayfirst=True)`` handles European dd/mm dates,
    # while the misspelled ``day_first`` keyword must raise TypeError.
    parser = all_parsers
    data = """foo,bar,baz
31/01/2010,1,2
01/02/2010,1,NA
02/02/2010,1,2
"""
    if "dayfirst" in kwargs:
        df = parser.read_csv(StringIO(data), names=["time", "Q", "NTU"],
                             date_parser=lambda d: parse_date(d, **kwargs),
                             header=0, index_col=0, parse_dates=True,
                             na_values=["NA"])
        exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1),
                           datetime(2010, 2, 2)], name="time")
        expected = DataFrame({"Q": [1, 1, 1], "NTU": [2, np.nan, 2]},
                             index=exp_index, columns=["Q", "NTU"])
        tm.assert_frame_equal(df, expected)
    else:
        msg = "got an unexpected keyword argument 'day_first'"
        with pytest.raises(TypeError, match=msg):
            parser.read_csv(StringIO(data), names=["time", "Q", "NTU"],
                            date_parser=lambda d: parse_date(d, **kwargs),
                            skiprows=[0], index_col=0, parse_dates=True,
                            na_values=["NA"])
  391. def test_parse_tz_aware(all_parsers):
  392. # See gh-1693
  393. parser = all_parsers
  394. data = "Date,x\n2012-06-13T01:39:00Z,0.5"
  395. result = parser.read_csv(StringIO(data), index_col=0,
  396. parse_dates=True)
  397. expected = DataFrame({"x": [0.5]}, index=Index([Timestamp(
  398. "2012-06-13 01:39:00+00:00")], name="Date"))
  399. tm.assert_frame_equal(result, expected)
  400. assert result.index.tz is pytz.utc
@pytest.mark.parametrize("parse_dates,index_col", [
    ({"nominal": [1, 2]}, "nominal"),
    ({"nominal": [1, 2]}, 0),
    ([[1, 2]], 0),
])
def test_multiple_date_cols_index(all_parsers, parse_dates, index_col):
    # A merged date column can also serve as the index, referenced
    # either by name or by position.
    parser = all_parsers
    data = """
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), "KORD1", " 18:56:00",
         0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), "KORD2", " 19:56:00",
         0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), "KORD3", " 20:56:00",
         -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), "KORD4", " 21:18:00",
         -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), "KORD5", " 21:56:00",
         -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), "KORD6", " 22:56:00",
         -0.59, 1.71, 4.6, 0.0, 280.0],
    ], columns=["nominal", "ID", "ActualTime", "TDew",
                "TAir", "Windspeed", "Precip", "WindDir"])
    expected = expected.set_index("nominal")

    if not isinstance(parse_dates, dict):
        # The list form derives the merged column's name automatically.
        expected.index.name = "date_NominalTime"

    result = parser.read_csv(StringIO(data), parse_dates=parse_dates,
                             index_col=index_col)
    tm.assert_frame_equal(result, expected)
def test_multiple_date_cols_chunked(all_parsers):
    # A merged date column used as the index must survive chunked reads.
    parser = all_parsers
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00",
         0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00",
         0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00",
         -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00",
         -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00",
         -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00",
         -0.59, 1.71, 4.6, 0.0, 280.0],
    ], columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"])
    expected = expected.set_index("nominal")

    reader = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]},
                             index_col="nominal", chunksize=2)
    chunks = list(reader)

    # Three chunks of two rows each.
    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])
  470. def test_multiple_date_col_named_index_compat(all_parsers):
  471. parser = all_parsers
  472. data = """\
  473. ID,date,nominalTime,actualTime,A,B,C,D,E
  474. KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
  475. KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
  476. KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
  477. KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
  478. KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
  479. KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
  480. """
  481. with_indices = parser.read_csv(StringIO(data),
  482. parse_dates={"nominal": [1, 2]},
  483. index_col="nominal")
  484. with_names = parser.read_csv(StringIO(data), index_col="nominal",
  485. parse_dates={"nominal": [
  486. "date", "nominalTime"]})
  487. tm.assert_frame_equal(with_indices, with_names)
  488. def test_multiple_date_col_multiple_index_compat(all_parsers):
  489. parser = all_parsers
  490. data = """\
  491. ID,date,nominalTime,actualTime,A,B,C,D,E
  492. KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
  493. KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
  494. KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
  495. KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
  496. KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
  497. KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
  498. """
  499. result = parser.read_csv(StringIO(data), index_col=["nominal", "ID"],
  500. parse_dates={"nominal": [1, 2]})
  501. expected = parser.read_csv(StringIO(data),
  502. parse_dates={"nominal": [1, 2]})
  503. expected = expected.set_index(["nominal", "ID"])
  504. tm.assert_frame_equal(result, expected)
  505. @pytest.mark.parametrize("kwargs", [dict(), dict(index_col="C")])
  506. def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs):
  507. # see gh-5636
  508. parser = all_parsers
  509. msg = ("Only booleans, lists, and dictionaries "
  510. "are accepted for the 'parse_dates' parameter")
  511. data = """A,B,C
  512. 1,2,2003-11-1"""
  513. with pytest.raises(TypeError, match=msg):
  514. parser.read_csv(StringIO(data), parse_dates="C", **kwargs)
  515. @pytest.mark.parametrize("parse_dates", [
  516. (1,), np.array([4, 5]), {1, 3, 3}
  517. ])
  518. def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates):
  519. parser = all_parsers
  520. msg = ("Only booleans, lists, and dictionaries "
  521. "are accepted for the 'parse_dates' parameter")
  522. data = """A,B,C
  523. 1,2,2003-11-1"""
  524. with pytest.raises(TypeError, match=msg):
  525. parser.read_csv(StringIO(data), parse_dates=(1,))
  526. def test_parse_dates_empty_string(all_parsers):
  527. # see gh-2263
  528. parser = all_parsers
  529. data = "Date,test\n2012-01-01,1\n,2"
  530. result = parser.read_csv(StringIO(data), parse_dates=["Date"],
  531. na_filter=False)
  532. expected = DataFrame([[datetime(2012, 1, 1), 1], [pd.NaT, 2]],
  533. columns=["Date", "test"])
  534. tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("data,kwargs,expected", [
    ("a\n04.15.2016", dict(parse_dates=["a"]),
     DataFrame([datetime(2016, 4, 15)], columns=["a"])),
    ("a\n04.15.2016", dict(parse_dates=True, index_col=0),
     DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"))),
    ("a,b\n04.15.2016,09.16.2013", dict(parse_dates=["a", "b"]),
     DataFrame([[datetime(2016, 4, 15), datetime(2013, 9, 16)]],
               columns=["a", "b"])),
    ("a,b\n04.15.2016,09.16.2013", dict(parse_dates=True, index_col=[0, 1]),
     DataFrame(index=MultiIndex.from_tuples(
         [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"]))),
])
def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected):
    # see gh-14066: the thousands separator ("." here) must not be
    # stripped out of columns that are being parsed as dates.
    parser = all_parsers

    result = parser.read_csv(StringIO(data), thousands=".", **kwargs)
    tm.assert_frame_equal(result, expected)
def test_parse_date_time_multi_level_column_name(all_parsers):
    # Date columns addressed by position still merge correctly when the
    # file carries a two-level header.
    data = """\
D,T,A,B
date, time,a,b
2001-01-05, 09:00:00, 0.0, 10.
2001-01-06, 00:00:00, 1.0, 11.
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), header=[0, 1],
                             parse_dates={"date_time": [0, 1]},
                             date_parser=conv.parse_date_time)

    expected_data = [[datetime(2001, 1, 5, 9, 0, 0), 0., 10.],
                     [datetime(2001, 1, 6, 0, 0, 0), 1., 11.]]
    expected = DataFrame(expected_data,
                         columns=["date_time", ("A", "a"), ("B", "b")])
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("data,kwargs,expected", [
    ("""\
date,time,a,b
2001-01-05, 10:00:00, 0.0, 10.
2001-01-05, 00:00:00, 1., 11.
""", dict(header=0, parse_dates={"date_time": [0, 1]}),
     DataFrame([[datetime(2001, 1, 5, 10, 0, 0), 0.0, 10],
                [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0]],
               columns=["date_time", "a", "b"])),
    (("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
      "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
      "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
      "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
      "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
      "KORD,19990127, 23:00:00, 22:56:00, -0.5900"),
     dict(header=None, parse_dates={"actual": [1, 2], "nominal": [1, 3]}),
     DataFrame([
         [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
          "KORD", 0.81],
         [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
          "KORD", 0.01],
         [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
          "KORD", -0.59],
         [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
          "KORD", -0.99],
         [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
          "KORD", -0.59],
         [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
          "KORD", -0.59]], columns=["actual", "nominal", 0, 4])),
])
def test_parse_date_time(all_parsers, data, kwargs, expected):
    # ``conv.parse_date_time`` merges separate date and time columns,
    # both with and without a header row.
    parser = all_parsers
    result = parser.read_csv(StringIO(data), date_parser=conv.parse_date_time,
                             **kwargs)

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)
  607. def test_parse_date_fields(all_parsers):
  608. parser = all_parsers
  609. data = ("year,month,day,a\n2001,01,10,10.\n"
  610. "2001,02,1,11.")
  611. result = parser.read_csv(StringIO(data), header=0,
  612. parse_dates={"ymd": [0, 1, 2]},
  613. date_parser=conv.parse_date_fields)
  614. expected = DataFrame([[datetime(2001, 1, 10), 10.],
  615. [datetime(2001, 2, 1), 11.]], columns=["ymd", "a"])
  616. tm.assert_frame_equal(result, expected)
def test_parse_date_all_fields(all_parsers):
    # year/month/day/hour/minute/second columns combine into a single
    # datetime via ``conv.parse_all_fields``.
    parser = all_parsers
    data = """\
year,month,day,hour,minute,second,a,b
2001,01,05,10,00,0,0.0,10.
2001,01,5,10,0,00,1.,11.
"""
    result = parser.read_csv(StringIO(data), header=0,
                             date_parser=conv.parse_all_fields,
                             parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]})
    expected = DataFrame([[datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0],
                          [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0]],
                         columns=["ymdHMS", "a", "b"])
    tm.assert_frame_equal(result, expected)
  631. def test_datetime_fractional_seconds(all_parsers):
  632. parser = all_parsers
  633. data = """\
  634. year,month,day,hour,minute,second,a,b
  635. 2001,01,05,10,00,0.123456,0.0,10.
  636. 2001,01,5,10,0,0.500000,1.,11.
  637. """
  638. result = parser.read_csv(StringIO(data), header=0,
  639. date_parser=conv.parse_all_fields,
  640. parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]})
  641. expected = DataFrame([[datetime(2001, 1, 5, 10, 0, 0,
  642. microsecond=123456), 0.0, 10.0],
  643. [datetime(2001, 1, 5, 10, 0, 0,
  644. microsecond=500000), 1.0, 11.0]],
  645. columns=["ymdHMS", "a", "b"])
  646. tm.assert_frame_equal(result, expected)
  647. def test_generic(all_parsers):
  648. parser = all_parsers
  649. data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
  650. result = parser.read_csv(StringIO(data), header=0,
  651. parse_dates={"ym": [0, 1]},
  652. date_parser=lambda y, m: date(year=int(y),
  653. month=int(m),
  654. day=1))
  655. expected = DataFrame([[date(2001, 1, 1), 10, 10.],
  656. [date(2001, 2, 1), 1, 11.]],
  657. columns=["ym", "day", "a"])
  658. tm.assert_frame_equal(result, expected)
  659. def test_date_parser_resolution_if_not_ns(all_parsers):
  660. # see gh-10245
  661. parser = all_parsers
  662. data = """\
  663. date,time,prn,rxstatus
  664. 2013-11-03,19:00:00,126,00E80000
  665. 2013-11-03,19:00:00,23,00E80000
  666. 2013-11-03,19:00:00,13,00E80000
  667. """
  668. def date_parser(dt, time):
  669. return np_array_datetime64_compat(dt + "T" + time + "Z",
  670. dtype="datetime64[s]")
  671. result = parser.read_csv(StringIO(data), date_parser=date_parser,
  672. parse_dates={"datetime": ["date", "time"]},
  673. index_col=["datetime", "prn"])
  674. datetimes = np_array_datetime64_compat(["2013-11-03T19:00:00Z"] * 3,
  675. dtype="datetime64[s]")
  676. expected = DataFrame(data={"rxstatus": ["00E80000"] * 3},
  677. index=MultiIndex.from_tuples(
  678. [(datetimes[0], 126), (datetimes[1], 23),
  679. (datetimes[2], 13)], names=["datetime", "prn"]))
  680. tm.assert_frame_equal(result, expected)
  681. def test_parse_date_column_with_empty_string(all_parsers):
  682. # see gh-6428
  683. parser = all_parsers
  684. data = "case,opdate\n7,10/18/2006\n7,10/18/2008\n621, "
  685. result = parser.read_csv(StringIO(data), parse_dates=["opdate"])
  686. expected_data = [[7, "10/18/2006"],
  687. [7, "10/18/2008"],
  688. [621, " "]]
  689. expected = DataFrame(expected_data, columns=["case", "opdate"])
  690. tm.assert_frame_equal(result, expected)
  691. @pytest.mark.parametrize("data,expected", [
  692. ("a\n135217135789158401\n1352171357E+5",
  693. DataFrame({"a": [135217135789158401,
  694. 135217135700000]}, dtype="float64")),
  695. ("a\n99999999999\n123456789012345\n1234E+0",
  696. DataFrame({"a": [99999999999,
  697. 123456789012345,
  698. 1234]}, dtype="float64"))
  699. ])
  700. @pytest.mark.parametrize("parse_dates", [True, False])
  701. def test_parse_date_float(all_parsers, data, expected, parse_dates):
  702. # see gh-2697
  703. #
  704. # Date parsing should fail, so we leave the data untouched
  705. # (i.e. float precision should remain unchanged).
  706. parser = all_parsers
  707. result = parser.read_csv(StringIO(data), parse_dates=parse_dates)
  708. tm.assert_frame_equal(result, expected)
  709. def test_parse_timezone(all_parsers):
  710. # see gh-22256
  711. parser = all_parsers
  712. data = """dt,val
  713. 2018-01-04 09:01:00+09:00,23350
  714. 2018-01-04 09:02:00+09:00,23400
  715. 2018-01-04 09:03:00+09:00,23400
  716. 2018-01-04 09:04:00+09:00,23400
  717. 2018-01-04 09:05:00+09:00,23400"""
  718. result = parser.read_csv(StringIO(data), parse_dates=["dt"])
  719. dti = pd.date_range(start="2018-01-04 09:01:00",
  720. end="2018-01-04 09:05:00", freq="1min",
  721. tz=pytz.FixedOffset(540))
  722. expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]}
  723. expected = DataFrame(expected_data)
  724. tm.assert_frame_equal(result, expected)