# test_resample_api.py
  1. # pylint: disable=E1101
  2. from datetime import datetime
  3. import numpy as np
  4. import pytest
  5. from pandas.compat import OrderedDict, range
  6. import pandas as pd
  7. from pandas import DataFrame, Series
  8. from pandas.core.indexes.datetimes import date_range
  9. import pandas.util.testing as tm
  10. from pandas.util.testing import assert_frame_equal, assert_series_equal
  11. dti = date_range(start=datetime(2005, 1, 1),
  12. end=datetime(2005, 1, 10), freq='Min')
  13. test_series = Series(np.random.rand(len(dti)), dti)
  14. test_frame = DataFrame(
  15. {'A': test_series, 'B': test_series, 'C': np.arange(len(dti))})
  16. def test_str():
  17. r = test_series.resample('H')
  18. assert ('DatetimeIndexResampler [freq=<Hour>, axis=0, closed=left, '
  19. 'label=left, convention=start, base=0]' in str(r))
  20. def test_api():
  21. r = test_series.resample('H')
  22. result = r.mean()
  23. assert isinstance(result, Series)
  24. assert len(result) == 217
  25. r = test_series.to_frame().resample('H')
  26. result = r.mean()
  27. assert isinstance(result, DataFrame)
  28. assert len(result) == 217
  29. def test_groupby_resample_api():
  30. # GH 12448
  31. # .groupby(...).resample(...) hitting warnings
  32. # when appropriate
  33. df = DataFrame({'date': pd.date_range(start='2016-01-01',
  34. periods=4,
  35. freq='W'),
  36. 'group': [1, 1, 2, 2],
  37. 'val': [5, 6, 7, 8]}).set_index('date')
  38. # replication step
  39. i = pd.date_range('2016-01-03', periods=8).tolist() + \
  40. pd.date_range('2016-01-17', periods=8).tolist()
  41. index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i],
  42. names=['group', 'date'])
  43. expected = DataFrame({'val': [5] * 7 + [6] + [7] * 7 + [8]},
  44. index=index)
  45. result = df.groupby('group').apply(
  46. lambda x: x.resample('1D').ffill())[['val']]
  47. assert_frame_equal(result, expected)
  48. def test_groupby_resample_on_api():
  49. # GH 15021
  50. # .groupby(...).resample(on=...) results in an unexpected
  51. # keyword warning.
  52. df = DataFrame({'key': ['A', 'B'] * 5,
  53. 'dates': pd.date_range('2016-01-01', periods=10),
  54. 'values': np.random.randn(10)})
  55. expected = df.set_index('dates').groupby('key').resample('D').mean()
  56. result = df.groupby('key').resample('D', on='dates').mean()
  57. assert_frame_equal(result, expected)
  58. def test_pipe():
  59. # GH17905
  60. # series
  61. r = test_series.resample('H')
  62. expected = r.max() - r.mean()
  63. result = r.pipe(lambda x: x.max() - x.mean())
  64. tm.assert_series_equal(result, expected)
  65. # dataframe
  66. r = test_frame.resample('H')
  67. expected = r.max() - r.mean()
  68. result = r.pipe(lambda x: x.max() - x.mean())
  69. tm.assert_frame_equal(result, expected)
  70. def test_getitem():
  71. r = test_frame.resample('H')
  72. tm.assert_index_equal(r._selected_obj.columns, test_frame.columns)
  73. r = test_frame.resample('H')['B']
  74. assert r._selected_obj.name == test_frame.columns[1]
  75. # technically this is allowed
  76. r = test_frame.resample('H')['A', 'B']
  77. tm.assert_index_equal(r._selected_obj.columns,
  78. test_frame.columns[[0, 1]])
  79. r = test_frame.resample('H')['A', 'B']
  80. tm.assert_index_equal(r._selected_obj.columns,
  81. test_frame.columns[[0, 1]])
  82. def test_select_bad_cols():
  83. g = test_frame.resample('H')
  84. pytest.raises(KeyError, g.__getitem__, ['D'])
  85. pytest.raises(KeyError, g.__getitem__, ['A', 'D'])
  86. with pytest.raises(KeyError, match='^[^A]+$'):
  87. # A should not be referenced as a bad column...
  88. # will have to rethink regex if you change message!
  89. g[['A', 'D']]
  90. def test_attribute_access():
  91. r = test_frame.resample('H')
  92. tm.assert_series_equal(r.A.sum(), r['A'].sum())
  93. def test_api_compat_before_use():
  94. # make sure that we are setting the binner
  95. # on these attributes
  96. for attr in ['groups', 'ngroups', 'indices']:
  97. rng = pd.date_range('1/1/2012', periods=100, freq='S')
  98. ts = Series(np.arange(len(rng)), index=rng)
  99. rs = ts.resample('30s')
  100. # before use
  101. getattr(rs, attr)
  102. # after grouper is initialized is ok
  103. rs.mean()
  104. getattr(rs, attr)
  105. def tests_skip_nuisance():
  106. df = test_frame
  107. df['D'] = 'foo'
  108. r = df.resample('H')
  109. result = r[['A', 'B']].sum()
  110. expected = pd.concat([r.A.sum(), r.B.sum()], axis=1)
  111. assert_frame_equal(result, expected)
  112. expected = r[['A', 'B', 'C']].sum()
  113. result = r.sum()
  114. assert_frame_equal(result, expected)
  115. def test_downsample_but_actually_upsampling():
  116. # this is reindex / asfreq
  117. rng = pd.date_range('1/1/2012', periods=100, freq='S')
  118. ts = Series(np.arange(len(rng), dtype='int64'), index=rng)
  119. result = ts.resample('20s').asfreq()
  120. expected = Series([0, 20, 40, 60, 80],
  121. index=pd.date_range('2012-01-01 00:00:00',
  122. freq='20s',
  123. periods=5))
  124. assert_series_equal(result, expected)
  125. def test_combined_up_downsampling_of_irregular():
  126. # since we are reallydoing an operation like this
  127. # ts2.resample('2s').mean().ffill()
  128. # preserve these semantics
  129. rng = pd.date_range('1/1/2012', periods=100, freq='S')
  130. ts = Series(np.arange(len(rng)), index=rng)
  131. ts2 = ts.iloc[[0, 1, 2, 3, 5, 7, 11, 15, 16, 25, 30]]
  132. with tm.assert_produces_warning(FutureWarning,
  133. check_stacklevel=False):
  134. result = ts2.resample('2s', how='mean', fill_method='ffill')
  135. expected = ts2.resample('2s').mean().ffill()
  136. assert_series_equal(result, expected)
  137. def test_transform():
  138. r = test_series.resample('20min')
  139. expected = test_series.groupby(
  140. pd.Grouper(freq='20min')).transform('mean')
  141. result = r.transform('mean')
  142. assert_series_equal(result, expected)
  143. def test_fillna():
  144. # need to upsample here
  145. rng = pd.date_range('1/1/2012', periods=10, freq='2S')
  146. ts = Series(np.arange(len(rng), dtype='int64'), index=rng)
  147. r = ts.resample('s')
  148. expected = r.ffill()
  149. result = r.fillna(method='ffill')
  150. assert_series_equal(result, expected)
  151. expected = r.bfill()
  152. result = r.fillna(method='bfill')
  153. assert_series_equal(result, expected)
  154. with pytest.raises(ValueError):
  155. r.fillna(0)
  156. def test_apply_without_aggregation():
  157. # both resample and groupby should work w/o aggregation
  158. r = test_series.resample('20min')
  159. g = test_series.groupby(pd.Grouper(freq='20min'))
  160. for t in [g, r]:
  161. result = t.apply(lambda x: x)
  162. assert_series_equal(result, test_series)
  163. def test_agg_consistency():
  164. # make sure that we are consistent across
  165. # similar aggregations with and w/o selection list
  166. df = DataFrame(np.random.randn(1000, 3),
  167. index=pd.date_range('1/1/2012', freq='S', periods=1000),
  168. columns=['A', 'B', 'C'])
  169. r = df.resample('3T')
  170. with tm.assert_produces_warning(FutureWarning,
  171. check_stacklevel=False):
  172. expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'})
  173. result = r.agg({'r1': 'mean', 'r2': 'sum'})
  174. assert_frame_equal(result, expected)
  175. # TODO: once GH 14008 is fixed, move these tests into
  176. # `Base` test class
  177. def test_agg():
  178. # test with all three Resampler apis and TimeGrouper
  179. np.random.seed(1234)
  180. index = date_range(datetime(2005, 1, 1),
  181. datetime(2005, 1, 10), freq='D')
  182. index.name = 'date'
  183. df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
  184. df_col = df.reset_index()
  185. df_mult = df_col.copy()
  186. df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
  187. names=['index', 'date'])
  188. r = df.resample('2D')
  189. cases = [
  190. r,
  191. df_col.resample('2D', on='date'),
  192. df_mult.resample('2D', level='date'),
  193. df.groupby(pd.Grouper(freq='2D'))
  194. ]
  195. a_mean = r['A'].mean()
  196. a_std = r['A'].std()
  197. a_sum = r['A'].sum()
  198. b_mean = r['B'].mean()
  199. b_std = r['B'].std()
  200. b_sum = r['B'].sum()
  201. expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
  202. expected.columns = pd.MultiIndex.from_product([['A', 'B'],
  203. ['mean', 'std']])
  204. for t in cases:
  205. result = t.aggregate([np.mean, np.std])
  206. assert_frame_equal(result, expected)
  207. expected = pd.concat([a_mean, b_std], axis=1)
  208. for t in cases:
  209. result = t.aggregate({'A': np.mean,
  210. 'B': np.std})
  211. assert_frame_equal(result, expected, check_like=True)
  212. expected = pd.concat([a_mean, a_std], axis=1)
  213. expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
  214. ('A', 'std')])
  215. for t in cases:
  216. result = t.aggregate({'A': ['mean', 'std']})
  217. assert_frame_equal(result, expected)
  218. expected = pd.concat([a_mean, a_sum], axis=1)
  219. expected.columns = ['mean', 'sum']
  220. for t in cases:
  221. result = t['A'].aggregate(['mean', 'sum'])
  222. assert_frame_equal(result, expected)
  223. expected = pd.concat([a_mean, a_sum], axis=1)
  224. expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
  225. ('A', 'sum')])
  226. for t in cases:
  227. with tm.assert_produces_warning(FutureWarning,
  228. check_stacklevel=False):
  229. result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}})
  230. assert_frame_equal(result, expected, check_like=True)
  231. expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
  232. expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
  233. ('A', 'sum'),
  234. ('B', 'mean2'),
  235. ('B', 'sum2')])
  236. for t in cases:
  237. with tm.assert_produces_warning(FutureWarning,
  238. check_stacklevel=False):
  239. result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'},
  240. 'B': {'mean2': 'mean', 'sum2': 'sum'}})
  241. assert_frame_equal(result, expected, check_like=True)
  242. expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
  243. expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
  244. ('A', 'std'),
  245. ('B', 'mean'),
  246. ('B', 'std')])
  247. for t in cases:
  248. result = t.aggregate({'A': ['mean', 'std'],
  249. 'B': ['mean', 'std']})
  250. assert_frame_equal(result, expected, check_like=True)
  251. expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
  252. expected.columns = pd.MultiIndex.from_tuples([('r1', 'A', 'mean'),
  253. ('r1', 'A', 'sum'),
  254. ('r2', 'B', 'mean'),
  255. ('r2', 'B', 'sum')])
  256. def test_agg_misc():
  257. # test with all three Resampler apis and TimeGrouper
  258. np.random.seed(1234)
  259. index = date_range(datetime(2005, 1, 1),
  260. datetime(2005, 1, 10), freq='D')
  261. index.name = 'date'
  262. df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
  263. df_col = df.reset_index()
  264. df_mult = df_col.copy()
  265. df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
  266. names=['index', 'date'])
  267. r = df.resample('2D')
  268. cases = [
  269. r,
  270. df_col.resample('2D', on='date'),
  271. df_mult.resample('2D', level='date'),
  272. df.groupby(pd.Grouper(freq='2D'))
  273. ]
  274. # passed lambda
  275. for t in cases:
  276. result = t.agg({'A': np.sum,
  277. 'B': lambda x: np.std(x, ddof=1)})
  278. rcustom = t['B'].apply(lambda x: np.std(x, ddof=1))
  279. expected = pd.concat([r['A'].sum(), rcustom], axis=1)
  280. assert_frame_equal(result, expected, check_like=True)
  281. # agg with renamers
  282. expected = pd.concat([t['A'].sum(),
  283. t['B'].sum(),
  284. t['A'].mean(),
  285. t['B'].mean()],
  286. axis=1)
  287. expected.columns = pd.MultiIndex.from_tuples([('result1', 'A'),
  288. ('result1', 'B'),
  289. ('result2', 'A'),
  290. ('result2', 'B')])
  291. for t in cases:
  292. with tm.assert_produces_warning(FutureWarning,
  293. check_stacklevel=False):
  294. result = t[['A', 'B']].agg(OrderedDict([('result1', np.sum),
  295. ('result2', np.mean)]))
  296. assert_frame_equal(result, expected, check_like=True)
  297. # agg with different hows
  298. expected = pd.concat([t['A'].sum(),
  299. t['A'].std(),
  300. t['B'].mean(),
  301. t['B'].std()],
  302. axis=1)
  303. expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
  304. ('A', 'std'),
  305. ('B', 'mean'),
  306. ('B', 'std')])
  307. for t in cases:
  308. result = t.agg(OrderedDict([('A', ['sum', 'std']),
  309. ('B', ['mean', 'std'])]))
  310. assert_frame_equal(result, expected, check_like=True)
  311. # equivalent of using a selection list / or not
  312. for t in cases:
  313. result = t[['A', 'B']].agg({'A': ['sum', 'std'],
  314. 'B': ['mean', 'std']})
  315. assert_frame_equal(result, expected, check_like=True)
  316. # series like aggs
  317. for t in cases:
  318. with tm.assert_produces_warning(FutureWarning,
  319. check_stacklevel=False):
  320. result = t['A'].agg({'A': ['sum', 'std']})
  321. expected = pd.concat([t['A'].sum(),
  322. t['A'].std()],
  323. axis=1)
  324. expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
  325. ('A', 'std')])
  326. assert_frame_equal(result, expected, check_like=True)
  327. expected = pd.concat([t['A'].agg(['sum', 'std']),
  328. t['A'].agg(['mean', 'std'])],
  329. axis=1)
  330. expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
  331. ('A', 'std'),
  332. ('B', 'mean'),
  333. ('B', 'std')])
  334. with tm.assert_produces_warning(FutureWarning,
  335. check_stacklevel=False):
  336. result = t['A'].agg({'A': ['sum', 'std'],
  337. 'B': ['mean', 'std']})
  338. assert_frame_equal(result, expected, check_like=True)
  339. # errors
  340. # invalid names in the agg specification
  341. for t in cases:
  342. with pytest.raises(KeyError):
  343. with tm.assert_produces_warning(FutureWarning,
  344. check_stacklevel=False):
  345. t[['A']].agg({'A': ['sum', 'std'],
  346. 'B': ['mean', 'std']})
  347. def test_agg_nested_dicts():
  348. np.random.seed(1234)
  349. index = date_range(datetime(2005, 1, 1),
  350. datetime(2005, 1, 10), freq='D')
  351. index.name = 'date'
  352. df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
  353. df_col = df.reset_index()
  354. df_mult = df_col.copy()
  355. df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
  356. names=['index', 'date'])
  357. r = df.resample('2D')
  358. cases = [
  359. r,
  360. df_col.resample('2D', on='date'),
  361. df_mult.resample('2D', level='date'),
  362. df.groupby(pd.Grouper(freq='2D'))
  363. ]
  364. for t in cases:
  365. def f():
  366. t.aggregate({'r1': {'A': ['mean', 'sum']},
  367. 'r2': {'B': ['mean', 'sum']}})
  368. pytest.raises(ValueError, f)
  369. for t in cases:
  370. expected = pd.concat([t['A'].mean(), t['A'].std(), t['B'].mean(),
  371. t['B'].std()], axis=1)
  372. expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), (
  373. 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')])
  374. with tm.assert_produces_warning(FutureWarning,
  375. check_stacklevel=False):
  376. result = t[['A', 'B']].agg({'A': {'ra': ['mean', 'std']},
  377. 'B': {'rb': ['mean', 'std']}})
  378. assert_frame_equal(result, expected, check_like=True)
  379. with tm.assert_produces_warning(FutureWarning,
  380. check_stacklevel=False):
  381. result = t.agg({'A': {'ra': ['mean', 'std']},
  382. 'B': {'rb': ['mean', 'std']}})
  383. assert_frame_equal(result, expected, check_like=True)
  384. def test_try_aggregate_non_existing_column():
  385. # GH 16766
  386. data = [
  387. {'dt': datetime(2017, 6, 1, 0), 'x': 1.0, 'y': 2.0},
  388. {'dt': datetime(2017, 6, 1, 1), 'x': 2.0, 'y': 2.0},
  389. {'dt': datetime(2017, 6, 1, 2), 'x': 3.0, 'y': 1.5}
  390. ]
  391. df = DataFrame(data).set_index('dt')
  392. # Error as we don't have 'z' column
  393. with pytest.raises(KeyError):
  394. df.resample('30T').agg({'x': ['mean'],
  395. 'y': ['median'],
  396. 'z': ['sum']})
  397. def test_selection_api_validation():
  398. # GH 13500
  399. index = date_range(datetime(2005, 1, 1),
  400. datetime(2005, 1, 10), freq='D')
  401. rng = np.arange(len(index), dtype=np.int64)
  402. df = DataFrame({'date': index, 'a': rng},
  403. index=pd.MultiIndex.from_arrays([rng, index],
  404. names=['v', 'd']))
  405. df_exp = DataFrame({'a': rng}, index=index)
  406. # non DatetimeIndex
  407. with pytest.raises(TypeError):
  408. df.resample('2D', level='v')
  409. with pytest.raises(ValueError):
  410. df.resample('2D', on='date', level='d')
  411. with pytest.raises(TypeError):
  412. df.resample('2D', on=['a', 'date'])
  413. with pytest.raises(KeyError):
  414. df.resample('2D', level=['a', 'date'])
  415. # upsampling not allowed
  416. with pytest.raises(ValueError):
  417. df.resample('2D', level='d').asfreq()
  418. with pytest.raises(ValueError):
  419. df.resample('2D', on='date').asfreq()
  420. exp = df_exp.resample('2D').sum()
  421. exp.index.name = 'date'
  422. assert_frame_equal(exp, df.resample('2D', on='date').sum())
  423. exp.index.name = 'd'
  424. assert_frame_equal(exp, df.resample('2D', level='d').sum())