test_time_grouper.py

from datetime import datetime
from operator import methodcaller

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, Panel, Series
from pandas.core.indexes.datetimes import date_range
from pandas.core.resample import TimeGrouper
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal

test_series = Series(np.random.randn(1000),
                     index=date_range('1/1/2000', periods=1000))


def test_apply():
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        grouper = pd.TimeGrouper(freq='A', label='right', closed='right')

    grouped = test_series.groupby(grouper)

    def f(x):
        return x.sort_values()[-3:]

    applied = grouped.apply(f)
    expected = test_series.groupby(lambda x: x.year).apply(f)

    applied.index = applied.index.droplevel(0)
    expected.index = expected.index.droplevel(0)

    assert_series_equal(applied, expected)


def test_count():
    test_series[::3] = np.nan

    expected = test_series.groupby(lambda x: x.year).count()

    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        grouper = pd.TimeGrouper(freq='A', label='right', closed='right')
    result = test_series.groupby(grouper).count()
    expected.index = result.index
    assert_series_equal(result, expected)

    result = test_series.resample('A').count()
    expected.index = result.index
    assert_series_equal(result, expected)


def test_numpy_reduction():
    result = test_series.resample('A', closed='right').prod()

    expected = test_series.groupby(lambda x: x.year).agg(np.prod)
    expected.index = result.index

    assert_series_equal(result, expected)


def test_apply_iteration():
    # #2300
    N = 1000
    ind = pd.date_range(start="2000-01-01", freq="D", periods=N)
    df = DataFrame({'open': 1, 'close': 2}, index=ind)
    tg = TimeGrouper('M')

    _, grouper, _ = tg._get_grouper(df)

    # Errors
    grouped = df.groupby(grouper, group_keys=False)

    def f(df):
        return df['close'] / df['open']

    # it works!
    result = grouped.apply(f)
    tm.assert_index_equal(result.index, df.index)


@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
def test_panel_aggregation():
    ind = pd.date_range('1/1/2000', periods=100)
    data = np.random.randn(2, len(ind), 4)

    wp = Panel(data, items=['Item1', 'Item2'], major_axis=ind,
               minor_axis=['A', 'B', 'C', 'D'])

    tg = TimeGrouper('M', axis=1)
    _, grouper, _ = tg._get_grouper(wp)
    bingrouped = wp.groupby(grouper)
    binagg = bingrouped.mean()

    def f(x):
        assert isinstance(x, Panel)
        return x.mean(1)

    result = bingrouped.agg(f)
    tm.assert_panel_equal(result, binagg)


@pytest.mark.parametrize('name, func', [
    ('Int64Index', tm.makeIntIndex),
    ('Index', tm.makeUnicodeIndex),
    ('Float64Index', tm.makeFloatIndex),
    ('MultiIndex', lambda m: tm.makeCustomIndex(m, 2))
])
def test_fails_on_no_datetime_index(name, func):
    n = 2
    index = func(n)
    df = DataFrame({'a': np.random.randn(n)}, index=index)

    msg = ("Only valid with DatetimeIndex, TimedeltaIndex "
           "or PeriodIndex, but got an instance of %r" % name)
    with pytest.raises(TypeError, match=msg):
        df.groupby(TimeGrouper('D'))


def test_aaa_group_order():
    # GH 12840
    # check that TimeGrouper performs stable sorts
    n = 20
    data = np.random.randn(n, 4)
    df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2),
                 datetime(2013, 1, 3), datetime(2013, 1, 4),
                 datetime(2013, 1, 5)] * 4
    grouped = df.groupby(TimeGrouper(key='key', freq='D'))

    tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 1)),
                          df[::5])
    tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 2)),
                          df[1::5])
    tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 3)),
                          df[2::5])
    tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 4)),
                          df[3::5])
    tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 5)),
                          df[4::5])


def test_aggregate_normal(resample_method):
    """Check that TimeGrouper aggregation is identical to a normal groupby."""
    if resample_method == 'ohlc':
        pytest.xfail(reason='DataError: No numeric types to aggregate')

    data = np.random.randn(20, 4)
    normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    normal_df['key'] = [1, 2, 3, 4, 5] * 4

    dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2),
                    datetime(2013, 1, 3), datetime(2013, 1, 4),
                    datetime(2013, 1, 5)] * 4

    normal_grouped = normal_df.groupby('key')
    dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))

    expected = getattr(normal_grouped, resample_method)()
    dt_result = getattr(dt_grouped, resample_method)()
    expected.index = date_range(start='2013-01-01', freq='D',
                                periods=5, name='key')
    tm.assert_equal(expected, dt_result)

    # when a TimeGrouper is used, 'nth' doesn't work yet
    """
    for func in ['nth']:
        expected = getattr(normal_grouped, func)(3)
        expected.index = date_range(start='2013-01-01',
                                    freq='D', periods=5, name='key')
        dt_result = getattr(dt_grouped, func)(3)
        assert_frame_equal(expected, dt_result)
    """


@pytest.mark.parametrize('method, method_args, unit', [
    ('sum', dict(), 0),
    ('sum', dict(min_count=0), 0),
    ('sum', dict(min_count=1), np.nan),
    ('prod', dict(), 1),
    ('prod', dict(min_count=0), 1),
    ('prod', dict(min_count=1), np.nan)
])
def test_resample_entirely_nat_window(method, method_args, unit):
    s = pd.Series([0] * 2 + [np.nan] * 2,
                  index=pd.date_range('2017', periods=4))
    result = methodcaller(method, **method_args)(s.resample("2d"))
    expected = pd.Series([0.0, unit],
                         index=pd.to_datetime(['2017-01-01',
                                               '2017-01-03']))
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize('func, fill_value', [
    ('min', np.nan),
    ('max', np.nan),
    ('sum', 0),
    ('prod', 1),
    ('count', 0),
])
def test_aggregate_with_nat(func, fill_value):
    # check that TimeGrouper aggregation is identical to a normal groupby;
    # when NaT is included, 'var', 'std', 'mean', 'first', 'last'
    # and 'nth' don't work yet
    n = 20
    data = np.random.randn(n, 4).astype('int64')
    normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    normal_df['key'] = [1, 2, np.nan, 4, 5] * 4

    dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT,
                    datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4

    normal_grouped = normal_df.groupby('key')
    dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))

    normal_result = getattr(normal_grouped, func)()
    dt_result = getattr(dt_grouped, func)()

    pad = DataFrame([[fill_value] * 4], index=[3],
                    columns=['A', 'B', 'C', 'D'])
    expected = normal_result.append(pad)
    expected = expected.sort_index()
    expected.index = date_range(start='2013-01-01', freq='D',
                                periods=5, name='key')
    assert_frame_equal(expected, dt_result)
    assert dt_result.index.name == 'key'


def test_aggregate_with_nat_size():
    # GH 9925
    n = 20
    data = np.random.randn(n, 4).astype('int64')
    normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    normal_df['key'] = [1, 2, np.nan, 4, 5] * 4

    dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT,
                    datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4

    normal_grouped = normal_df.groupby('key')
    dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))

    normal_result = normal_grouped.size()
    dt_result = dt_grouped.size()

    pad = Series([0], index=[3])
    expected = normal_result.append(pad)
    expected = expected.sort_index()
    expected.index = date_range(start='2013-01-01', freq='D',
                                periods=5, name='key')
    assert_series_equal(expected, dt_result)
    assert dt_result.index.name == 'key'


def test_repr():
    # GH18203
    result = repr(TimeGrouper(key='A', freq='H'))
    expected = ("TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, "
                "closed='left', label='left', how='mean', "
                "convention='e', base=0)")
    assert result == expected


@pytest.mark.parametrize('method, method_args, expected_values', [
    ('sum', dict(), [1, 0, 1]),
    ('sum', dict(min_count=0), [1, 0, 1]),
    ('sum', dict(min_count=1), [1, np.nan, 1]),
    ('sum', dict(min_count=2), [np.nan, np.nan, np.nan]),
    ('prod', dict(), [1, 1, 1]),
    ('prod', dict(min_count=0), [1, 1, 1]),
    ('prod', dict(min_count=1), [1, np.nan, 1]),
    ('prod', dict(min_count=2), [np.nan, np.nan, np.nan]),
])
def test_upsample_sum(method, method_args, expected_values):
    s = pd.Series(1, index=pd.date_range("2017", periods=2, freq="H"))
    resampled = s.resample("30T")
    index = pd.to_datetime(['2017-01-01T00:00:00',
                            '2017-01-01T00:30:00',
                            '2017-01-01T01:00:00'])
    result = methodcaller(method, **method_args)(resampled)
    expected = pd.Series(expected_values, index=index)
    tm.assert_series_equal(result, expected)