- # pylint: disable=E1101
- from datetime import datetime
- import numpy as np
- import pytest
- from pandas.compat import OrderedDict, range
- import pandas as pd
- from pandas import DataFrame, Series
- from pandas.core.indexes.datetimes import date_range
- import pandas.util.testing as tm
- from pandas.util.testing import assert_frame_equal, assert_series_equal
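- # Shared fixtures: a minute-frequency index spanning 2005-01-01 through
- # 2005-01-10, plus a Series and DataFrame built on it; the tests below
- # resample these objects.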
- dti = date_range(start=datetime(2005, 1, 1),
- end=datetime(2005, 1, 10), freq='Min')
- test_series = Series(np.random.rand(len(dti)), dti)
- test_frame = DataFrame(
- {'A': test_series, 'B': test_series, 'C': np.arange(len(dti))})
- def test_str():
- r = test_series.resample('H')
- assert ('DatetimeIndexResampler [freq=<Hour>, axis=0, closed=left, '
- 'label=left, convention=start, base=0]' in str(r))
- def test_api():
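- # resampling the minute-frequency fixture to hourly yields one bin per hour
- # from 2005-01-01 00:00 through 2005-01-10 00:00, i.e. 9 * 24 + 1 = 217 bins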
- r = test_series.resample('H')
- result = r.mean()
- assert isinstance(result, Series)
- assert len(result) == 217
- r = test_series.to_frame().resample('H')
- result = r.mean()
- assert isinstance(result, DataFrame)
- assert len(result) == 217
- def test_groupby_resample_api():
- # GH 12448
- # .groupby(...).resample(...) hitting warnings
- # when appropriate
- df = DataFrame({'date': pd.date_range(start='2016-01-01',
- periods=4,
- freq='W'),
- 'group': [1, 1, 2, 2],
- 'val': [5, 6, 7, 8]}).set_index('date')
- # replication step
- i = pd.date_range('2016-01-03', periods=8).tolist() + \
- pd.date_range('2016-01-17', periods=8).tolist()
- index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i],
- names=['group', 'date'])
- expected = DataFrame({'val': [5] * 7 + [6] + [7] * 7 + [8]},
- index=index)
- result = df.groupby('group').apply(
- lambda x: x.resample('1D').ffill())[['val']]
- assert_frame_equal(result, expected)
- def test_groupby_resample_on_api():
- # GH 15021
- # .groupby(...).resample(on=...) results in an unexpected
- # keyword warning.
- df = DataFrame({'key': ['A', 'B'] * 5,
- 'dates': pd.date_range('2016-01-01', periods=10),
- 'values': np.random.randn(10)})
- expected = df.set_index('dates').groupby('key').resample('D').mean()
- result = df.groupby('key').resample('D', on='dates').mean()
- assert_frame_equal(result, expected)
- def test_pipe():
- # GH17905
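- # .pipe hands the Resampler itself to the callable, so the lambda must give
- # the same answer as computing r.max() - r.mean() directly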
- # series
- r = test_series.resample('H')
- expected = r.max() - r.mean()
- result = r.pipe(lambda x: x.max() - x.mean())
- tm.assert_series_equal(result, expected)
- # dataframe
- r = test_frame.resample('H')
- expected = r.max() - r.mean()
- result = r.pipe(lambda x: x.max() - x.mean())
- tm.assert_frame_equal(result, expected)
- def test_getitem():
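- # column selection on a Resampler should narrow _selected_obj to the
- # requested columns, both for a single label and for a tuple of labels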
- r = test_frame.resample('H')
- tm.assert_index_equal(r._selected_obj.columns, test_frame.columns)
- r = test_frame.resample('H')['B']
- assert r._selected_obj.name == test_frame.columns[1]
- # technically this is allowed
- r = test_frame.resample('H')['A', 'B']
- tm.assert_index_equal(r._selected_obj.columns,
- test_frame.columns[[0, 1]])
- def test_select_bad_cols():
- g = test_frame.resample('H')
- pytest.raises(KeyError, g.__getitem__, ['D'])
- pytest.raises(KeyError, g.__getitem__, ['A', 'D'])
- with pytest.raises(KeyError, match='^[^A]+$'):
- # 'A' should not be referenced as a bad column...
- # the regex will need rethinking if the error message changes!
- g[['A', 'D']]
- def test_attribute_access():
- r = test_frame.resample('H')
- tm.assert_series_equal(r.A.sum(), r['A'].sum())
- def test_api_compat_before_use():
- # make sure that we are setting the binner
- # on these attributes
- for attr in ['groups', 'ngroups', 'indices']:
- rng = pd.date_range('1/1/2012', periods=100, freq='S')
- ts = Series(np.arange(len(rng)), index=rng)
- rs = ts.resample('30s')
- # before use
- getattr(rs, attr)
- # after the grouper is initialized it is ok
- rs.mean()
- getattr(rs, attr)
- def test_skip_nuisance():
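- # the object-dtype column 'D' is a nuisance column: summing the whole frame
- # should silently drop it and match summing only ['A', 'B', 'C']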
- df = test_frame
- df['D'] = 'foo'
- r = df.resample('H')
- result = r[['A', 'B']].sum()
- expected = pd.concat([r.A.sum(), r.B.sum()], axis=1)
- assert_frame_equal(result, expected)
- expected = r[['A', 'B', 'C']].sum()
- result = r.sum()
- assert_frame_equal(result, expected)
- def test_downsample_but_actually_upsampling():
- # this is reindex / asfreq
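- # .asfreq() on the coarser 20s grid simply picks the existing values at the
- # new bin edges (0, 20, 40, 60, 80), with no aggregation involved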
- rng = pd.date_range('1/1/2012', periods=100, freq='S')
- ts = Series(np.arange(len(rng), dtype='int64'), index=rng)
- result = ts.resample('20s').asfreq()
- expected = Series([0, 20, 40, 60, 80],
- index=pd.date_range('2012-01-01 00:00:00',
- freq='20s',
- periods=5))
- assert_series_equal(result, expected)
- def test_combined_up_downsampling_of_irregular():
- # since we are really doing an operation like this
- # ts2.resample('2s').mean().ffill()
- # preserve these semantics
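- # the deprecated how=/fill_method= keywords should still work (under a
- # FutureWarning) and match the equivalent method-chained spelling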
- rng = pd.date_range('1/1/2012', periods=100, freq='S')
- ts = Series(np.arange(len(rng)), index=rng)
- ts2 = ts.iloc[[0, 1, 2, 3, 5, 7, 11, 15, 16, 25, 30]]
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- result = ts2.resample('2s', how='mean', fill_method='ffill')
- expected = ts2.resample('2s').mean().ffill()
- assert_series_equal(result, expected)
- def test_transform():
- r = test_series.resample('20min')
- expected = test_series.groupby(
- pd.Grouper(freq='20min')).transform('mean')
- result = r.transform('mean')
- assert_series_equal(result, expected)
- def test_fillna():
- # need to upsample here
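- # Resampler.fillna(method=...) should behave like the corresponding
- # ffill()/bfill() call, and a scalar fill value is rejected with ValueError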
- rng = pd.date_range('1/1/2012', periods=10, freq='2S')
- ts = Series(np.arange(len(rng), dtype='int64'), index=rng)
- r = ts.resample('s')
- expected = r.ffill()
- result = r.fillna(method='ffill')
- assert_series_equal(result, expected)
- expected = r.bfill()
- result = r.fillna(method='bfill')
- assert_series_equal(result, expected)
- with pytest.raises(ValueError):
- r.fillna(0)
- def test_apply_without_aggregation():
- # both resample and groupby should work w/o aggregation
- r = test_series.resample('20min')
- g = test_series.groupby(pd.Grouper(freq='20min'))
- for t in [g, r]:
- result = t.apply(lambda x: x)
- assert_series_equal(result, test_series)
- def test_agg_consistency():
- # make sure that we are consistent across
- # similar aggregations with and w/o selection list
- df = DataFrame(np.random.randn(1000, 3),
- index=pd.date_range('1/1/2012', freq='S', periods=1000),
- columns=['A', 'B', 'C'])
- r = df.resample('3T')
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'})
- result = r.agg({'r1': 'mean', 'r2': 'sum'})
- assert_frame_equal(result, expected)
- # TODO: once GH 14008 is fixed, move these tests into
- # `Base` test class
- def test_agg():
- # test with all three Resampler apis and TimeGrouper
- np.random.seed(1234)
- index = date_range(datetime(2005, 1, 1),
- datetime(2005, 1, 10), freq='D')
- index.name = 'date'
- df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
- df_col = df.reset_index()
- df_mult = df_col.copy()
- df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
- names=['index', 'date'])
- r = df.resample('2D')
- cases = [
- r,
- df_col.resample('2D', on='date'),
- df_mult.resample('2D', level='date'),
- df.groupby(pd.Grouper(freq='2D'))
- ]
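- # each aggregation spec below should give the same result for all four
- # equivalent groupings: plain resample, resample(on=...), resample(level=...)
- # and groupby(pd.Grouper(freq=...))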
- a_mean = r['A'].mean()
- a_std = r['A'].std()
- a_sum = r['A'].sum()
- b_mean = r['B'].mean()
- b_std = r['B'].std()
- b_sum = r['B'].sum()
- expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
- expected.columns = pd.MultiIndex.from_product([['A', 'B'],
- ['mean', 'std']])
- for t in cases:
- result = t.aggregate([np.mean, np.std])
- assert_frame_equal(result, expected)
- expected = pd.concat([a_mean, b_std], axis=1)
- for t in cases:
- result = t.aggregate({'A': np.mean,
- 'B': np.std})
- assert_frame_equal(result, expected, check_like=True)
- expected = pd.concat([a_mean, a_std], axis=1)
- expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
- ('A', 'std')])
- for t in cases:
- result = t.aggregate({'A': ['mean', 'std']})
- assert_frame_equal(result, expected)
- expected = pd.concat([a_mean, a_sum], axis=1)
- expected.columns = ['mean', 'sum']
- for t in cases:
- result = t['A'].aggregate(['mean', 'sum'])
- assert_frame_equal(result, expected)
- expected = pd.concat([a_mean, a_sum], axis=1)
- expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
- ('A', 'sum')])
- for t in cases:
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}})
- assert_frame_equal(result, expected, check_like=True)
- expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
- expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
- ('A', 'sum'),
- ('B', 'mean2'),
- ('B', 'sum2')])
- for t in cases:
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'},
- 'B': {'mean2': 'mean', 'sum2': 'sum'}})
- assert_frame_equal(result, expected, check_like=True)
- expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
- expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
- ('A', 'std'),
- ('B', 'mean'),
- ('B', 'std')])
- for t in cases:
- result = t.aggregate({'A': ['mean', 'std'],
- 'B': ['mean', 'std']})
- assert_frame_equal(result, expected, check_like=True)
- expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
- expected.columns = pd.MultiIndex.from_tuples([('r1', 'A', 'mean'),
- ('r1', 'A', 'sum'),
- ('r2', 'B', 'mean'),
- ('r2', 'B', 'sum')])
- def test_agg_misc():
- # test with all three Resampler apis and TimeGrouper
- np.random.seed(1234)
- index = date_range(datetime(2005, 1, 1),
- datetime(2005, 1, 10), freq='D')
- index.name = 'date'
- df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
- df_col = df.reset_index()
- df_mult = df_col.copy()
- df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
- names=['index', 'date'])
- r = df.resample('2D')
- cases = [
- r,
- df_col.resample('2D', on='date'),
- df_mult.resample('2D', level='date'),
- df.groupby(pd.Grouper(freq='2D'))
- ]
- # passed lambda
- for t in cases:
- result = t.agg({'A': np.sum,
- 'B': lambda x: np.std(x, ddof=1)})
- rcustom = t['B'].apply(lambda x: np.std(x, ddof=1))
- expected = pd.concat([r['A'].sum(), rcustom], axis=1)
- assert_frame_equal(result, expected, check_like=True)
- # agg with renamers
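- # these dict-renaming forms of .agg are deprecated, hence the FutureWarning
- # assertions wrapped around them below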
- expected = pd.concat([t['A'].sum(),
- t['B'].sum(),
- t['A'].mean(),
- t['B'].mean()],
- axis=1)
- expected.columns = pd.MultiIndex.from_tuples([('result1', 'A'),
- ('result1', 'B'),
- ('result2', 'A'),
- ('result2', 'B')])
- for t in cases:
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- result = t[['A', 'B']].agg(OrderedDict([('result1', np.sum),
- ('result2', np.mean)]))
- assert_frame_equal(result, expected, check_like=True)
- # agg with different hows
- expected = pd.concat([t['A'].sum(),
- t['A'].std(),
- t['B'].mean(),
- t['B'].std()],
- axis=1)
- expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
- ('A', 'std'),
- ('B', 'mean'),
- ('B', 'std')])
- for t in cases:
- result = t.agg(OrderedDict([('A', ['sum', 'std']),
- ('B', ['mean', 'std'])]))
- assert_frame_equal(result, expected, check_like=True)
- # using an explicit selection list should be equivalent to not using one
- for t in cases:
- result = t[['A', 'B']].agg({'A': ['sum', 'std'],
- 'B': ['mean', 'std']})
- assert_frame_equal(result, expected, check_like=True)
- # series like aggs
- for t in cases:
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- result = t['A'].agg({'A': ['sum', 'std']})
- expected = pd.concat([t['A'].sum(),
- t['A'].std()],
- axis=1)
- expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
- ('A', 'std')])
- assert_frame_equal(result, expected, check_like=True)
- expected = pd.concat([t['A'].agg(['sum', 'std']),
- t['A'].agg(['mean', 'std'])],
- axis=1)
- expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
- ('A', 'std'),
- ('B', 'mean'),
- ('B', 'std')])
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- result = t['A'].agg({'A': ['sum', 'std'],
- 'B': ['mean', 'std']})
- assert_frame_equal(result, expected, check_like=True)
- # errors
- # invalid names in the agg specification
- for t in cases:
- with pytest.raises(KeyError):
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- t[['A']].agg({'A': ['sum', 'std'],
- 'B': ['mean', 'std']})
- def test_agg_nested_dicts():
- np.random.seed(1234)
- index = date_range(datetime(2005, 1, 1),
- datetime(2005, 1, 10), freq='D')
- index.name = 'date'
- df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
- df_col = df.reset_index()
- df_mult = df_col.copy()
- df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
- names=['index', 'date'])
- r = df.resample('2D')
- cases = [
- r,
- df_col.resample('2D', on='date'),
- df_mult.resample('2D', level='date'),
- df.groupby(pd.Grouper(freq='2D'))
- ]
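- # nested dicts keyed by new output names must raise ValueError outright,
- # while column-keyed nested renamers still work but emit a FutureWarning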
- for t in cases:
- def f():
- t.aggregate({'r1': {'A': ['mean', 'sum']},
- 'r2': {'B': ['mean', 'sum']}})
- pytest.raises(ValueError, f)
- for t in cases:
- expected = pd.concat([t['A'].mean(), t['A'].std(), t['B'].mean(),
- t['B'].std()], axis=1)
- expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), (
- 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')])
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- result = t[['A', 'B']].agg({'A': {'ra': ['mean', 'std']},
- 'B': {'rb': ['mean', 'std']}})
- assert_frame_equal(result, expected, check_like=True)
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- result = t.agg({'A': {'ra': ['mean', 'std']},
- 'B': {'rb': ['mean', 'std']}})
- assert_frame_equal(result, expected, check_like=True)
- def test_try_aggregate_non_existing_column():
- # GH 16766
- data = [
- {'dt': datetime(2017, 6, 1, 0), 'x': 1.0, 'y': 2.0},
- {'dt': datetime(2017, 6, 1, 1), 'x': 2.0, 'y': 2.0},
- {'dt': datetime(2017, 6, 1, 2), 'x': 3.0, 'y': 1.5}
- ]
- df = DataFrame(data).set_index('dt')
- # Error, since there is no 'z' column
- with pytest.raises(KeyError):
- df.resample('30T').agg({'x': ['mean'],
- 'y': ['median'],
- 'z': ['sum']})
- def test_selection_api_validation():
- # GH 13500
- index = date_range(datetime(2005, 1, 1),
- datetime(2005, 1, 10), freq='D')
- rng = np.arange(len(index), dtype=np.int64)
- df = DataFrame({'date': index, 'a': rng},
- index=pd.MultiIndex.from_arrays([rng, index],
- names=['v', 'd']))
- df_exp = DataFrame({'a': rng}, index=index)
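- # resampling must target datetime-like values: selecting a non-datetime
- # level, passing both on= and level=, or passing list-likes should all raise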
- # non DatetimeIndex
- with pytest.raises(TypeError):
- df.resample('2D', level='v')
- with pytest.raises(ValueError):
- df.resample('2D', on='date', level='d')
- with pytest.raises(TypeError):
- df.resample('2D', on=['a', 'date'])
- with pytest.raises(KeyError):
- df.resample('2D', level=['a', 'date'])
- # upsampling not allowed
- with pytest.raises(ValueError):
- df.resample('2D', level='d').asfreq()
- with pytest.raises(ValueError):
- df.resample('2D', on='date').asfreq()
- exp = df_exp.resample('2D').sum()
- exp.index.name = 'date'
- assert_frame_equal(exp, df.resample('2D', on='date').sum())
- exp.index.name = 'd'
- assert_frame_equal(exp, df.resample('2D', level='d').sum())