# -*- coding: utf-8 -*- from __future__ import print_function from collections import defaultdict from datetime import datetime from decimal import Decimal import numpy as np import pytest from pandas.compat import ( OrderedDict, StringIO, lmap, lrange, lzip, map, range, zip) from pandas.errors import PerformanceWarning import pandas as pd from pandas import ( DataFrame, Index, MultiIndex, Panel, Series, Timestamp, compat, date_range, read_csv) import pandas.core.common as com import pandas.util.testing as tm from pandas.util.testing import ( assert_almost_equal, assert_frame_equal, assert_series_equal) def test_repr(): # GH18203 result = repr(pd.Grouper(key='A', level='B')) expected = "Grouper(key='A', level='B', axis=0, sort=False)" assert result == expected @pytest.mark.parametrize('dtype', ['int64', 'int32', 'float64', 'float32']) def test_basic(dtype): data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype) index = np.arange(9) np.random.shuffle(index) data = data.reindex(index) grouped = data.groupby(lambda x: x // 3) for k, v in grouped: assert len(v) == 3 agged = grouped.aggregate(np.mean) assert agged[1] == 1 assert_series_equal(agged, grouped.agg(np.mean)) # shorthand assert_series_equal(agged, grouped.mean()) assert_series_equal(grouped.agg(np.sum), grouped.sum()) expected = grouped.apply(lambda x: x * x.sum()) transformed = grouped.transform(lambda x: x * x.sum()) assert transformed[7] == 12 assert_series_equal(transformed, expected) value_grouped = data.groupby(data) assert_series_equal(value_grouped.aggregate(np.mean), agged, check_index_type=False) # complex agg agged = grouped.aggregate([np.mean, np.std]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): agged = grouped.aggregate({'one': np.mean, 'two': np.std}) group_constants = {0: 10, 1: 20, 2: 30} agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) assert agged[1] == 21 # corner cases msg = "Must produce aggregated value" # exception raised is type Exception with pytest.raises(Exception, match=msg): grouped.aggregate(lambda x: x * 2) def test_groupby_nonobject_dtype(mframe, df_mixed_floats): key = mframe.index.codes[0] grouped = mframe.groupby(key) result = grouped.sum() expected = mframe.groupby(key.astype('O')).sum() assert_frame_equal(result, expected) # GH 3911, mixed frame non-conversion df = df_mixed_floats.copy() df['value'] = lrange(len(df)) def max_value(group): return group.loc[group['value'].idxmax()] applied = df.groupby('A').apply(max_value) result = applied.get_dtype_counts().sort_values() expected = Series({'float64': 2, 'int64': 1, 'object': 2}).sort_values() assert_series_equal(result, expected) def test_groupby_return_type(): # GH2893, return a reduced type df1 = DataFrame( [{"val1": 1, "val2": 20}, {"val1": 1, "val2": 19}, {"val1": 2, "val2": 27}, {"val1": 2, "val2": 12} ]) def func(dataf): return dataf["val2"] - dataf["val2"].mean() result = df1.groupby("val1", squeeze=True).apply(func) assert isinstance(result, Series) df2 = DataFrame( [{"val1": 1, "val2": 20}, {"val1": 1, "val2": 19}, {"val1": 1, "val2": 27}, {"val1": 1, "val2": 12} ]) def func(dataf): return dataf["val2"] - dataf["val2"].mean() result = df2.groupby("val1", squeeze=True).apply(func) assert isinstance(result, Series) # GH3596, return a consistent type (regression in 0.11 from 0.10.1) df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y']) result = df.groupby('X', squeeze=False).count() assert isinstance(result, DataFrame) # GH5592 # inconcistent return type df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb', 'Pony', 'Pony'], B=Series( np.arange(7), dtype='int64'), C=date_range( '20130101', periods=7))) def f(grp): return grp.iloc[0] expected = df.groupby('A').first()[['B']] result = df.groupby('A').apply(f)[['B']] assert_frame_equal(result, expected) def f(grp): if grp.name == 'Tiger': return None return grp.iloc[0] result = df.groupby('A').apply(f)[['B']] e = expected.copy() e.loc['Tiger'] = np.nan assert_frame_equal(result, e) def f(grp): if grp.name == 'Pony': return None return grp.iloc[0] result = df.groupby('A').apply(f)[['B']] e = expected.copy() e.loc['Pony'] = np.nan assert_frame_equal(result, e) # 5592 revisited, with datetimes def f(grp): if grp.name == 'Pony': return None return grp.iloc[0] result = df.groupby('A').apply(f)[['C']] e = df.groupby('A').first()[['C']] e.loc['Pony'] = pd.NaT assert_frame_equal(result, e) # scalar outputs def f(grp): if grp.name == 'Pony': return None return grp.iloc[0].loc['C'] result = df.groupby('A').apply(f) e = df.groupby('A').first()['C'].copy() e.loc['Pony'] = np.nan e.name = None assert_series_equal(result, e) def test_pass_args_kwargs(ts, tsframe): def f(x, q=None, axis=0): return np.percentile(x, q, axis=axis) g = lambda x: np.percentile(x, 80, axis=0) # Series ts_grouped = ts.groupby(lambda x: x.month) agg_result = ts_grouped.agg(np.percentile, 80, axis=0) apply_result = ts_grouped.apply(np.percentile, 80, axis=0) trans_result = ts_grouped.transform(np.percentile, 80, axis=0) agg_expected = ts_grouped.quantile(.8) trans_expected = ts_grouped.transform(g) assert_series_equal(apply_result, agg_expected) assert_series_equal(agg_result, agg_expected, check_names=False) assert_series_equal(trans_result, trans_expected) agg_result = ts_grouped.agg(f, q=80) apply_result = ts_grouped.apply(f, q=80) trans_result = ts_grouped.transform(f, q=80) assert_series_equal(agg_result, agg_expected) assert_series_equal(apply_result, agg_expected) assert_series_equal(trans_result, trans_expected) # DataFrame df_grouped = tsframe.groupby(lambda x: x.month) agg_result = df_grouped.agg(np.percentile, 80, axis=0) apply_result = df_grouped.apply(DataFrame.quantile, .8) expected = df_grouped.quantile(.8) assert_frame_equal(apply_result, expected) assert_frame_equal(agg_result, expected, check_names=False) agg_result = df_grouped.agg(f, q=80) apply_result = df_grouped.apply(DataFrame.quantile, q=.8) assert_frame_equal(agg_result, expected, check_names=False) assert_frame_equal(apply_result, expected) def test_len(): df = tm.makeTimeDataFrame() grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]) assert len(grouped) == len(df) grouped = df.groupby([lambda x: x.year, lambda x: x.month]) expected = len({(x.year, x.month) for x in df.index}) assert len(grouped) == expected # issue 11016 df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3])) assert len(df.groupby(('a'))) == 0 assert len(df.groupby(('b'))) == 3 assert len(df.groupby(['a', 'b'])) == 3 def test_basic_regression(): # regression T = [1.0 * x for x in lrange(1, 10) * 10][:1095] result = Series(T, lrange(0, len(T))) groupings = np.random.random((1100, )) groupings = Series(groupings, lrange(0, len(groupings))) * 10. grouped = result.groupby(groupings) grouped.mean() @pytest.mark.parametrize('dtype', ['float64', 'float32', 'int64', 'int32', 'int16', 'int8']) def test_with_na_groups(dtype): index = Index(np.arange(10)) values = Series(np.ones(10), index, dtype=dtype) labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan, 'bar', 'bar', np.nan, 'foo'], index=index) # this SHOULD be an int grouped = values.groupby(labels) agged = grouped.agg(len) expected = Series([4, 2], index=['bar', 'foo']) assert_series_equal(agged, expected, check_dtype=False) # assert issubclass(agged.dtype.type, np.integer) # explicitly return a float from my function def f(x): return float(len(x)) agged = grouped.agg(f) expected = Series([4, 2], index=['bar', 'foo']) assert_series_equal(agged, expected, check_dtype=False) assert issubclass(agged.dtype.type, np.dtype(dtype).type) def test_indices_concatenation_order(): # GH 2808 def f1(x): y = x[(x.b % 2) == 1] ** 2 if y.empty: multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, names=['b', 'c']) res = DataFrame(None, columns=['a'], index=multiindex) return res else: y = y.set_index(['b', 'c']) return y def f2(x): y = x[(x.b % 2) == 1] ** 2 if y.empty: return DataFrame() else: y = y.set_index(['b', 'c']) return y def f3(x): y = x[(x.b % 2) == 1] ** 2 if y.empty: multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, names=['foo', 'bar']) res = DataFrame(None, columns=['a', 'b'], index=multiindex) return res else: return y df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)}) df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)}) # correct result result1 = df.groupby('a').apply(f1) result2 = df2.groupby('a').apply(f1) assert_frame_equal(result1, result2) # should fail (not the same number of levels) msg = "Cannot concat indices that do not have the same number of levels" with pytest.raises(AssertionError, match=msg): df.groupby('a').apply(f2) with pytest.raises(AssertionError, match=msg): df2.groupby('a').apply(f2) # should fail (incorrect shape) with pytest.raises(AssertionError, match=msg): df.groupby('a').apply(f3) with pytest.raises(AssertionError, match=msg): df2.groupby('a').apply(f3) def test_attr_wrapper(ts): grouped = ts.groupby(lambda x: x.weekday()) result = grouped.std() expected = grouped.agg(lambda x: np.std(x, ddof=1)) assert_series_equal(result, expected) # this is pretty cool result = grouped.describe() expected = {name: gp.describe() for name, gp in grouped} expected = DataFrame(expected).T assert_frame_equal(result, expected) # get attribute result = grouped.dtype expected = grouped.agg(lambda x: x.dtype) # make sure raises error msg = "'SeriesGroupBy' object has no attribute 'foo'" with pytest.raises(AttributeError, match=msg): getattr(grouped, 'foo') def test_frame_groupby(tsframe): grouped = tsframe.groupby(lambda x: x.weekday()) # aggregate aggregated = grouped.aggregate(np.mean) assert len(aggregated) == 5 assert len(aggregated.columns) == 4 # by string tscopy = tsframe.copy() tscopy['weekday'] = [x.weekday() for x in tscopy.index] stragged = tscopy.groupby('weekday').aggregate(np.mean) assert_frame_equal(stragged, aggregated, check_names=False) # transform grouped = tsframe.head(30).groupby(lambda x: x.weekday()) transformed = grouped.transform(lambda x: x - x.mean()) assert len(transformed) == 30 assert len(transformed.columns) == 4 # transform propagate transformed = grouped.transform(lambda x: x.mean()) for name, group in grouped: mean = group.mean() for idx in group.index: tm.assert_series_equal(transformed.xs(idx), mean, check_names=False) # iterate for weekday, group in grouped: assert group.index[0].weekday() == weekday # groups / group_indices groups = grouped.groups indices = grouped.indices for k, v in compat.iteritems(groups): samething = tsframe.index.take(indices[k]) assert (samething == v).all() def test_frame_groupby_columns(tsframe): mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1} grouped = tsframe.groupby(mapping, axis=1) # aggregate aggregated = grouped.aggregate(np.mean) assert len(aggregated) == len(tsframe) assert len(aggregated.columns) == 2 # transform tf = lambda x: x - x.mean() groupedT = tsframe.T.groupby(mapping, axis=0) assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf)) # iterate for k, v in grouped: assert len(v.columns) == 2 def test_frame_set_name_single(df): grouped = df.groupby('A') result = grouped.mean() assert result.index.name == 'A' result = df.groupby('A', as_index=False).mean() assert result.index.name != 'A' result = grouped.agg(np.mean) assert result.index.name == 'A' result = grouped.agg({'C': np.mean, 'D': np.std}) assert result.index.name == 'A' result = grouped['C'].mean() assert result.index.name == 'A' result = grouped['C'].agg(np.mean) assert result.index.name == 'A' result = grouped['C'].agg([np.mean, np.std]) assert result.index.name == 'A' with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = grouped['C'].agg({'foo': np.mean, 'bar': np.std}) assert result.index.name == 'A' def test_multi_func(df): col1 = df['A'] col2 = df['B'] grouped = df.groupby([col1.get, col2.get]) agged = grouped.mean() expected = df.groupby(['A', 'B']).mean() # TODO groupby get drops names assert_frame_equal(agged.loc[:, ['C', 'D']], expected.loc[:, ['C', 'D']], check_names=False) # some "groups" with no data df = DataFrame({'v1': np.random.randn(6), 'v2': np.random.randn(6), 'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), 'k2': np.array(['1', '1', '1', '2', '2', '2'])}, index=['one', 'two', 'three', 'four', 'five', 'six']) # only verify that it works for now grouped = df.groupby(['k1', 'k2']) grouped.agg(np.sum) def test_multi_key_multiple_functions(df): grouped = df.groupby(['A', 'B'])['C'] agged = grouped.agg([np.mean, np.std]) expected = DataFrame({'mean': grouped.agg(np.mean), 'std': grouped.agg(np.std)}) assert_frame_equal(agged, expected) def test_frame_multi_key_function_list(): data = DataFrame( {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo', 'foo', 'foo'], 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two', 'two', 'one'], 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', 'dull', 'shiny', 'shiny', 'shiny'], 'D': np.random.randn(11), 'E': np.random.randn(11), 'F': np.random.randn(11)}) grouped = data.groupby(['A', 'B']) funcs = [np.mean, np.std] agged = grouped.agg(funcs) expected = pd.concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs), grouped['F'].agg(funcs)], keys=['D', 'E', 'F'], axis=1) assert (isinstance(agged.index, MultiIndex)) assert (isinstance(expected.index, MultiIndex)) assert_frame_equal(agged, expected) @pytest.mark.parametrize('op', [lambda x: x.sum(), lambda x: x.mean()]) @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") def test_groupby_multiple_columns(df, op): data = df grouped = data.groupby(['A', 'B']) result1 = op(grouped) expected = defaultdict(dict) for n1, gp1 in data.groupby('A'): for n2, gp2 in gp1.groupby('B'): expected[n1][n2] = op(gp2.loc[:, ['C', 'D']]) expected = {k: DataFrame(v) for k, v in compat.iteritems(expected)} expected = Panel.fromDict(expected).swapaxes(0, 1) expected.major_axis.name, expected.minor_axis.name = 'A', 'B' # a little bit crude for col in ['C', 'D']: result_col = op(grouped[col]) exp = expected[col] pivoted = result1[col].unstack() pivoted2 = result_col.unstack() assert_frame_equal(pivoted.reindex_like(exp), exp) assert_frame_equal(pivoted2.reindex_like(exp), exp) # test single series works the same result = data['C'].groupby([data['A'], data['B']]).mean() expected = data.groupby(['A', 'B']).mean()['C'] assert_series_equal(result, expected) def test_groupby_as_index_agg(df): grouped = df.groupby('A', as_index=False) # single-key result = grouped.agg(np.mean) expected = grouped.mean() assert_frame_equal(result, expected) result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) expected2 = grouped.mean() expected2['D'] = grouped.sum()['D'] assert_frame_equal(result2, expected2) grouped = df.groupby('A', as_index=True) expected3 = grouped['C'].sum() expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result3 = grouped['C'].agg({'Q': np.sum}) assert_frame_equal(result3, expected3) # multi-key grouped = df.groupby(['A', 'B'], as_index=False) result = grouped.agg(np.mean) expected = grouped.mean() assert_frame_equal(result, expected) result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) expected2 = grouped.mean() expected2['D'] = grouped.sum()['D'] assert_frame_equal(result2, expected2) expected3 = grouped['C'].sum() expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) result3 = grouped['C'].agg({'Q': np.sum}) assert_frame_equal(result3, expected3) # GH7115 & GH8112 & GH8582 df = DataFrame(np.random.randint(0, 100, (50, 3)), columns=['jim', 'joe', 'jolie']) ts = Series(np.random.randint(5, 10, 50), name='jim') gr = df.groupby(ts) gr.nth(0) # invokes set_selection_from_grouper internally assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum)) for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']: gr = df.groupby(ts, as_index=False) left = getattr(gr, attr)() gr = df.groupby(ts.values, as_index=True) right = getattr(gr, attr)().reset_index(drop=True) assert_frame_equal(left, right) def test_as_index_series_return_frame(df): grouped = df.groupby('A', as_index=False) grouped2 = df.groupby(['A', 'B'], as_index=False) result = grouped['C'].agg(np.sum) expected = grouped.agg(np.sum).loc[:, ['A', 'C']] assert isinstance(result, DataFrame) assert_frame_equal(result, expected) result2 = grouped2['C'].agg(np.sum) expected2 = grouped2.agg(np.sum).loc[:, ['A', 'B', 'C']] assert isinstance(result2, DataFrame) assert_frame_equal(result2, expected2) result = grouped['C'].sum() expected = grouped.sum().loc[:, ['A', 'C']] assert isinstance(result, DataFrame) assert_frame_equal(result, expected) result2 = grouped2['C'].sum() expected2 = grouped2.sum().loc[:, ['A', 'B', 'C']] assert isinstance(result2, DataFrame) assert_frame_equal(result2, expected2) def test_as_index_series_column_slice_raises(df): # GH15072 grouped = df.groupby('A', as_index=False) msg = r"Column\(s\) C already selected" with pytest.raises(IndexError, match=msg): grouped['C'].__getitem__('D') def test_groupby_as_index_cython(df): data = df # single-key grouped = data.groupby('A', as_index=False) result = grouped.mean() expected = data.groupby(['A']).mean() expected.insert(0, 'A', expected.index) expected.index = np.arange(len(expected)) assert_frame_equal(result, expected) # multi-key grouped = data.groupby(['A', 'B'], as_index=False) result = grouped.mean() expected = data.groupby(['A', 'B']).mean() arrays = lzip(*expected.index.values) expected.insert(0, 'A', arrays[0]) expected.insert(1, 'B', arrays[1]) expected.index = np.arange(len(expected)) assert_frame_equal(result, expected) def test_groupby_as_index_series_scalar(df): grouped = df.groupby(['A', 'B'], as_index=False) # GH #421 result = grouped['C'].agg(len) expected = grouped.agg(len).loc[:, ['A', 'B', 'C']] assert_frame_equal(result, expected) def test_groupby_as_index_corner(df, ts): msg = "as_index=False only valid with DataFrame" with pytest.raises(TypeError, match=msg): ts.groupby(lambda x: x.weekday(), as_index=False) msg = "as_index=False only valid for axis=0" with pytest.raises(ValueError, match=msg): df.groupby(lambda x: x.lower(), as_index=False, axis=1) def test_groupby_multiple_key(df): df = tm.makeTimeDataFrame() grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]) agged = grouped.sum() assert_almost_equal(df.values, agged.values) grouped = df.T.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day], axis=1) agged = grouped.agg(lambda x: x.sum()) tm.assert_index_equal(agged.index, df.columns) assert_almost_equal(df.T.values, agged.values) agged = grouped.agg(lambda x: x.sum()) assert_almost_equal(df.T.values, agged.values) def test_groupby_multi_corner(df): # test that having an all-NA column doesn't mess you up df = df.copy() df['bad'] = np.nan agged = df.groupby(['A', 'B']).mean() expected = df.groupby(['A', 'B']).mean() expected['bad'] = np.nan assert_frame_equal(agged, expected) def test_omit_nuisance(df): grouped = df.groupby('A') result = grouped.mean() expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean() assert_frame_equal(result, expected) agged = grouped.agg(np.mean) exp = grouped.mean() assert_frame_equal(agged, exp) df = df.loc[:, ['A', 'C', 'D']] df['E'] = datetime.now() grouped = df.groupby('A') result = grouped.agg(np.sum) expected = grouped.sum() assert_frame_equal(result, expected) # won't work with axis = 1 grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1) msg = (r'\("unsupported operand type\(s\) for \+: ' "'Timestamp' and 'float'\"" r", u?'occurred at index 0'\)") with pytest.raises(TypeError, match=msg): grouped.agg(lambda x: x.sum(0, numeric_only=False)) def test_omit_nuisance_python_multiple(three_group): grouped = three_group.groupby(['A', 'B']) agged = grouped.agg(np.mean) exp = grouped.mean() assert_frame_equal(agged, exp) def test_empty_groups_corner(mframe): # handle empty groups df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), 'k2': np.array(['1', '1', '1', '2', '2', '2']), 'k3': ['foo', 'bar'] * 3, 'v1': np.random.randn(6), 'v2': np.random.randn(6)}) grouped = df.groupby(['k1', 'k2']) result = grouped.agg(np.mean) expected = grouped.mean() assert_frame_equal(result, expected) grouped = mframe[3:5].groupby(level=0) agged = grouped.apply(lambda x: x.mean()) agged_A = grouped['A'].apply(np.mean) assert_series_equal(agged['A'], agged_A) assert agged.index.name == 'first' def test_nonsense_func(): df = DataFrame([0]) msg = r"unsupported operand type\(s\) for \+: '(int|long)' and 'str'" with pytest.raises(TypeError, match=msg): df.groupby(lambda x: x + 'foo') def test_wrap_aggregated_output_multindex(mframe): df = mframe.T df['baz', 'two'] = 'peekaboo' keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] agged = df.groupby(keys).agg(np.mean) assert isinstance(agged.columns, MultiIndex) def aggfun(ser): if ser.name == ('foo', 'one'): raise TypeError else: return ser.sum() agged2 = df.groupby(keys).aggregate(aggfun) assert len(agged2.columns) + 1 == len(df.columns) def test_groupby_level_apply(mframe): result = mframe.groupby(level=0).count() assert result.index.name == 'first' result = mframe.groupby(level=1).count() assert result.index.name == 'second' result = mframe['A'].groupby(level=0).count() assert result.index.name == 'first' def test_groupby_level_mapper(mframe): deleveled = mframe.reset_index() mapper0 = {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1} mapper1 = {'one': 0, 'two': 0, 'three': 1} result0 = mframe.groupby(mapper0, level=0).sum() result1 = mframe.groupby(mapper1, level=1).sum() mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']]) mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']]) expected0 = mframe.groupby(mapped_level0).sum() expected1 = mframe.groupby(mapped_level1).sum() expected0.index.name, expected1.index.name = 'first', 'second' assert_frame_equal(result0, expected0) assert_frame_equal(result1, expected1) def test_groupby_level_nonmulti(): # GH 1313, GH 13901 s = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, 4, 5, 2, 6], name='foo')) expected = Series([11, 22, 3, 4, 5, 6], Index(range(1, 7), name='foo')) result = s.groupby(level=0).sum() tm.assert_series_equal(result, expected) result = s.groupby(level=[0]).sum() tm.assert_series_equal(result, expected) result = s.groupby(level=-1).sum() tm.assert_series_equal(result, expected) result = s.groupby(level=[-1]).sum() tm.assert_series_equal(result, expected) msg = "level > 0 or level < -1 only valid with MultiIndex" with pytest.raises(ValueError, match=msg): s.groupby(level=1) with pytest.raises(ValueError, match=msg): s.groupby(level=-2) msg = "No group keys passed!" with pytest.raises(ValueError, match=msg): s.groupby(level=[]) msg = "multiple levels only valid with MultiIndex" with pytest.raises(ValueError, match=msg): s.groupby(level=[0, 0]) with pytest.raises(ValueError, match=msg): s.groupby(level=[0, 1]) msg = "level > 0 or level < -1 only valid with MultiIndex" with pytest.raises(ValueError, match=msg): s.groupby(level=[1]) def test_groupby_complex(): # GH 12902 a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1]) expected = Series((1 + 2j, 5 + 10j)) result = a.groupby(level=0).sum() assert_series_equal(result, expected) result = a.sum(level=0) assert_series_equal(result, expected) def test_mutate_groups(): # GH3380 df = DataFrame({ 'cat1': ['a'] * 8 + ['b'] * 6, 'cat2': ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 + ['d'] * 2 + ['e'] * 2, 'cat3': lmap(lambda x: 'g%s' % x, lrange(1, 15)), 'val': np.random.randint(100, size=14), }) def f_copy(x): x = x.copy() x['rank'] = x.val.rank(method='min') return x.groupby('cat2')['rank'].min() def f_no_copy(x): x['rank'] = x.val.rank(method='min') return x.groupby('cat2')['rank'].min() grpby_copy = df.groupby('cat1').apply(f_copy) grpby_no_copy = df.groupby('cat1').apply(f_no_copy) assert_series_equal(grpby_copy, grpby_no_copy) def test_no_mutate_but_looks_like(): # GH 8467 # first show's mutation indicator # second does not, but should yield the same results df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'value': range(9)}) result1 = df.groupby('key', group_keys=True).apply(lambda x: x[:].key) result2 = df.groupby('key', group_keys=True).apply(lambda x: x.key) assert_series_equal(result1, result2) def test_groupby_series_indexed_differently(): s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7], index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g'])) s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0], index=Index(['a', 'b', 'd', 'f', 'g', 'h'])) grouped = s1.groupby(s2) agged = grouped.mean() exp = s1.groupby(s2.reindex(s1.index).get).mean() assert_series_equal(agged, exp) def test_groupby_with_hier_columns(): tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])) index = MultiIndex.from_tuples(tuples) columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), ( 'B', 'cat'), ('A', 'dog')]) df = DataFrame(np.random.randn(8, 4), index=index, columns=columns) result = df.groupby(level=0).mean() tm.assert_index_equal(result.columns, columns) result = df.groupby(level=0, axis=1).mean() tm.assert_index_equal(result.index, df.index) result = df.groupby(level=0).agg(np.mean) tm.assert_index_equal(result.columns, columns) result = df.groupby(level=0).apply(lambda x: x.mean()) tm.assert_index_equal(result.columns, columns) result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1)) tm.assert_index_equal(result.columns, Index(['A', 'B'])) tm.assert_index_equal(result.index, df.index) # add a nuisance column sorted_columns, _ = columns.sortlevel(0) df['A', 'foo'] = 'bar' result = df.groupby(level=0).mean() tm.assert_index_equal(result.columns, df.columns[:-1]) def test_grouping_ndarray(df): grouped = df.groupby(df['A'].values) result = grouped.sum() expected = df.groupby('A').sum() assert_frame_equal(result, expected, check_names=False ) # Note: no names when grouping by value def test_groupby_wrong_multi_labels(): data = """index,foo,bar,baz,spam,data 0,foo1,bar1,baz1,spam2,20 1,foo1,bar2,baz1,spam3,30 2,foo2,bar2,baz1,spam2,40 3,foo1,bar1,baz2,spam1,50 4,foo3,bar1,baz2,spam1,60""" data = read_csv(StringIO(data), index_col=0) grouped = data.groupby(['foo', 'bar', 'baz', 'spam']) result = grouped.agg(np.mean) expected = grouped.mean() assert_frame_equal(result, expected) def test_groupby_series_with_name(df): result = df.groupby(df['A']).mean() result2 = df.groupby(df['A'], as_index=False).mean() assert result.index.name == 'A' assert 'A' in result2 result = df.groupby([df['A'], df['B']]).mean() result2 = df.groupby([df['A'], df['B']], as_index=False).mean() assert result.index.names == ('A', 'B') assert 'A' in result2 assert 'B' in result2 def test_seriesgroupby_name_attr(df): # GH 6265 result = df.groupby('A')['C'] assert result.count().name == 'C' assert result.mean().name == 'C' testFunc = lambda x: np.sum(x) * 2 assert result.agg(testFunc).name == 'C' def test_consistency_name(): # GH 12363 df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'], 'C': np.random.randn(8) + 1.0, 'D': np.arange(8)}) expected = df.groupby(['A']).B.count() result = df.B.groupby(df.A).count() assert_series_equal(result, expected) def test_groupby_name_propagation(df): # GH 6124 def summarize(df, name=None): return Series({'count': 1, 'mean': 2, 'omissions': 3, }, name=name) def summarize_random_name(df): # Provide a different name for each Series. In this case, groupby # should not attempt to propagate the Series name since they are # inconsistent. return Series({ 'count': 1, 'mean': 2, 'omissions': 3, }, name=df.iloc[0]['A']) metrics = df.groupby('A').apply(summarize) assert metrics.columns.name is None metrics = df.groupby('A').apply(summarize, 'metrics') assert metrics.columns.name == 'metrics' metrics = df.groupby('A').apply(summarize_random_name) assert metrics.columns.name is None def test_groupby_nonstring_columns(): df = DataFrame([np.arange(10) for x in range(10)]) grouped = df.groupby(0) result = grouped.mean() expected = df.groupby(df[0]).mean() assert_frame_equal(result, expected) def test_groupby_mixed_type_columns(): # GH 13432, unorderable types in py3 df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0]) expected = DataFrame([[1, 2]], columns=['B', 0], index=Index([0], name='A')) result = df.groupby('A').first() tm.assert_frame_equal(result, expected) result = df.groupby('A').sum() tm.assert_frame_equal(result, expected) # TODO: Ensure warning isn't emitted in the first place @pytest.mark.filterwarnings("ignore:Mean of:RuntimeWarning") def test_cython_grouper_series_bug_noncontig(): arr = np.empty((100, 100)) arr.fill(np.nan) obj = Series(arr[:, 0], index=lrange(100)) inds = np.tile(lrange(10), 10) result = obj.groupby(inds).agg(Series.median) assert result.isna().all() def test_series_grouper_noncontig_index(): index = Index(tm.rands_array(10, 100)) values = Series(np.random.randn(50), index=index[::2]) labels = np.random.randint(0, 5, 50) # it works! grouped = values.groupby(labels) # accessing the index elements causes segfault f = lambda x: len(set(map(id, x.index))) grouped.agg(f) def test_convert_objects_leave_decimal_alone(): s = Series(lrange(5)) labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O') def convert_fast(x): return Decimal(str(x.mean())) def convert_force_pure(x): # base will be length 0 assert (len(x.values.base) > 0) return Decimal(str(x.mean())) grouped = s.groupby(labels) result = grouped.agg(convert_fast) assert result.dtype == np.object_ assert isinstance(result[0], Decimal) result = grouped.agg(convert_force_pure) assert result.dtype == np.object_ assert isinstance(result[0], Decimal) def test_groupby_dtype_inference_empty(): # GH 6733 df = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')}) assert df['x'].dtype == np.float64 result = df.groupby('x').first() exp_index = Index([], name='x', dtype=np.float64) expected = DataFrame({'range': Series( [], index=exp_index, dtype='int64')}) assert_frame_equal(result, expected, by_blocks=True) def test_groupby_list_infer_array_like(df): result = df.groupby(list(df['A'])).mean() expected = df.groupby(df['A']).mean() assert_frame_equal(result, expected, check_names=False) with pytest.raises(KeyError, match=r"^'foo'$"): df.groupby(list(df['A'][:-1])) # pathological case of ambiguity df = DataFrame({'foo': [0, 1], 'bar': [3, 4], 'val': np.random.randn(2)}) result = df.groupby(['foo', 'bar']).mean() expected = df.groupby([df['foo'], df['bar']]).mean()[['val']] def test_groupby_keys_same_size_as_index(): # GH 11185 freq = 's' index = pd.date_range(start=pd.Timestamp('2015-09-29T11:34:44-0700'), periods=2, freq=freq) df = pd.DataFrame([['A', 10], ['B', 15]], columns=[ 'metric', 'values' ], index=index) result = df.groupby([pd.Grouper(level=0, freq=freq), 'metric']).mean() expected = df.set_index([df.index, 'metric']) assert_frame_equal(result, expected) def test_groupby_one_row(): # GH 11741 msg = r"^'Z'$" df1 = pd.DataFrame(np.random.randn(1, 4), columns=list('ABCD')) with pytest.raises(KeyError, match=msg): df1.groupby('Z') df2 = pd.DataFrame(np.random.randn(2, 4), columns=list('ABCD')) with pytest.raises(KeyError, match=msg): df2.groupby('Z') def test_groupby_nat_exclude(): # GH 6992 df = pd.DataFrame( {'values': np.random.randn(8), 'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp( '2013-02-01'), np.nan, pd.Timestamp('2013-02-01'), np.nan, pd.Timestamp('2013-01-01')], 'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']}) grouped = df.groupby('dt') expected = [pd.Index([1, 7]), pd.Index([3, 5])] keys = sorted(grouped.groups.keys()) assert len(keys) == 2 for k, e in zip(keys, expected): # grouped.groups keys are np.datetime64 with system tz # not to be affected by tz, only compare values tm.assert_index_equal(grouped.groups[k], e) # confirm obj is not filtered tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df) assert grouped.ngroups == 2 expected = { Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64), Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64) } for k in grouped.indices: tm.assert_numpy_array_equal(grouped.indices[k], expected[k]) tm.assert_frame_equal( grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]]) tm.assert_frame_equal( grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]]) with pytest.raises(KeyError, match=r"^NaT$"): grouped.get_group(pd.NaT) nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan], 'nat': [pd.NaT, pd.NaT, pd.NaT]}) assert nan_df['nan'].dtype == 'float64' assert nan_df['nat'].dtype == 'datetime64[ns]' for key in ['nan', 'nat']: grouped = nan_df.groupby(key) assert grouped.groups == {} assert grouped.ngroups == 0 assert grouped.indices == {} with pytest.raises(KeyError, match=r"^nan$"): grouped.get_group(np.nan) with pytest.raises(KeyError, match=r"^NaT$"): grouped.get_group(pd.NaT) @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") def test_sparse_friendly(df): sdf = df[['C', 'D']].to_sparse() panel = tm.makePanel() tm.add_nans(panel) def _check_work(gp): gp.mean() gp.agg(np.mean) dict(iter(gp)) # it works! _check_work(sdf.groupby(lambda x: x // 2)) _check_work(sdf['C'].groupby(lambda x: x // 2)) _check_work(sdf.groupby(df['A'])) # do this someday # _check_work(panel.groupby(lambda x: x.month, axis=1)) @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") def test_panel_groupby(): panel = tm.makePanel() tm.add_nans(panel) grouped = panel.groupby({'ItemA': 0, 'ItemB': 0, 'ItemC': 1}, axis='items') agged = grouped.mean() agged2 = grouped.agg(lambda x: x.mean('items')) tm.assert_panel_equal(agged, agged2) tm.assert_index_equal(agged.items, Index([0, 1])) grouped = panel.groupby(lambda x: x.month, axis='major') agged = grouped.mean() exp = Index(sorted(list(set(panel.major_axis.month)))) tm.assert_index_equal(agged.major_axis, exp) grouped = panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, axis='minor') agged = grouped.mean() tm.assert_index_equal(agged.minor_axis, Index([0, 1])) def test_groupby_2d_malformed(): d = DataFrame(index=lrange(2)) d['group'] = ['g1', 'g2'] d['zeros'] = [0, 0] d['ones'] = [1, 1] d['label'] = ['l1', 'l2'] tmp = d.groupby(['group']).mean() res_values = np.array([[0, 1], [0, 1]], dtype=np.int64) tm.assert_index_equal(tmp.columns, Index(['zeros', 'ones'])) tm.assert_numpy_array_equal(tmp.values, res_values) def test_int32_overflow(): B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000) )) A = np.arange(25000) df = DataFrame({'A': A, 'B': B, 'C': A, 'D': B, 'E': np.random.randn(25000)}) left = df.groupby(['A', 'B', 'C', 'D']).sum() right = df.groupby(['D', 'C', 'B', 'A']).sum() assert len(left) == len(right) def test_groupby_sort_multi(): df = DataFrame({'a': ['foo', 'bar', 'baz'], 'b': [3, 2, 1], 'c': [0, 1, 2], 'd': np.random.randn(3)}) tups = lmap(tuple, df[['a', 'b', 'c']].values) tups = com.asarray_tuplesafe(tups) result = df.groupby(['a', 'b', 'c'], sort=True).sum() tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]]) tups = lmap(tuple, df[['c', 'a', 'b']].values) tups = com.asarray_tuplesafe(tups) result = df.groupby(['c', 'a', 'b'], sort=True).sum() tm.assert_numpy_array_equal(result.index.values, tups) tups = lmap(tuple, df[['b', 'c', 'a']].values) tups = com.asarray_tuplesafe(tups) result = df.groupby(['b', 'c', 'a'], sort=True).sum() tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]]) df = DataFrame({'a': [0, 1, 2, 0, 1, 2], 'b': [0, 0, 0, 1, 1, 1], 'd': np.random.randn(6)}) grouped = df.groupby(['a', 'b'])['d'] result = grouped.sum() def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = lmap(tuple, df[keys].values) tups = com.asarray_tuplesafe(tups) expected = f(df.groupby(tups)[field]) for k, v in compat.iteritems(expected): assert (result[k] == v) _check_groupby(df, result, ['a', 'b'], 'd') def test_dont_clobber_name_column(): df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], 'name': ['foo', 'bar', 'baz'] * 2}) result = df.groupby('key').apply(lambda x: x) assert_frame_equal(result, df) def test_skip_group_keys(): tsf = tm.makeTimeDataFrame() grouped = tsf.groupby(lambda x: x.month, group_keys=False) result = grouped.apply(lambda x: x.sort_values(by='A')[:3]) pieces = [group.sort_values(by='A')[:3] for key, group in grouped] expected = pd.concat(pieces) assert_frame_equal(result, expected) grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False) result = grouped.apply(lambda x: x.sort_values()[:3]) pieces = [group.sort_values()[:3] for key, group in grouped] expected = pd.concat(pieces) assert_series_equal(result, expected) def test_no_nonsense_name(frame): # GH #995 s = frame['C'].copy() s.name = None result = s.groupby(frame['A']).agg(np.sum) assert result.name is None def test_multifunc_sum_bug(): # GH #1065 x = DataFrame(np.arange(9).reshape(3, 3)) x['test'] = 0 x['fl'] = [1.3, 1.5, 1.6] grouped = x.groupby('test') result = grouped.agg({'fl': 'sum', 2: 'size'}) assert result['fl'].dtype == np.float64 def test_handle_dict_return_value(df): def f(group): return {'max': group.max(), 'min': group.min()} def g(group): return Series({'max': group.max(), 'min': group.min()}) result = df.groupby('A')['C'].apply(f) expected = df.groupby('A')['C'].apply(g) assert isinstance(result, Series) assert_series_equal(result, expected) @pytest.mark.parametrize('grouper', ['A', ['A', 'B']]) def test_set_group_name(df, grouper): def f(group): assert group.name is not None return group def freduce(group): assert group.name is not None return group.sum() def foo(x): return freduce(x) grouped = df.groupby(grouper) # make sure all these work grouped.apply(f) grouped.aggregate(freduce) grouped.aggregate({'C': freduce, 'D': freduce}) grouped.transform(f) grouped['C'].apply(f) grouped['C'].aggregate(freduce) grouped['C'].aggregate([freduce, foo]) grouped['C'].transform(f) def test_group_name_available_in_inference_pass(): # gh-15062 df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)}) names = [] def f(group): names.append(group.name) return group.copy() df.groupby('a', sort=False, group_keys=False).apply(f) # we expect 2 zeros because we call ``f`` once to see if a faster route # can be used. expected_names = [0, 0, 1, 2] assert names == expected_names def test_no_dummy_key_names(df): # see gh-1291 result = df.groupby(df['A'].values).sum() assert result.index.name is None result = df.groupby([df['A'].values, df['B'].values]).sum() assert result.index.names == (None, None) def test_groupby_sort_multiindex_series(): # series multiindex groupby sort argument was not being passed through # _compress_group_index # GH 9444 index = MultiIndex(levels=[[1, 2], [1, 2]], codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]], names=['a', 'b']) mseries = Series([0, 1, 2, 3, 4, 5], index=index) index = MultiIndex(levels=[[1, 2], [1, 2]], codes=[[0, 0, 1], [1, 0, 0]], names=['a', 'b']) mseries_result = Series([0, 2, 4], index=index) result = mseries.groupby(level=['a', 'b'], sort=False).first() assert_series_equal(result, mseries_result) result = mseries.groupby(level=['a', 'b'], sort=True).first() assert_series_equal(result, mseries_result.sort_index()) def test_groupby_reindex_inside_function(): periods = 1000 ind = date_range(start='2012/1/1', freq='5min', periods=periods) df = DataFrame({'high': np.arange( periods), 'low': np.arange(periods)}, index=ind) def agg_before(hour, func, fix=False): """ Run an aggregate func on the subset of data. """ def _func(data): d = data.loc[data.index.map( lambda x: x.hour < 11)].dropna() if fix: data[data.index[0]] if len(d) == 0: return None return func(d) return _func def afunc(data): d = data.select(lambda x: x.hour < 11).dropna() return np.max(d) grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) closure_bad = grouped.agg({'high': agg_before(11, np.max)}) closure_good = grouped.agg({'high': agg_before(11, np.max, True)}) assert_frame_equal(closure_bad, closure_good) def test_groupby_multiindex_missing_pair(): # GH9049 df = DataFrame({'group1': ['a', 'a', 'a', 'b'], 'group2': ['c', 'c', 'd', 'c'], 'value': [1, 1, 1, 5]}) df = df.set_index(['group1', 'group2']) df_grouped = df.groupby(level=['group1', 'group2'], sort=True) res = df_grouped.agg('sum') idx = MultiIndex.from_tuples( [('a', 'c'), ('a', 'd'), ('b', 'c')], names=['group1', 'group2']) exp = DataFrame([[2], [1], [5]], index=idx, columns=['value']) tm.assert_frame_equal(res, exp) def test_groupby_multiindex_not_lexsorted(): # GH 11640 # define the lexsorted version lexsorted_mi = MultiIndex.from_tuples( [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c']) lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) assert lexsorted_df.columns.is_lexsorted() # define the non-lexsorted version not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'], data=[[1, 'b1', 'c1', 3], [1, 'b2', 'c2', 4]]) not_lexsorted_df = not_lexsorted_df.pivot_table( index='a', columns=['b', 'c'], values='d') not_lexsorted_df = not_lexsorted_df.reset_index() assert not not_lexsorted_df.columns.is_lexsorted() # compare the results tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) expected = lexsorted_df.groupby('a').mean() with tm.assert_produces_warning(PerformanceWarning): result = not_lexsorted_df.groupby('a').mean() tm.assert_frame_equal(expected, result) # a transforming function should work regardless of sort # GH 14776 df = DataFrame({'x': ['a', 'a', 'b', 'a'], 'y': [1, 1, 2, 2], 'z': [1, 2, 3, 4]}).set_index(['x', 'y']) assert not df.index.is_lexsorted() for level in [0, 1, [0, 1]]: for sort in [False, True]: result = df.groupby(level=level, sort=sort).apply( DataFrame.drop_duplicates) expected = df tm.assert_frame_equal(expected, result) result = df.sort_index().groupby(level=level, sort=sort).apply( DataFrame.drop_duplicates) expected = df.sort_index() tm.assert_frame_equal(expected, result) def test_index_label_overlaps_location(): # checking we don't have any label/location confusion in the # the wake of GH5375 df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1]) g = df.groupby(list('ababb')) actual = g.filter(lambda x: len(x) > 2) expected = df.iloc[[1, 3, 4]] assert_frame_equal(actual, expected) ser = df[0] g = ser.groupby(list('ababb')) actual = g.filter(lambda x: len(x) > 2) expected = ser.take([1, 3, 4]) assert_series_equal(actual, expected) # ... and again, with a generic Index of floats df.index = df.index.astype(float) g = df.groupby(list('ababb')) actual = g.filter(lambda x: len(x) > 2) expected = df.iloc[[1, 3, 4]] assert_frame_equal(actual, expected) ser = df[0] g = ser.groupby(list('ababb')) actual = g.filter(lambda x: len(x) > 2) expected = ser.take([1, 3, 4]) assert_series_equal(actual, expected) def test_transform_doesnt_clobber_ints(): # GH 7972 n = 6 x = np.arange(n) df = DataFrame({'a': x // 2, 'b': 2.0 * x, 'c': 3.0 * x}) df2 = DataFrame({'a': x // 2 * 1.0, 'b': 2.0 * x, 'c': 3.0 * x}) gb = df.groupby('a') result = gb.transform('mean') gb2 = df2.groupby('a') expected = gb2.transform('mean') tm.assert_frame_equal(result, expected) @pytest.mark.parametrize('sort_column', ['ints', 'floats', 'strings', ['ints', 'floats'], ['ints', 'strings']]) @pytest.mark.parametrize('group_column', ['int_groups', 'string_groups', ['int_groups', 'string_groups']]) def test_groupby_preserves_sort(sort_column, group_column): # Test to ensure that groupby always preserves sort order of original # object. Issue #8588 and #9651 df = DataFrame( {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3], 'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'], 'ints': [8, 7, 4, 5, 2, 9, 1, 1], 'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5], 'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']}) # Try sorting on different types and with different group types df = df.sort_values(by=sort_column) g = df.groupby(group_column) def test_sort(x): assert_frame_equal(x, x.sort_values(by=sort_column)) g.apply(test_sort) def test_group_shift_with_null_key(): # This test is designed to replicate the segfault in issue #13813. n_rows = 1200 # Generate a moderately large dataframe with occasional missing # values in column `B`, and then group by [`A`, `B`]. This should # force `-1` in `labels` array of `g.grouper.group_info` exactly # at those places, where the group-by key is partially missing. df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], dtype=float, columns=["A", "B", "Z"], index=None) g = df.groupby(["A", "B"]) expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)], dtype=float, columns=["Z"], index=None) result = g.shift(-1) assert_frame_equal(result, expected) def test_group_shift_with_fill_value(): # GH #24128 n_rows = 24 df = DataFrame([(i % 12, i % 3, i) for i in range(n_rows)], dtype=float, columns=["A", "B", "Z"], index=None) g = df.groupby(["A", "B"]) expected = DataFrame([(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], dtype=float, columns=["Z"], index=None) result = g.shift(-1, fill_value=0)[["Z"]] assert_frame_equal(result, expected) def test_pivot_table_values_key_error(): # This test is designed to replicate the error in issue #14938 df = pd.DataFrame({'eventDate': pd.date_range(pd.datetime.today(), periods=20, freq='M').tolist(), 'thename': range(0, 20)}) df['year'] = df.set_index('eventDate').index.year df['month'] = df.set_index('eventDate').index.month with pytest.raises(KeyError, match="'badname'"): df.reset_index().pivot_table(index='year', columns='month', values='badname', aggfunc='count') def test_empty_dataframe_groupby(): # GH8093 df = DataFrame(columns=['A', 'B', 'C']) result = df.groupby('A').sum() expected = DataFrame(columns=['B', 'C'], dtype=np.float64) expected.index.name = 'A' assert_frame_equal(result, expected) def test_tuple_warns(): # https://github.com/pandas-dev/pandas/issues/18314 df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2], 'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]}) with tm.assert_produces_warning(FutureWarning) as w: df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean() assert "Interpreting tuple 'by' as a list" in str(w[0].message) with tm.assert_produces_warning(None): df.groupby(('a', 'b')).c.mean() def test_tuple_warns_unhashable(): # https://github.com/pandas-dev/pandas/issues/18314 business_dates = date_range(start='4/1/2014', end='6/30/2014', freq='B') df = DataFrame(1, index=business_dates, columns=['a', 'b']) with tm.assert_produces_warning(FutureWarning) as w: df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) assert "Interpreting tuple 'by' as a list" in str(w[0].message) def test_tuple_correct_keyerror(): # https://github.com/pandas-dev/pandas/issues/18798 df = pd.DataFrame(1, index=range(3), columns=pd.MultiIndex.from_product([[1, 2], [3, 4]])) with pytest.raises(KeyError, match=r"^\(7, 8\)$"): df.groupby((7, 8)).mean() def test_groupby_agg_ohlc_non_first(): # GH 21716 df = pd.DataFrame([[1], [1]], columns=['foo'], index=pd.date_range('2018-01-01', periods=2, freq='D')) expected = pd.DataFrame([ [1, 1, 1, 1, 1], [1, 1, 1, 1, 1] ], columns=pd.MultiIndex.from_tuples(( ('foo', 'ohlc', 'open'), ('foo', 'ohlc', 'high'), ('foo', 'ohlc', 'low'), ('foo', 'ohlc', 'close'), ('foo', 'sum', 'foo'))), index=pd.date_range( '2018-01-01', periods=2, freq='D')) result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc']) tm.assert_frame_equal(result, expected)