12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838 |
- # -*- coding: utf-8 -*-
- """ test where we are determining what we are grouping, or getting groups """
- import numpy as np
- import pytest
- from pandas.compat import long, lrange
- import pandas as pd
- from pandas import (
- CategoricalIndex, DataFrame, Index, MultiIndex, Series, Timestamp, compat,
- date_range)
- from pandas.core.groupby.grouper import Grouping
- import pandas.util.testing as tm
- from pandas.util.testing import (
- assert_almost_equal, assert_frame_equal, assert_panel_equal,
- assert_series_equal)
- # selection
- # --------------------------------
class TestSelection(object):
    """Tests for picking columns out of a groupby object."""

    def test_select_bad_cols(self):
        """Unknown column names in a list selection raise KeyError."""
        frame = DataFrame([[1, 2]], columns=['A', 'B'])
        grouped = frame.groupby('A')

        with pytest.raises(KeyError, match='"Columns not found: \'C\'"'):
            grouped[['C']]

        # 'A' is a valid column, so the message must not mention it.
        # The regex needs rethinking if the message text ever changes!
        with pytest.raises(KeyError, match='^[^A]+$'):
            grouped[['A', 'C']]

    def test_groupby_duplicated_column_errormsg(self):
        """Grouping by a duplicated column label raises ValueError."""
        # GH7511
        rows = [range(4), range(2, 6), range(0, 8, 2)]
        frame = DataFrame(data=rows, columns=['A', 'B', 'A', 'C'])
        expected_msg = "Grouper for 'A' not 1-dimensional"

        # 'A' appears twice, both on its own and as part of a key list
        for keys in ('A', ['A', 'B']):
            with pytest.raises(ValueError, match=expected_msg):
                frame.groupby(keys)

        # grouping by the unique label 'B' leaves three flat columns
        counts = frame.groupby('B').count()
        assert counts.columns.nlevels == 1
        assert counts.columns.size == 3

    def test_column_select_via_attr(self, df):
        """Attribute access on a groupby selects the same column as ``[]``."""
        by_attribute = df.groupby('A').C.sum()
        by_item = df.groupby('A')['C'].sum()
        assert_series_equal(by_attribute, by_item)

        # add a column named 'mean', then call the ``mean`` method
        df['mean'] = 1.5
        via_method = df.groupby('A').mean()
        via_agg = df.groupby('A').agg(np.mean)
        assert_frame_equal(via_method, via_agg)

    def test_getitem_list_of_columns(self):
        """A list, a bare tuple and an Index slice select the same columns."""
        frame = DataFrame(
            {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
             'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
             'C': np.random.randn(8),
             'D': np.random.randn(8),
             'E': np.random.randn(8)})
        expected = frame.loc[:, ['A', 'C', 'D']].groupby('A').mean()

        selections = [
            frame.groupby('A')[['C', 'D']],           # list
            frame.groupby('A')['C', 'D'],             # bare tuple
            frame.groupby('A')[frame.columns[2:4]],   # Index slice
        ]
        for selected in selections:
            assert_frame_equal(selected.mean(), expected)

    def test_getitem_numeric_column_names(self):
        """Integer column labels can be selected in the same three forms."""
        # GH #13731
        frame = DataFrame({0: list('abcd') * 2,
                           2: np.random.randn(8),
                           4: np.random.randn(8),
                           6: np.random.randn(8)})
        expected = frame.loc[:, [0, 2, 4]].groupby(0).mean()

        selections = [
            frame.groupby(0)[frame.columns[1:3]],   # Index slice
            frame.groupby(0)[2, 4],                 # bare tuple
            frame.groupby(0)[[2, 4]],               # list
        ]
        for selected in selections:
            assert_frame_equal(selected.mean(), expected)
- # grouping
- # --------------------------------
class TestGrouping(object):
    """Tests for how grouping keys, index levels and Groupers are resolved."""

    def test_grouper_index_types(self):
        """Grouping by a list of labels runs for several index types."""
        # related GH5375
        # groupby misbehaving when using a Floatlike index
        df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB'))
        for index in [tm.makeFloatIndex, tm.makeStringIndex,
                      tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex,
                      tm.makePeriodIndex]:

            df.index = index(len(df))
            df.groupby(list('abcde')).apply(lambda x: x)

            # repeat with the index values in reverse order
            df.index = list(reversed(df.index.tolist()))
            df.groupby(list('abcde')).apply(lambda x: x)

    def test_grouper_multilevel_freq(self):
        """Grouper with ``level`` and ``freq`` matches grouping by ``key``."""
        # GH 7885
        # with level and freq specified in a pd.Grouper
        from datetime import date, timedelta

        # Read the clock once: two separate date.today() calls could
        # straddle midnight and produce 16 dates, which would no longer
        # match the 15 * 15 = 225 random values generated below.
        today = date.today()
        d0 = today - timedelta(days=14)
        dates = date_range(d0, today)
        date_index = pd.MultiIndex.from_product(
            [dates, dates], names=['foo', 'bar'])
        df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index)

        # Check string level
        expected = df.reset_index().groupby([pd.Grouper(
            key='foo', freq='W'), pd.Grouper(key='bar', freq='W')]).sum()
        # reset index changes columns dtype to object
        expected.columns = pd.Index([0], dtype='int64')

        result = df.groupby([pd.Grouper(level='foo', freq='W'), pd.Grouper(
            level='bar', freq='W')]).sum()
        assert_frame_equal(result, expected)

        # Check integer level
        result = df.groupby([pd.Grouper(level=0, freq='W'), pd.Grouper(
            level=1, freq='W')]).sum()
        assert_frame_equal(result, expected)

    def test_grouper_creation_bug(self):
        """``pd.Grouper(key=...)`` / ``level=...`` match plain groupby keys."""
        # GH 8795
        df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]})
        g = df.groupby('A')
        expected = g.sum()

        g = df.groupby(pd.Grouper(key='A'))
        result = g.sum()
        assert_frame_equal(result, expected)

        result = g.apply(lambda x: x.sum())
        assert_frame_equal(result, expected)

        g = df.groupby(pd.Grouper(key='A', axis=0))
        result = g.sum()
        assert_frame_equal(result, expected)

        # GH14334
        # pd.Grouper(key=...) may be passed in a list
        df = DataFrame({'A': [0, 0, 0, 1, 1, 1],
                        'B': [1, 1, 2, 2, 3, 3],
                        'C': [1, 2, 3, 4, 5, 6]})
        # Group by single column
        expected = df.groupby('A').sum()
        g = df.groupby([pd.Grouper(key='A')])
        result = g.sum()
        assert_frame_equal(result, expected)

        # Group by two columns
        # using a combination of strings and Grouper objects
        expected = df.groupby(['A', 'B']).sum()

        # Group with two Grouper objects
        g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')])
        result = g.sum()
        assert_frame_equal(result, expected)

        # Group with a string and a Grouper object
        g = df.groupby(['A', pd.Grouper(key='B')])
        result = g.sum()
        assert_frame_equal(result, expected)

        # Group with a Grouper object and a string
        g = df.groupby([pd.Grouper(key='A'), 'B'])
        result = g.sum()
        assert_frame_equal(result, expected)

        # GH8866
        s = Series(np.arange(8, dtype='int64'),
                   index=pd.MultiIndex.from_product(
                       [list('ab'), range(2),
                        date_range('20130101', periods=2)],
                       names=['one', 'two', 'three']))
        result = s.groupby(pd.Grouper(level='three', freq='M')).sum()
        expected = Series([28], index=Index(
            [Timestamp('2013-01-31')], freq='M', name='three'))
        assert_series_equal(result, expected)

        # just specifying a level breaks
        result = s.groupby(pd.Grouper(level='one')).sum()
        expected = s.groupby(level='one').sum()
        assert_series_equal(result, expected)

    def test_grouper_column_and_index(self):
        """Mixing a column key with an index-level Grouper is supported."""
        # GH 14327

        # Grouping a multi-index frame by a column and an index level should
        # be equivalent to resetting the index and grouping by two columns
        idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3),
                                         ('b', 1), ('b', 2), ('b', 3)])
        idx.names = ['outer', 'inner']
        df_multi = pd.DataFrame({"A": np.arange(6),
                                 'B': ['one', 'one', 'two',
                                       'two', 'one', 'one']},
                                index=idx)
        result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean()
        expected = df_multi.reset_index().groupby(['B', 'inner']).mean()
        assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean()
        expected = df_multi.reset_index().groupby(['inner', 'B']).mean()
        assert_frame_equal(result, expected)

        # Grouping a single-index frame by a column and the index should
        # be equivalent to resetting the index and grouping by two columns
        df_single = df_multi.reset_index('outer')
        result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean()
        expected = df_single.reset_index().groupby(['B', 'inner']).mean()
        assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean()
        expected = df_single.reset_index().groupby(['inner', 'B']).mean()
        assert_frame_equal(result, expected)

    def test_groupby_levels_and_columns(self):
        """Grouping by level names equals grouping by the reset columns."""
        # GH9344, GH9049
        idx_names = ['x', 'y']
        idx = pd.MultiIndex.from_tuples(
            [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names)
        df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx)

        by_levels = df.groupby(level=idx_names).mean()
        # reset_index changes columns dtype to object
        by_columns = df.reset_index().groupby(idx_names).mean()

        tm.assert_frame_equal(by_levels, by_columns, check_column_type=False)

        # after casting the columns back to int64 the frames match exactly
        by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
        tm.assert_frame_equal(by_levels, by_columns)

    def test_groupby_categorical_index_and_columns(self, observed):
        """Grouping on a CategoricalIndex level sums the duplicated labels."""
        # GH18432
        columns = ['A', 'B', 'A', 'B']
        categories = ['B', 'A']
        data = np.ones((5, 4), int)
        cat_columns = CategoricalIndex(columns,
                                       categories=categories,
                                       ordered=True)
        df = DataFrame(data=data, columns=cat_columns)
        result = df.groupby(axis=1, level=0, observed=observed).sum()
        expected_data = 2 * np.ones((5, 2), int)

        if observed:
            # if we are not-observed we undergo a reindex
            # so need to adjust the output as our expected sets us up
            # to be non-observed
            expected_columns = CategoricalIndex(['A', 'B'],
                                                categories=categories,
                                                ordered=True)
        else:
            expected_columns = CategoricalIndex(categories,
                                                categories=categories,
                                                ordered=True)
        expected = DataFrame(data=expected_data, columns=expected_columns)
        assert_frame_equal(result, expected)

        # test transposed version
        df = DataFrame(data.T, index=cat_columns)
        result = df.groupby(axis=0, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data.T, index=expected_columns)
        assert_frame_equal(result, expected)

    def test_grouper_getting_correct_binner(self):
        """A plain level Grouper combined with a freq level Grouper."""
        # GH 10063
        # using a non-time-based grouper and a time-based grouper
        # and specifying levels
        df = DataFrame({'A': 1}, index=pd.MultiIndex.from_product(
            [list('ab'), date_range('20130101', periods=80)], names=['one',
                                                                     'two']))
        result = df.groupby([pd.Grouper(level='one'), pd.Grouper(
            level='two', freq='M')]).sum()
        expected = DataFrame({'A': [31, 28, 21, 31, 28, 21]},
                             index=MultiIndex.from_product(
                                 [list('ab'),
                                  date_range('20130101', freq='M', periods=3)],
                                 names=['one', 'two']))
        assert_frame_equal(result, expected)

    def test_grouper_iter(self, df):
        """Iterating the ``grouper`` attribute yields the group keys."""
        assert sorted(df.groupby('A').grouper) == ['bar', 'foo']

    def test_empty_groups(self, df):
        """Passing an empty list of keys raises ValueError."""
        # see gh-1048
        with pytest.raises(ValueError, match="No group keys passed!"):
            df.groupby([])

    def test_groupby_grouper(self, df):
        """An existing ``grouper`` object can be reused as the group key."""
        grouped = df.groupby('A')

        result = df.groupby(grouped.grouper).mean()
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

    def test_groupby_dict_mapping(self):
        """A dict mapping index labels to groups acts like a list of keys."""
        # GH #679
        # ``Series`` is the module-level import; the former function-local
        # re-import was redundant and has been dropped.
        s = Series({'T1': 5})
        result = s.groupby({'T1': 'T2'}).agg(sum)
        expected = s.groupby(['T2']).agg(sum)
        assert_series_equal(result, expected)

        s = Series([1., 2., 3., 4.], index=list('abcd'))
        mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1}

        result = s.groupby(mapping).mean()
        result2 = s.groupby(mapping).agg(np.mean)
        expected = s.groupby([0, 0, 1, 1]).mean()
        expected2 = s.groupby([0, 0, 1, 1]).mean()
        assert_series_equal(result, expected)
        assert_series_equal(result, result2)
        assert_series_equal(result, expected2)

    def test_groupby_grouper_f_sanity_checked(self):
        """A key function returning the wrong length raises AssertionError."""
        dates = date_range('01-Jan-2013', periods=12, freq='MS')
        ts = Series(np.random.randn(12), index=dates)

        # GH3035
        # index.map is used to apply grouper to the index
        # if it fails on the elements, map tries it on the entire index as
        # a sequence. That can yield invalid results that cause trouble
        # down the line.
        # the surprise comes from using key[0:6] rather than str(key)[0:6]
        # when the elements are Timestamp.
        # the result is Index[0:6], very confusing.

        msg = r"Grouper result violates len\(labels\) == len\(data\)"
        with pytest.raises(AssertionError, match=msg):
            ts.groupby(lambda key: key[0:6])

    def test_grouping_error_on_multidim_input(self, df):
        """``Grouping`` rejects a two-column DataFrame as its grouper."""
        msg = ("Grouper for '<class 'pandas.core.frame.DataFrame'>'"
               " not 1-dimensional")
        with pytest.raises(ValueError, match=msg):
            Grouping(df.index, df[['A', 'A']])

    def test_multiindex_passthru(self):
        """Grouping columns by every level returns the frame unchanged."""
        # GH 7997
        # regression from 0.14.1
        df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)])

        result = df.groupby(axis=1, level=[0, 1]).first()
        assert_frame_equal(result, df)

    def test_multiindex_negative_level(self, mframe):
        """Negative level numbers address levels from the end."""
        # GH 13901
        result = mframe.groupby(level=-1).sum()
        expected = mframe.groupby(level='second').sum()
        assert_frame_equal(result, expected)

        result = mframe.groupby(level=-2).sum()
        expected = mframe.groupby(level='first').sum()
        assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-2, -1]).sum()
        expected = mframe
        assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-1, 'first']).sum()
        expected = mframe.groupby(level=['second', 'first']).sum()
        assert_frame_equal(result, expected)

    def test_multifunc_select_col_integer_cols(self, df):
        """Dict aggregation on an integer-labelled column does not raise."""
        df.columns = np.arange(len(df.columns))

        # it works!
        df.groupby(1, as_index=False)[2].agg({'Q': np.mean})

    def test_multiindex_columns_empty_level(self):
        """A MultiIndex column with an empty second level is a valid key."""
        lst = [['count', 'values'], ['to filter', '']]
        midx = MultiIndex.from_tuples(lst)

        df = DataFrame([[long(1), 'A']], columns=midx)

        grouped = df.groupby('to filter').groups
        assert grouped['A'] == [0]

        grouped = df.groupby([('to filter', '')]).groups
        assert grouped['A'] == [0]

        df = DataFrame([[long(1), 'A'], [long(2), 'B']], columns=midx)

        expected = df.groupby('to filter').groups
        result = df.groupby([('to filter', '')]).groups
        assert result == expected

        df = DataFrame([[long(1), 'A'], [long(2), 'A']], columns=midx)

        expected = df.groupby('to filter').groups
        result = df.groupby([('to filter', '')]).groups
        tm.assert_dict_equal(result, expected)

    def test_groupby_multiindex_tuple(self):
        """A tuple key names a single MultiIndex column."""
        # GH 17979
        df = pd.DataFrame([[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
                          columns=pd.MultiIndex.from_arrays(
                              [['a', 'b', 'b', 'c'],
                               [1, 1, 2, 2]]))
        expected = df.groupby([('b', 1)]).groups
        result = df.groupby(('b', 1)).groups
        tm.assert_dict_equal(expected, result)

        df2 = pd.DataFrame(df.values,
                           columns=pd.MultiIndex.from_arrays(
                               [['a', 'b', 'b', 'c'],
                                ['d', 'd', 'e', 'e']]))
        expected = df2.groupby([('b', 'd')]).groups
        result = df.groupby(('b', 1)).groups
        tm.assert_dict_equal(expected, result)

        df3 = pd.DataFrame(df.values,
                           columns=[('a', 'd'), ('b', 'd'), ('b', 'e'), 'c'])
        expected = df3.groupby([('b', 'd')]).groups
        result = df.groupby(('b', 1)).groups
        tm.assert_dict_equal(expected, result)

    @pytest.mark.parametrize('sort', [True, False])
    def test_groupby_level(self, sort, mframe, df):
        """Grouping by level number or name equals grouping by its values."""
        # GH 17537
        frame = mframe
        deleveled = frame.reset_index()

        result0 = frame.groupby(level=0, sort=sort).sum()
        result1 = frame.groupby(level=1, sort=sort).sum()

        expected0 = frame.groupby(deleveled['first'].values, sort=sort).sum()
        expected1 = frame.groupby(deleveled['second'].values, sort=sort).sum()

        expected0.index.name = 'first'
        expected1.index.name = 'second'

        assert result0.index.name == 'first'
        assert result1.index.name == 'second'

        assert_frame_equal(result0, expected0)
        assert_frame_equal(result1, expected1)
        assert result0.index.name == frame.index.names[0]
        assert result1.index.name == frame.index.names[1]

        # groupby level name
        result0 = frame.groupby(level='first', sort=sort).sum()
        result1 = frame.groupby(level='second', sort=sort).sum()
        assert_frame_equal(result0, expected0)
        assert_frame_equal(result1, expected1)

        # axis=1
        result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum()
        result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum()
        assert_frame_equal(result0, expected0.T)
        assert_frame_equal(result1, expected1.T)

        # raise exception for non-MultiIndex
        msg = "level > 0 or level < -1 only valid with MultiIndex"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level=1)

    def test_groupby_level_index_names(self):
        """``level`` must name the index; an unknown name raises ValueError."""
        # GH4014 this used to raise ValueError since 'exp'>1 (in py2)
        df = DataFrame({'exp': ['A'] * 3 + ['B'] * 3,
                        'var1': lrange(6), }).set_index('exp')
        df.groupby(level='exp')
        msg = "level name foo is not the name of the index"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level='foo')

    @pytest.mark.parametrize('sort', [True, False])
    def test_groupby_level_with_nas(self, sort):
        """Summing by level, with and without a -1 code in that level."""
        # GH 17537
        index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
                           codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1,
                                                             2, 3]])

        # factorizing doesn't confuse things
        s = Series(np.arange(8.), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6., 22.], index=[0, 1])
        assert_series_equal(result, expected)

        # same data, but the fifth entry has code -1 in the first level
        index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
                           codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0,
                                                              1, 2, 3]])

        # factorizing doesn't confuse things
        s = Series(np.arange(8.), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6., 18.], index=[0.0, 1.0])
        assert_series_equal(result, expected)

    def test_groupby_args(self, mframe):
        """Calling ``groupby`` with neither ``by`` nor ``level`` is an error."""
        # PR8618 and issue 8015
        frame = mframe

        # both spellings are expected to produce the same message
        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby()

        with pytest.raises(TypeError, match=msg):
            frame.groupby(by=None, level=None)

    @pytest.mark.parametrize('sort,labels', [
        [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
        [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]]
    ])
    def test_level_preserve_order(self, sort, labels, mframe):
        """Grouper labels for level 0 match the parametrized ``labels``."""
        # GH 17537
        grouped = mframe.groupby(level=0, sort=sort)
        exp_labels = np.array(labels, np.intp)
        assert_almost_equal(grouped.grouper.labels[0], exp_labels)

    def test_grouping_labels(self, mframe):
        """Grouping by level-0 values yields the expected integer labels."""
        grouped = mframe.groupby(mframe.index.get_level_values(0))
        exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
        assert_almost_equal(grouped.grouper.labels[0], exp_labels)

    def test_list_grouper_with_nat(self):
        """A freq Grouper on a column whose last value is NaT."""
        # GH 14715
        df = pd.DataFrame({'date': pd.date_range('1/1/2011',
                                                 periods=365, freq='D')})
        df.iloc[-1] = pd.NaT
        grouper = pd.Grouper(key='date', freq='AS')

        # Grouper in a list grouping
        result = df.groupby([grouper])
        expected = {pd.Timestamp('2011-01-01'): pd.Index(list(range(364)))}
        tm.assert_dict_equal(result.groups, expected)

        # Test case without a list
        result = df.groupby(grouper)
        expected = {pd.Timestamp('2011-01-01'): 365}
        tm.assert_dict_equal(result.groups, expected)
- # get_group
- # --------------------------------
class TestGetGroup():
    """Tests for fetching a single group with ``get_group``."""

    @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
    def test_get_group(self):
        """``get_group`` on a Panel and with datelike key spellings."""
        panel = tm.makePanel()
        by_month = panel.groupby(lambda x: x.month, axis='major')
        january = by_month.get_group(1)
        january_labels = [x for x in panel.major_axis if x.month == 1]
        expected = panel.reindex(major=january_labels)
        assert_panel_equal(january, expected)

        # GH 5267
        # be datelike friendly
        stamps = pd.to_datetime(
            ['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', '11-Oct-2013',
             '11-Oct-2013', '11-Oct-2013'])
        frame = DataFrame({'DATE': stamps,
                           'label': ['foo', 'foo', 'bar', 'foo', 'foo', 'bar'],
                           'VAL': [1, 2, 3, 4, 5, 6]})

        # single key: the stored key, a datetime and a string all work
        by_date = frame.groupby('DATE')
        first_key = list(by_date.groups)[0]
        from_key = by_date.get_group(first_key)
        as_datetime = Timestamp(first_key).to_pydatetime()
        from_datetime = by_date.get_group(as_datetime)
        as_string = str(Timestamp(first_key))
        from_string = by_date.get_group(as_string)
        assert_frame_equal(from_key, from_datetime)
        assert_frame_equal(from_key, from_string)

        # two keys: the same spellings for the date part of the tuple
        by_date_label = frame.groupby(['DATE', 'label'])
        first_key = list(by_date_label.groups)[0]
        from_key = by_date_label.get_group(first_key)
        as_datetime = Timestamp(first_key[0]).to_pydatetime()
        from_datetime = by_date_label.get_group((as_datetime, first_key[1]))
        as_string = str(Timestamp(first_key[0]))
        from_string = by_date_label.get_group((as_string, first_key[1]))
        assert_frame_equal(from_key, from_datetime)
        assert_frame_equal(from_key, from_string)

        # must pass a same-length tuple with multiple keys
        not_a_tuple = ("must supply a tuple to get_group with multiple "
                       "grouping keys")
        with pytest.raises(ValueError, match=not_a_tuple):
            by_date_label.get_group('foo')
        with pytest.raises(ValueError, match=not_a_tuple):
            # parentheses alone do not make this a tuple
            by_date_label.get_group(('foo'))

        wrong_length = ("must supply a same-length tuple to get_group with "
                        "multiple grouping keys")
        with pytest.raises(ValueError, match=wrong_length):
            by_date_label.get_group(('foo', 'bar', 'baz'))

    def test_get_group_empty_bins(self, observed):
        """Fetching a populated bin works; an empty bin raises KeyError."""
        frame = pd.DataFrame([3, 1, 7, 6])
        edges = [0, 5, 10, 15]
        binned = frame.groupby(pd.cut(frame[0], edges), observed=observed)

        # TODO: should prob allow a str of Interval work as well
        # IOW '(0, 5]'
        low_bin = binned.get_group(pd.Interval(0, 5))
        assert_frame_equal(low_bin, DataFrame([3, 1], index=[0, 1]))

        # no value falls into (10, 15]
        msg = r"Interval\(10, 15, closed='right'\)"
        with pytest.raises(KeyError, match=msg):
            binned.get_group(pd.Interval(10, 15))

    def test_get_group_grouped_by_tuple(self):
        """Groups whose keys are themselves tuples can be fetched."""
        # GH 8121
        cells = [(1, ), (1, 2), (1, ), (1, 2)]
        frame = DataFrame([cells], index=['ids']).T
        by_ids = frame.groupby('ids')
        expected = DataFrame({'ids': [(1, ), (1, )]}, index=[0, 2])
        fetched = by_ids.get_group((1, ))
        assert_frame_equal(fetched, expected)

        # same idea with tuples wrapping timestamps, looked up by string
        stamps = pd.to_datetime(['2010-01-01', '2010-01-02', '2010-01-01',
                                 '2010-01-02'])
        frame = DataFrame({'ids': [(stamp, ) for stamp in stamps]})
        by_ids = frame.groupby('ids')
        fetched = by_ids.get_group(('2010-01-01', ))
        first = stamps[0]
        expected = DataFrame({'ids': [(first, ), (first, )]}, index=[0, 2])
        assert_frame_equal(fetched, expected)

    def test_groupby_with_empty(self):
        """Iterating a freq-grouped empty series yields nothing."""
        empty = pd.Series((), pd.DatetimeIndex(()))
        daily = empty.groupby(pd.Grouper(freq='D'))
        assert next(iter(daily), None) is None

    def test_groupby_with_single_column(self):
        """Grouping a one-column frame by that column."""
        frame = pd.DataFrame({'a': list('abssbab')})
        group_a = frame.groupby('a').get_group('a')
        tm.assert_frame_equal(group_a, frame.iloc[[0, 5]])

        # GH 13530
        # no columns remain once 'a' is used as the key
        empty = pd.DataFrame([], index=pd.Index(['a', 'b', 's'], name='a'))
        tm.assert_frame_equal(frame.groupby('a').count(), empty)
        tm.assert_frame_equal(frame.groupby('a').sum(), empty)
        tm.assert_frame_equal(frame.groupby('a').nth(1), empty)

    def test_gb_key_len_equal_axis_len(self):
        """Index and column keys resolve when key count equals axis length."""
        # GH16843
        # test ensures that index and column keys are recognized correctly
        # when number of keys equals axis length of groupby
        records = [['foo', 'bar', 'B', 1],
                   ['foo', 'bar', 'B', 2],
                   ['foo', 'baz', 'C', 3]]
        frame = pd.DataFrame(records,
                             columns=['first', 'second', 'third', 'one'])
        indexed = frame.set_index(['first', 'second'])
        sizes = indexed.groupby(['first', 'second', 'third']).size()
        assert sizes.loc[('foo', 'bar', 'B')] == 2
        assert sizes.loc[('foo', 'baz', 'C')] == 1
- # groups & iteration
- # --------------------------------
class TestIteration():
    """Tests for the ``groups`` mapping and for iterating over groups."""

    def test_groups(self, df):
        """``groups`` is cached and maps each key to matching row labels."""
        by_a = df.groupby(['A'])
        cached = by_a.groups
        assert cached is by_a.groups  # caching works

        for key, labels in compat.iteritems(by_a.groups):
            rows = df.loc[labels]
            assert (rows['A'] == key).all()

        by_a_b = df.groupby(['A', 'B'])
        cached = by_a_b.groups
        assert cached is by_a_b.groups  # caching works

        for key, labels in compat.iteritems(by_a_b.groups):
            assert (df.loc[labels]['A'] == key[0]).all()
            assert (df.loc[labels]['B'] == key[1]).all()

    def test_grouping_is_iterable(self, tsframe):
        """A single ``Grouping`` object can be iterated without error."""
        # this code path isn't used anywhere else
        # not sure it's useful
        by_weekday_year = tsframe.groupby(
            [lambda x: x.weekday(), lambda x: x.year])
        first_grouping = by_weekday_year.grouper.groupings[0]

        # test it works
        for _ in first_grouping:
            pass

    def test_multi_iter(self):
        """Iterating a two-key Series groupby yields (key tuple, chunk)."""
        values = Series(np.arange(6))
        outer = np.array(['a', 'a', 'a', 'b', 'b', 'b'])
        inner = np.array(['1', '2', '1', '2', '1', '2'])

        pieces = list(values.groupby([outer, inner]))
        expected = [('a', '1', values[[0, 2]]), ('a', '2', values[[1]]),
                    ('b', '1', values[[4]]), ('b', '2', values[[3, 5]])]

        for position, (keys, chunk) in enumerate(pieces):
            one, two = keys
            exp_one, exp_two, exp_chunk = expected[position]
            assert exp_one == one
            assert exp_two == two
            assert_series_equal(chunk, exp_chunk)

    def test_multi_iter_frame(self, three_group):
        """Iterating a two-key DataFrame groupby, including along axis 1."""
        outer = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
        inner = np.array(['1', '2', '1', '2', '1', '2'])
        row_labels = ['one', 'two', 'three', 'four', 'five', 'six']
        frame = DataFrame({'v1': np.random.randn(6),
                           'v2': np.random.randn(6),
                           'k1': outer, 'k2': inner},
                          index=row_labels)

        # things get sorted!
        pieces = list(frame.groupby(['k1', 'k2']))

        labels = frame.index
        expected = [('a', '1', frame.loc[labels[[4]]]),
                    ('a', '2', frame.loc[labels[[3, 5]]]),
                    ('b', '1', frame.loc[labels[[0, 2]]]),
                    ('b', '2', frame.loc[labels[[1]]])]
        for position, (keys, chunk) in enumerate(pieces):
            one, two = keys
            exp_one, exp_two, exp_chunk = expected[position]
            assert exp_one == one
            assert exp_two == two
            assert_frame_equal(chunk, exp_chunk)

        # don't iterate through groups with no data
        frame['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
        frame['k2'] = np.array(['1', '1', '1', '2', '2', '2'])
        collected = {}
        for key, chunk in frame.groupby(['k1', 'k2']):
            collected[key] = chunk
        assert len(collected) == 2

        # axis = 1
        three_levels = three_group.groupby(['A', 'B', 'C']).mean()
        by_columns = three_levels.T.groupby(axis=1, level=(1, 2))
        for _key, _chunk in by_columns:
            pass

    @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
    def test_multi_iter_panel(self):
        """Iterating a Panel grouped by month and weekday along axis 1."""
        panel = tm.makePanel()
        by_month_weekday = panel.groupby(
            [lambda x: x.month, lambda x: x.weekday()], axis=1)

        for (month, weekday), chunk in by_month_weekday:
            matching = [stamp
                        for stamp in panel.major_axis
                        if stamp.month == month
                        and stamp.weekday() == weekday]
            assert_panel_equal(chunk, panel.reindex(major=matching))

    def test_dictify(self, df):
        """Each flavour of groupby can be turned into a dict by iterating."""
        dict(iter(df.groupby('A')))
        dict(iter(df.groupby(['A', 'B'])))

        # Series grouped by other columns
        column_c = df['C']
        dict(iter(column_c.groupby(df['A'])))
        dict(iter(df['C'].groupby([df['A'], df['B']])))

        # a column selected from a DataFrame groupby
        dict(iter(df.groupby('A')['C']))
        dict(iter(df.groupby(['A', 'B'])['C']))

    def test_groupby_with_small_elem(self):
        """Monthly Grouper plus a column key on frames of two or three rows."""
        # GH 8542
        sep_2014 = (pd.Timestamp('2014-09-30'), 'start')
        oct_2013 = (pd.Timestamp('2013-10-31'), 'start')
        aug_2014 = (pd.Timestamp('2014-08-31'), 'start')

        # length=2
        frame = pd.DataFrame(
            {'event': ['start', 'start'],
             'change': [1234, 5678]},
            index=pd.DatetimeIndex(['2014-09-10', '2013-10-10']))
        grouped = frame.groupby([pd.Grouper(freq='M'), 'event'])
        assert len(grouped.groups) == 2
        assert grouped.ngroups == 2
        assert sep_2014 in grouped.groups
        assert oct_2013 in grouped.groups

        chunk = grouped.get_group(sep_2014)
        tm.assert_frame_equal(chunk, frame.iloc[[0], :])
        chunk = grouped.get_group(oct_2013)
        tm.assert_frame_equal(chunk, frame.iloc[[1], :])

        # three rows, two of them in the same month
        frame = pd.DataFrame(
            {'event': ['start', 'start', 'start'],
             'change': [1234, 5678, 9123]},
            index=pd.DatetimeIndex(['2014-09-10', '2013-10-10',
                                    '2014-09-15']))
        grouped = frame.groupby([pd.Grouper(freq='M'), 'event'])
        assert len(grouped.groups) == 2
        assert grouped.ngroups == 2
        assert sep_2014 in grouped.groups
        assert oct_2013 in grouped.groups

        chunk = grouped.get_group(sep_2014)
        tm.assert_frame_equal(chunk, frame.iloc[[0, 2], :])
        chunk = grouped.get_group(oct_2013)
        tm.assert_frame_equal(chunk, frame.iloc[[1], :])

        # length=3
        frame = pd.DataFrame(
            {'event': ['start', 'start', 'start'],
             'change': [1234, 5678, 9123]},
            index=pd.DatetimeIndex(['2014-09-10', '2013-10-10',
                                    '2014-08-05']))
        grouped = frame.groupby([pd.Grouper(freq='M'), 'event'])
        assert len(grouped.groups) == 3
        assert grouped.ngroups == 3
        assert sep_2014 in grouped.groups
        assert oct_2013 in grouped.groups
        assert aug_2014 in grouped.groups

        chunk = grouped.get_group(sep_2014)
        tm.assert_frame_equal(chunk, frame.iloc[[0], :])
        chunk = grouped.get_group(oct_2013)
        tm.assert_frame_equal(chunk, frame.iloc[[1], :])
        chunk = grouped.get_group(aug_2014)
        tm.assert_frame_equal(chunk, frame.iloc[[2], :])

    def test_grouping_string_repr(self):
        """The repr of a ``Grouping`` shows the tuple column name."""
        # GH 13394
        columns = MultiIndex.from_arrays([list("AAB"), list("aba")])
        frame = DataFrame([[1, 2, 3]], columns=columns)
        grouped = frame.groupby(frame[('A', 'a')])

        actual = grouped.grouper.groupings[0].__repr__()
        assert actual == "Grouping(('A', 'a'))"
|