12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746 |
# -*- coding: utf-8 -*-
from __future__ import print_function

from collections import defaultdict
from datetime import datetime
from decimal import Decimal

import numpy as np
import pytest

from pandas.compat import (
    OrderedDict, StringIO, lmap, lrange, lzip, map, range, zip)
from pandas.errors import PerformanceWarning

import pandas as pd
from pandas import (
    DataFrame, Index, MultiIndex, Panel, Series, Timestamp, compat, date_range,
    read_csv)
import pandas.core.common as com
import pandas.util.testing as tm
from pandas.util.testing import (
    assert_almost_equal, assert_frame_equal, assert_series_equal)
def test_repr():
    """Grouper repr shows key, level, axis and sort (GH18203)."""
    result = repr(pd.Grouper(key='A', level='B'))
    expected = "Grouper(key='A', level='B', axis=0, sort=False)"
    assert result == expected
@pytest.mark.parametrize('dtype', ['int64', 'int32', 'float64', 'float32'])
def test_basic(dtype):
    """Smoke-test core Series.groupby operations for several dtypes."""
    data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)

    index = np.arange(9)
    np.random.shuffle(index)
    data = data.reindex(index)

    grouped = data.groupby(lambda x: x // 3)

    # each of the three groups holds exactly three values
    for k, v in grouped:
        assert len(v) == 3

    agged = grouped.aggregate(np.mean)
    assert agged[1] == 1

    assert_series_equal(agged, grouped.agg(np.mean))  # shorthand
    assert_series_equal(agged, grouped.mean())
    assert_series_equal(grouped.agg(np.sum), grouped.sum())

    expected = grouped.apply(lambda x: x * x.sum())
    transformed = grouped.transform(lambda x: x * x.sum())
    assert transformed[7] == 12
    assert_series_equal(transformed, expected)

    value_grouped = data.groupby(data)
    assert_series_equal(value_grouped.aggregate(np.mean), agged,
                        check_index_type=False)

    # complex agg
    agged = grouped.aggregate([np.mean, np.std])

    # dict-of-renamings agg is deprecated and warns
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        agged = grouped.aggregate({'one': np.mean, 'two': np.std})

    group_constants = {0: 10, 1: 20, 2: 30}
    agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
    assert agged[1] == 21

    # corner cases
    msg = "Must produce aggregated value"
    # exception raised is type Exception
    with pytest.raises(Exception, match=msg):
        grouped.aggregate(lambda x: x * 2)
def test_groupby_nonobject_dtype(mframe, df_mixed_floats):
    """Grouping by a non-object key array matches grouping by its object cast."""
    key = mframe.index.codes[0]
    grouped = mframe.groupby(key)
    result = grouped.sum()

    expected = mframe.groupby(key.astype('O')).sum()
    assert_frame_equal(result, expected)

    # GH 3911, mixed frame non-conversion
    df = df_mixed_floats.copy()
    df['value'] = lrange(len(df))

    def max_value(group):
        return group.loc[group['value'].idxmax()]

    applied = df.groupby('A').apply(max_value)
    result = applied.get_dtype_counts().sort_values()
    expected = Series({'float64': 2,
                       'int64': 1,
                       'object': 2}).sort_values()
    assert_series_equal(result, expected)
def test_groupby_return_type():
    """apply() returns a reduced/consistent type (GH2893, GH3596, GH5592)."""
    # GH2893, return a reduced type
    df1 = DataFrame(
        [{"val1": 1, "val2": 20},
         {"val1": 1, "val2": 19},
         {"val1": 2, "val2": 27},
         {"val1": 2, "val2": 12}
         ])

    def func(dataf):
        return dataf["val2"] - dataf["val2"].mean()

    result = df1.groupby("val1", squeeze=True).apply(func)
    assert isinstance(result, Series)

    df2 = DataFrame(
        [{"val1": 1, "val2": 20},
         {"val1": 1, "val2": 19},
         {"val1": 1, "val2": 27},
         {"val1": 1, "val2": 12}
         ])

    def func(dataf):
        return dataf["val2"] - dataf["val2"].mean()

    result = df2.groupby("val1", squeeze=True).apply(func)
    assert isinstance(result, Series)

    # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
    df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y'])
    result = df.groupby('X', squeeze=False).count()
    assert isinstance(result, DataFrame)

    # GH5592
    # inconcistent return type
    df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb',
                           'Pony', 'Pony'], B=Series(
        np.arange(7), dtype='int64'), C=date_range(
        '20130101', periods=7)))

    def f(grp):
        return grp.iloc[0]

    expected = df.groupby('A').first()[['B']]
    result = df.groupby('A').apply(f)[['B']]
    assert_frame_equal(result, expected)

    def f(grp):
        if grp.name == 'Tiger':
            return None
        return grp.iloc[0]

    result = df.groupby('A').apply(f)[['B']]
    e = expected.copy()
    e.loc['Tiger'] = np.nan
    assert_frame_equal(result, e)

    def f(grp):
        if grp.name == 'Pony':
            return None
        return grp.iloc[0]

    result = df.groupby('A').apply(f)[['B']]
    e = expected.copy()
    e.loc['Pony'] = np.nan
    assert_frame_equal(result, e)

    # 5592 revisited, with datetimes
    def f(grp):
        if grp.name == 'Pony':
            return None
        return grp.iloc[0]

    result = df.groupby('A').apply(f)[['C']]
    e = df.groupby('A').first()[['C']]
    e.loc['Pony'] = pd.NaT
    assert_frame_equal(result, e)

    # scalar outputs
    def f(grp):
        if grp.name == 'Pony':
            return None
        return grp.iloc[0].loc['C']

    result = df.groupby('A').apply(f)
    e = df.groupby('A').first()['C'].copy()
    e.loc['Pony'] = np.nan
    e.name = None
    assert_series_equal(result, e)
def test_pass_args_kwargs(ts, tsframe):
    """Positional args and kwargs are forwarded by agg/apply/transform."""
    def f(x, q=None, axis=0):
        return np.percentile(x, q, axis=axis)

    g = lambda x: np.percentile(x, 80, axis=0)

    # Series
    ts_grouped = ts.groupby(lambda x: x.month)
    agg_result = ts_grouped.agg(np.percentile, 80, axis=0)
    apply_result = ts_grouped.apply(np.percentile, 80, axis=0)
    trans_result = ts_grouped.transform(np.percentile, 80, axis=0)

    agg_expected = ts_grouped.quantile(.8)
    trans_expected = ts_grouped.transform(g)

    assert_series_equal(apply_result, agg_expected)
    assert_series_equal(agg_result, agg_expected, check_names=False)
    assert_series_equal(trans_result, trans_expected)

    agg_result = ts_grouped.agg(f, q=80)
    apply_result = ts_grouped.apply(f, q=80)
    trans_result = ts_grouped.transform(f, q=80)
    assert_series_equal(agg_result, agg_expected)
    assert_series_equal(apply_result, agg_expected)
    assert_series_equal(trans_result, trans_expected)

    # DataFrame
    df_grouped = tsframe.groupby(lambda x: x.month)
    agg_result = df_grouped.agg(np.percentile, 80, axis=0)
    apply_result = df_grouped.apply(DataFrame.quantile, .8)
    expected = df_grouped.quantile(.8)
    assert_frame_equal(apply_result, expected)
    assert_frame_equal(agg_result, expected, check_names=False)

    agg_result = df_grouped.agg(f, q=80)
    apply_result = df_grouped.apply(DataFrame.quantile, q=.8)
    assert_frame_equal(agg_result, expected, check_names=False)
    assert_frame_equal(apply_result, expected)
def test_len():
    """len(groupby) equals the number of groups; NaN keys yield no groups."""
    df = tm.makeTimeDataFrame()
    grouped = df.groupby([lambda x: x.year, lambda x: x.month,
                          lambda x: x.day])
    assert len(grouped) == len(df)

    grouped = df.groupby([lambda x: x.year, lambda x: x.month])
    expected = len({(x.year, x.month) for x in df.index})
    assert len(grouped) == expected

    # issue 11016
    df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
    assert len(df.groupby(('a'))) == 0
    assert len(df.groupby(('b'))) == 3
    assert len(df.groupby(['a', 'b'])) == 3
def test_basic_regression():
    """Regression smoke test: mean over float-keyed groupings runs cleanly."""
    # regression
    T = [1.0 * x for x in lrange(1, 10) * 10][:1095]
    result = Series(T, lrange(0, len(T)))

    groupings = np.random.random((1100, ))
    groupings = Series(groupings, lrange(0, len(groupings))) * 10.

    grouped = result.groupby(groupings)
    grouped.mean()
@pytest.mark.parametrize('dtype', ['float64', 'float32', 'int64',
                                   'int32', 'int16', 'int8'])
def test_with_na_groups(dtype):
    """NaN labels are dropped from the grouping for numeric dtypes."""
    index = Index(np.arange(10))
    values = Series(np.ones(10), index, dtype=dtype)
    labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan,
                     'bar', 'bar', np.nan, 'foo'], index=index)

    # this SHOULD be an int
    grouped = values.groupby(labels)
    agged = grouped.agg(len)
    expected = Series([4, 2], index=['bar', 'foo'])

    assert_series_equal(agged, expected, check_dtype=False)

    # assert issubclass(agged.dtype.type, np.integer)

    # explicitly return a float from my function
    def f(x):
        return float(len(x))

    agged = grouped.agg(f)
    expected = Series([4, 2], index=['bar', 'foo'])

    assert_series_equal(agged, expected, check_dtype=False)
    assert issubclass(agged.dtype.type, np.dtype(dtype).type)
def test_indices_concatenation_order():
    """apply() rejects concatenating group results with mismatched index levels (GH 2808)."""
    # GH 2808

    def f1(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2,
                                    names=['b', 'c'])
            res = DataFrame(None, columns=['a'], index=multiindex)
            return res
        else:
            y = y.set_index(['b', 'c'])
            return y

    def f2(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            return DataFrame()
        else:
            y = y.set_index(['b', 'c'])
            return y

    def f3(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2,
                                    names=['foo', 'bar'])
            res = DataFrame(None, columns=['a', 'b'], index=multiindex)
            return res
        else:
            return y

    df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})

    df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})

    # correct result
    result1 = df.groupby('a').apply(f1)
    result2 = df2.groupby('a').apply(f1)
    assert_frame_equal(result1, result2)

    # should fail (not the same number of levels)
    msg = "Cannot concat indices that do not have the same number of levels"
    with pytest.raises(AssertionError, match=msg):
        df.groupby('a').apply(f2)
    with pytest.raises(AssertionError, match=msg):
        df2.groupby('a').apply(f2)

    # should fail (incorrect shape)
    with pytest.raises(AssertionError, match=msg):
        df.groupby('a').apply(f3)
    with pytest.raises(AssertionError, match=msg):
        df2.groupby('a').apply(f3)
def test_attr_wrapper(ts):
    """Attribute access on a GroupBy delegates to the grouped object."""
    grouped = ts.groupby(lambda x: x.weekday())

    result = grouped.std()
    expected = grouped.agg(lambda x: np.std(x, ddof=1))
    assert_series_equal(result, expected)

    # this is pretty cool
    result = grouped.describe()
    expected = {name: gp.describe() for name, gp in grouped}
    expected = DataFrame(expected).T
    assert_frame_equal(result, expected)

    # get attribute
    result = grouped.dtype
    expected = grouped.agg(lambda x: x.dtype)

    # make sure raises error
    msg = "'SeriesGroupBy' object has no attribute 'foo'"
    with pytest.raises(AttributeError, match=msg):
        getattr(grouped, 'foo')
def test_frame_groupby(tsframe):
    """DataFrame groupby: aggregate, transform, iteration and indices agree."""
    grouped = tsframe.groupby(lambda x: x.weekday())

    # aggregate
    aggregated = grouped.aggregate(np.mean)
    assert len(aggregated) == 5
    assert len(aggregated.columns) == 4

    # by string
    tscopy = tsframe.copy()
    tscopy['weekday'] = [x.weekday() for x in tscopy.index]
    stragged = tscopy.groupby('weekday').aggregate(np.mean)
    assert_frame_equal(stragged, aggregated, check_names=False)

    # transform
    grouped = tsframe.head(30).groupby(lambda x: x.weekday())
    transformed = grouped.transform(lambda x: x - x.mean())
    assert len(transformed) == 30
    assert len(transformed.columns) == 4

    # transform propagate
    transformed = grouped.transform(lambda x: x.mean())
    for name, group in grouped:
        mean = group.mean()
        for idx in group.index:
            tm.assert_series_equal(transformed.xs(idx), mean,
                                   check_names=False)

    # iterate
    for weekday, group in grouped:
        assert group.index[0].weekday() == weekday

    # groups / group_indices
    groups = grouped.groups
    indices = grouped.indices

    for k, v in compat.iteritems(groups):
        samething = tsframe.index.take(indices[k])
        assert (samething == v).all()
def test_frame_groupby_columns(tsframe):
    """Grouping along axis=1 aggregates and transforms column groups."""
    mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
    grouped = tsframe.groupby(mapping, axis=1)

    # aggregate
    aggregated = grouped.aggregate(np.mean)
    assert len(aggregated) == len(tsframe)
    assert len(aggregated.columns) == 2

    # transform
    tf = lambda x: x - x.mean()
    groupedT = tsframe.T.groupby(mapping, axis=0)
    assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf))

    # iterate
    for k, v in grouped:
        assert len(v.columns) == 2
def test_frame_set_name_single(df):
    """Grouping by a column names the result index after that column."""
    grouped = df.groupby('A')

    result = grouped.mean()
    assert result.index.name == 'A'

    result = df.groupby('A', as_index=False).mean()
    assert result.index.name != 'A'

    result = grouped.agg(np.mean)
    assert result.index.name == 'A'

    result = grouped.agg({'C': np.mean, 'D': np.std})
    assert result.index.name == 'A'

    result = grouped['C'].mean()
    assert result.index.name == 'A'
    result = grouped['C'].agg(np.mean)
    assert result.index.name == 'A'
    result = grouped['C'].agg([np.mean, np.std])
    assert result.index.name == 'A'

    # dict-of-renamings agg on a SeriesGroupBy is deprecated and warns
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        result = grouped['C'].agg({'foo': np.mean, 'bar': np.std})
    assert result.index.name == 'A'
def test_multi_func(df):
    """Grouping by multiple callables matches grouping by the column names."""
    col1 = df['A']
    col2 = df['B']

    grouped = df.groupby([col1.get, col2.get])
    agged = grouped.mean()
    expected = df.groupby(['A', 'B']).mean()

    # TODO groupby get drops names
    assert_frame_equal(agged.loc[:, ['C', 'D']],
                       expected.loc[:, ['C', 'D']],
                       check_names=False)

    # some "groups" with no data
    df = DataFrame({'v1': np.random.randn(6),
                    'v2': np.random.randn(6),
                    'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                    'k2': np.array(['1', '1', '1', '2', '2', '2'])},
                   index=['one', 'two', 'three', 'four', 'five', 'six'])
    # only verify that it works for now
    grouped = df.groupby(['k1', 'k2'])
    grouped.agg(np.sum)
def test_multi_key_multiple_functions(df):
    """agg with a list of functions yields one column per function."""
    grouped = df.groupby(['A', 'B'])['C']

    agged = grouped.agg([np.mean, np.std])
    expected = DataFrame({'mean': grouped.agg(np.mean),
                          'std': grouped.agg(np.std)})
    assert_frame_equal(agged, expected)
def test_frame_multi_key_function_list():
    """Multi-key frame agg with a function list builds a MultiIndex result."""
    data = DataFrame(
        {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
               'foo', 'foo', 'foo'],
         'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
               'two', 'two', 'one'],
         'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
               'dull', 'shiny', 'shiny', 'shiny'],
         'D': np.random.randn(11),
         'E': np.random.randn(11),
         'F': np.random.randn(11)})

    grouped = data.groupby(['A', 'B'])
    funcs = [np.mean, np.std]
    agged = grouped.agg(funcs)
    expected = pd.concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs),
                          grouped['F'].agg(funcs)],
                         keys=['D', 'E', 'F'], axis=1)
    assert (isinstance(agged.index, MultiIndex))
    assert (isinstance(expected.index, MultiIndex))
    assert_frame_equal(agged, expected)
@pytest.mark.parametrize('op', [lambda x: x.sum(), lambda x: x.mean()])
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
def test_groupby_multiple_columns(df, op):
    """Multi-key groupby matches nested single-key grouping per column."""
    data = df
    grouped = data.groupby(['A', 'B'])

    result1 = op(grouped)

    # rebuild the expectation group-by-group with nested single-key groupbys
    expected = defaultdict(dict)
    for n1, gp1 in data.groupby('A'):
        for n2, gp2 in gp1.groupby('B'):
            expected[n1][n2] = op(gp2.loc[:, ['C', 'D']])
    expected = {k: DataFrame(v)
                for k, v in compat.iteritems(expected)}
    expected = Panel.fromDict(expected).swapaxes(0, 1)
    expected.major_axis.name, expected.minor_axis.name = 'A', 'B'

    # a little bit crude
    for col in ['C', 'D']:
        result_col = op(grouped[col])
        exp = expected[col]
        pivoted = result1[col].unstack()
        pivoted2 = result_col.unstack()
        assert_frame_equal(pivoted.reindex_like(exp), exp)
        assert_frame_equal(pivoted2.reindex_like(exp), exp)

    # test single series works the same
    result = data['C'].groupby([data['A'], data['B']]).mean()
    expected = data.groupby(['A', 'B']).mean()['C']

    assert_series_equal(result, expected)
def test_groupby_as_index_agg(df):
    """agg with as_index=False matches the equivalent reductions (GH7115/8112/8582)."""
    grouped = df.groupby('A', as_index=False)

    # single-key

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    assert_frame_equal(result, expected)

    result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
    expected2 = grouped.mean()
    expected2['D'] = grouped.sum()['D']
    assert_frame_equal(result2, expected2)

    grouped = df.groupby('A', as_index=True)
    expected3 = grouped['C'].sum()
    expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})

    # dict-of-renamings agg on a SeriesGroupBy is deprecated and warns
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        result3 = grouped['C'].agg({'Q': np.sum})
    assert_frame_equal(result3, expected3)

    # multi-key

    grouped = df.groupby(['A', 'B'], as_index=False)

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    assert_frame_equal(result, expected)

    result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
    expected2 = grouped.mean()
    expected2['D'] = grouped.sum()['D']
    assert_frame_equal(result2, expected2)

    expected3 = grouped['C'].sum()
    expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
    result3 = grouped['C'].agg({'Q': np.sum})
    assert_frame_equal(result3, expected3)

    # GH7115 & GH8112 & GH8582
    df = DataFrame(np.random.randint(0, 100, (50, 3)),
                   columns=['jim', 'joe', 'jolie'])
    ts = Series(np.random.randint(5, 10, 50), name='jim')

    gr = df.groupby(ts)
    gr.nth(0)  # invokes set_selection_from_grouper internally
    assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))

    for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']:
        gr = df.groupby(ts, as_index=False)
        left = getattr(gr, attr)()

        gr = df.groupby(ts.values, as_index=True)
        right = getattr(gr, attr)().reset_index(drop=True)

        assert_frame_equal(left, right)
def test_as_index_series_return_frame(df):
    """Selecting one column with as_index=False still returns a DataFrame."""
    grouped = df.groupby('A', as_index=False)
    grouped2 = df.groupby(['A', 'B'], as_index=False)

    result = grouped['C'].agg(np.sum)
    expected = grouped.agg(np.sum).loc[:, ['A', 'C']]
    assert isinstance(result, DataFrame)
    assert_frame_equal(result, expected)

    result2 = grouped2['C'].agg(np.sum)
    expected2 = grouped2.agg(np.sum).loc[:, ['A', 'B', 'C']]
    assert isinstance(result2, DataFrame)
    assert_frame_equal(result2, expected2)

    result = grouped['C'].sum()
    expected = grouped.sum().loc[:, ['A', 'C']]
    assert isinstance(result, DataFrame)
    assert_frame_equal(result, expected)

    result2 = grouped2['C'].sum()
    expected2 = grouped2.sum().loc[:, ['A', 'B', 'C']]
    assert isinstance(result2, DataFrame)
    assert_frame_equal(result2, expected2)
def test_as_index_series_column_slice_raises(df):
    """Slicing a second column off an already-selected column raises (GH15072)."""
    # GH15072
    grouped = df.groupby('A', as_index=False)
    msg = r"Column\(s\) C already selected"

    with pytest.raises(IndexError, match=msg):
        grouped['C'].__getitem__('D')
def test_groupby_as_index_cython(df):
    """Cython-path mean with as_index=False matches manual index insertion."""
    data = df

    # single-key
    grouped = data.groupby('A', as_index=False)
    result = grouped.mean()
    expected = data.groupby(['A']).mean()
    expected.insert(0, 'A', expected.index)
    expected.index = np.arange(len(expected))
    assert_frame_equal(result, expected)

    # multi-key
    grouped = data.groupby(['A', 'B'], as_index=False)
    result = grouped.mean()
    expected = data.groupby(['A', 'B']).mean()
    arrays = lzip(*expected.index.values)
    expected.insert(0, 'A', arrays[0])
    expected.insert(1, 'B', arrays[1])
    expected.index = np.arange(len(expected))
    assert_frame_equal(result, expected)
def test_groupby_as_index_series_scalar(df):
    """Scalar agg on a selected column keeps the key columns (GH #421)."""
    grouped = df.groupby(['A', 'B'], as_index=False)

    # GH #421

    result = grouped['C'].agg(len)
    expected = grouped.agg(len).loc[:, ['A', 'B', 'C']]
    assert_frame_equal(result, expected)
def test_groupby_as_index_corner(df, ts):
    """as_index=False is rejected for Series and for axis=1."""
    msg = "as_index=False only valid with DataFrame"
    with pytest.raises(TypeError, match=msg):
        ts.groupby(lambda x: x.weekday(), as_index=False)

    msg = "as_index=False only valid for axis=0"
    with pytest.raises(ValueError, match=msg):
        df.groupby(lambda x: x.lower(), as_index=False, axis=1)
def test_groupby_multiple_key(df):
    """Grouping by multiple callables on either axis round-trips the data."""
    df = tm.makeTimeDataFrame()
    grouped = df.groupby([lambda x: x.year, lambda x: x.month,
                          lambda x: x.day])
    agged = grouped.sum()
    assert_almost_equal(df.values, agged.values)

    grouped = df.T.groupby([lambda x: x.year,
                            lambda x: x.month,
                            lambda x: x.day], axis=1)

    agged = grouped.agg(lambda x: x.sum())
    tm.assert_index_equal(agged.index, df.columns)
    assert_almost_equal(df.T.values, agged.values)

    agged = grouped.agg(lambda x: x.sum())
    assert_almost_equal(df.T.values, agged.values)
def test_groupby_multi_corner(df):
    """An all-NaN column survives multi-key mean without corrupting the result."""
    # test that having an all-NA column doesn't mess you up
    df = df.copy()
    df['bad'] = np.nan
    agged = df.groupby(['A', 'B']).mean()

    expected = df.groupby(['A', 'B']).mean()
    expected['bad'] = np.nan

    assert_frame_equal(agged, expected)
def test_omit_nuisance(df):
    """Non-numeric (nuisance) columns are dropped from numeric aggregations."""
    grouped = df.groupby('A')

    result = grouped.mean()
    expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean()
    assert_frame_equal(result, expected)

    agged = grouped.agg(np.mean)
    exp = grouped.mean()
    assert_frame_equal(agged, exp)

    df = df.loc[:, ['A', 'C', 'D']]
    df['E'] = datetime.now()
    grouped = df.groupby('A')
    result = grouped.agg(np.sum)
    expected = grouped.sum()
    assert_frame_equal(result, expected)

    # won't work with axis = 1
    grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1)
    msg = (r'\("unsupported operand type\(s\) for \+: '
           "'Timestamp' and 'float'\""
           r", u?'occurred at index 0'\)")
    with pytest.raises(TypeError, match=msg):
        grouped.agg(lambda x: x.sum(0, numeric_only=False))
def test_omit_nuisance_python_multiple(three_group):
    """Python-path multi-key agg also drops nuisance columns."""
    grouped = three_group.groupby(['A', 'B'])

    agged = grouped.agg(np.mean)
    exp = grouped.mean()
    assert_frame_equal(agged, exp)
def test_empty_groups_corner(mframe):
    """Aggregation handles empty groups and small level-based slices."""
    # handle empty groups
    df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                    'k2': np.array(['1', '1', '1', '2', '2', '2']),
                    'k3': ['foo', 'bar'] * 3,
                    'v1': np.random.randn(6),
                    'v2': np.random.randn(6)})

    grouped = df.groupby(['k1', 'k2'])
    result = grouped.agg(np.mean)
    expected = grouped.mean()
    assert_frame_equal(result, expected)

    grouped = mframe[3:5].groupby(level=0)
    agged = grouped.apply(lambda x: x.mean())
    agged_A = grouped['A'].apply(np.mean)
    assert_series_equal(agged['A'], agged_A)
    assert agged.index.name == 'first'
def test_nonsense_func():
    """A grouping callable that raises propagates a TypeError."""
    df = DataFrame([0])
    msg = r"unsupported operand type\(s\) for \+: '(int|long)' and 'str'"
    with pytest.raises(TypeError, match=msg):
        df.groupby(lambda x: x + 'foo')
def test_wrap_aggregated_output_multindex(mframe):
    """Aggregated output keeps MultiIndex columns; failing columns are dropped."""
    df = mframe.T
    df['baz', 'two'] = 'peekaboo'

    keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
    agged = df.groupby(keys).agg(np.mean)
    assert isinstance(agged.columns, MultiIndex)

    def aggfun(ser):
        if ser.name == ('foo', 'one'):
            raise TypeError
        else:
            return ser.sum()

    agged2 = df.groupby(keys).aggregate(aggfun)
    assert len(agged2.columns) + 1 == len(df.columns)
def test_groupby_level_apply(mframe):
    """Grouping by MultiIndex level names the result index after the level."""
    result = mframe.groupby(level=0).count()
    assert result.index.name == 'first'
    result = mframe.groupby(level=1).count()
    assert result.index.name == 'second'

    result = mframe['A'].groupby(level=0).count()
    assert result.index.name == 'first'
def test_groupby_level_mapper(mframe):
    """A dict mapper applied at a level matches mapping the level values manually."""
    deleveled = mframe.reset_index()

    mapper0 = {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1}
    mapper1 = {'one': 0, 'two': 0, 'three': 1}

    result0 = mframe.groupby(mapper0, level=0).sum()
    result1 = mframe.groupby(mapper1, level=1).sum()

    mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']])
    mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']])
    expected0 = mframe.groupby(mapped_level0).sum()
    expected1 = mframe.groupby(mapped_level1).sum()
    expected0.index.name, expected1.index.name = 'first', 'second'

    assert_frame_equal(result0, expected0)
    assert_frame_equal(result1, expected1)
def test_groupby_level_nonmulti():
    """level= on a flat index accepts 0/-1 (or singleton lists) only (GH 1313, GH 13901)."""
    # GH 1313, GH 13901
    s = Series([1, 2, 3, 10, 4, 5, 20, 6],
               Index([1, 2, 3, 1, 4, 5, 2, 6], name='foo'))
    expected = Series([11, 22, 3, 4, 5, 6],
                      Index(range(1, 7), name='foo'))

    result = s.groupby(level=0).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=[0]).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=-1).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=[-1]).sum()
    tm.assert_series_equal(result, expected)

    msg = "level > 0 or level < -1 only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=1)
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=-2)
    msg = "No group keys passed!"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[])
    msg = "multiple levels only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[0, 0])
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[0, 1])
    msg = "level > 0 or level < -1 only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[1])
def test_groupby_complex():
    """Complex-valued Series sum by level works (GH 12902)."""
    # GH 12902
    a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
    expected = Series((1 + 2j, 5 + 10j))

    result = a.groupby(level=0).sum()
    assert_series_equal(result, expected)

    result = a.sum(level=0)
    assert_series_equal(result, expected)
def test_mutate_groups():
    """apply() gives the same result whether the group is copied or mutated (GH3380)."""
    # GH3380

    df = DataFrame({
        'cat1': ['a'] * 8 + ['b'] * 6,
        'cat2': ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 +
                ['d'] * 2 + ['e'] * 2,
        'cat3': lmap(lambda x: 'g%s' % x, lrange(1, 15)),
        'val': np.random.randint(100, size=14),
    })

    def f_copy(x):
        x = x.copy()
        x['rank'] = x.val.rank(method='min')
        return x.groupby('cat2')['rank'].min()

    def f_no_copy(x):
        x['rank'] = x.val.rank(method='min')
        return x.groupby('cat2')['rank'].min()

    grpby_copy = df.groupby('cat1').apply(f_copy)
    grpby_no_copy = df.groupby('cat1').apply(f_no_copy)
    assert_series_equal(grpby_copy, grpby_no_copy)
def test_no_mutate_but_looks_like():
    """Apparent mutation (x[:]) and plain access yield identical results (GH 8467)."""
    # GH 8467
    # first show's mutation indicator
    # second does not, but should yield the same results
    df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'value': range(9)})

    result1 = df.groupby('key', group_keys=True).apply(lambda x: x[:].key)
    result2 = df.groupby('key', group_keys=True).apply(lambda x: x.key)
    assert_series_equal(result1, result2)
def test_groupby_series_indexed_differently():
    """Grouping one Series by another aligns on the index first."""
    s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7],
                index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g']))
    s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0],
                index=Index(['a', 'b', 'd', 'f', 'g', 'h']))

    grouped = s1.groupby(s2)
    agged = grouped.mean()
    exp = s1.groupby(s2.reindex(s1.index).get).mean()
    assert_series_equal(agged, exp)
def test_groupby_with_hier_columns():
    # Groupby over a frame with MultiIndex columns keeps column structure.
    tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux',
                         'qux'], ['one', 'two', 'one', 'two', 'one', 'two',
                                  'one', 'two']]))
    index = MultiIndex.from_tuples(tuples)
    columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), (
        'B', 'cat'), ('A', 'dog')])
    df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)

    # aggregating over the rows keeps the hierarchical columns as-is
    result = df.groupby(level=0).mean()
    tm.assert_index_equal(result.columns, columns)

    # grouping along the columns keeps the row index as-is
    result = df.groupby(level=0, axis=1).mean()
    tm.assert_index_equal(result.index, df.index)

    result = df.groupby(level=0).agg(np.mean)
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0).apply(lambda x: x.mean())
    tm.assert_index_equal(result.columns, columns)

    # column-wise agg collapses to the top column level
    result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
    tm.assert_index_equal(result.columns, Index(['A', 'B']))
    tm.assert_index_equal(result.index, df.index)

    # add a nuisance (non-numeric) column; mean() should drop it
    # (removed an unused `sorted_columns, _ = columns.sortlevel(0)` local)
    df['A', 'foo'] = 'bar'
    result = df.groupby(level=0).mean()
    tm.assert_index_equal(result.columns, df.columns[:-1])
def test_grouping_ndarray(df):
    # grouping by a raw ndarray of values should match grouping by the
    # column those values came from
    grouped = df.groupby(df['A'].values)

    result = grouped.sum()
    expected = df.groupby('A').sum()
    assert_frame_equal(result, expected,
                       check_names=False)  # Note: no names when grouping by value
def test_groupby_wrong_multi_labels():
    # agg(np.mean) and mean() must agree for a multi-key groupby
    data = """index,foo,bar,baz,spam,data
0,foo1,bar1,baz1,spam2,20
1,foo1,bar2,baz1,spam3,30
2,foo2,bar2,baz1,spam2,40
3,foo1,bar1,baz2,spam1,50
4,foo3,bar1,baz2,spam1,60"""

    data = read_csv(StringIO(data), index_col=0)

    grouped = data.groupby(['foo', 'bar', 'baz', 'spam'])

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    assert_frame_equal(result, expected)
def test_groupby_series_with_name(df):
    # grouping by a named Series uses that name for the result index,
    # and as a column when as_index=False
    result = df.groupby(df['A']).mean()
    result2 = df.groupby(df['A'], as_index=False).mean()
    assert result.index.name == 'A'
    assert 'A' in result2

    # same for a list of named Series -> MultiIndex level names
    result = df.groupby([df['A'], df['B']]).mean()
    result2 = df.groupby([df['A'], df['B']],
                         as_index=False).mean()
    assert result.index.names == ('A', 'B')
    assert 'A' in result2
    assert 'B' in result2
- def test_seriesgroupby_name_attr(df):
- # GH 6265
- result = df.groupby('A')['C']
- assert result.count().name == 'C'
- assert result.mean().name == 'C'
- testFunc = lambda x: np.sum(x) * 2
- assert result.agg(testFunc).name == 'C'
def test_consistency_name():
    # GH 12363: df.groupby('A').B and df.B.groupby(df.A) must agree
    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': np.random.randn(8) + 1.0,
                    'D': np.arange(8)})

    expected = df.groupby(['A']).B.count()
    result = df.B.groupby(df.A).count()
    assert_series_equal(result, expected)
def test_groupby_name_propagation(df):
    # GH 6124: a Series name consistent across groups becomes the result's
    # columns name; inconsistent names are not propagated
    def summarize(df, name=None):
        return Series({'count': 1, 'mean': 2, 'omissions': 3, }, name=name)

    def summarize_random_name(df):
        # Provide a different name for each Series. In this case, groupby
        # should not attempt to propagate the Series name since they are
        # inconsistent.
        return Series({
            'count': 1,
            'mean': 2,
            'omissions': 3,
        }, name=df.iloc[0]['A'])

    metrics = df.groupby('A').apply(summarize)
    assert metrics.columns.name is None
    metrics = df.groupby('A').apply(summarize, 'metrics')
    assert metrics.columns.name == 'metrics'
    metrics = df.groupby('A').apply(summarize_random_name)
    assert metrics.columns.name is None
def test_groupby_nonstring_columns():
    # grouping by a non-string column label behaves like grouping by the
    # corresponding column values
    frame = DataFrame([np.arange(10) for _ in range(10)])

    by_label = frame.groupby(0).mean()
    by_values = frame.groupby(frame[0]).mean()
    assert_frame_equal(by_label, by_values)
def test_groupby_mixed_type_columns():
    # GH 13432, unorderable types in py3
    df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0])
    expected = DataFrame([[1, 2]], columns=['B', 0],
                         index=Index([0], name='A'))

    result = df.groupby('A').first()
    tm.assert_frame_equal(result, expected)

    result = df.groupby('A').sum()
    tm.assert_frame_equal(result, expected)
# TODO: Ensure warning isn't emitted in the first place
@pytest.mark.filterwarnings("ignore:Mean of:RuntimeWarning")
def test_cython_grouper_series_bug_noncontig():
    # grouping a non-contiguous (strided column) Series must not crash;
    # the all-NaN data aggregates to all-NaN
    arr = np.empty((100, 100))
    arr.fill(np.nan)
    obj = Series(arr[:, 0], index=lrange(100))
    inds = np.tile(lrange(10), 10)

    result = obj.groupby(inds).agg(Series.median)
    assert result.isna().all()
def test_series_grouper_noncontig_index():
    # agg over a Series built from a strided slice of an Index must be safe
    index = Index(tm.rands_array(10, 100))

    values = Series(np.random.randn(50), index=index[::2])
    labels = np.random.randint(0, 5, 50)

    # it works!
    grouped = values.groupby(labels)

    # accessing the index elements causes segfault
    f = lambda x: len(set(map(id, x.index)))
    grouped.agg(f)
def test_convert_objects_leave_decimal_alone():
    # agg results holding Decimal values must stay object dtype,
    # not get converted to float
    s = Series(lrange(5))
    labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O')

    def convert_fast(x):
        return Decimal(str(x.mean()))

    def convert_force_pure(x):
        # base will be length 0
        assert (len(x.values.base) > 0)
        return Decimal(str(x.mean()))

    grouped = s.groupby(labels)

    result = grouped.agg(convert_fast)
    assert result.dtype == np.object_
    assert isinstance(result[0], Decimal)

    result = grouped.agg(convert_force_pure)
    assert result.dtype == np.object_
    assert isinstance(result[0], Decimal)
def test_groupby_dtype_inference_empty():
    # GH 6733: groupby on an empty frame must not upcast column dtypes
    df = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')})
    assert df['x'].dtype == np.float64

    result = df.groupby('x').first()
    exp_index = Index([], name='x', dtype=np.float64)
    expected = DataFrame({'range': Series(
        [], index=exp_index, dtype='int64')})
    assert_frame_equal(result, expected, by_blocks=True)
def test_groupby_list_infer_array_like(df):
    # grouping by a plain list of values matches grouping by the column
    result = df.groupby(list(df['A'])).mean()
    expected = df.groupby(df['A']).mean()
    assert_frame_equal(result, expected, check_names=False)

    # a list whose length does not match the axis is treated as key lookup
    with pytest.raises(KeyError, match=r"^'foo'$"):
        df.groupby(list(df['A'][:-1]))

    # pathological case of ambiguity: 'foo'/'bar' are column labels, so the
    # list form must resolve to the labels, not be taken as raw values
    df = DataFrame({'foo': [0, 1],
                    'bar': [3, 4],
                    'val': np.random.randn(2)})

    result = df.groupby(['foo', 'bar']).mean()
    expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
    # previously computed but never compared; assert the two agree
    assert_frame_equal(result, expected)
def test_groupby_keys_same_size_as_index():
    # GH 11185: a Grouper plus a column key, together the same length as
    # the index, must group per-row
    freq = 's'
    index = pd.date_range(start=pd.Timestamp('2015-09-29T11:34:44-0700'),
                          periods=2, freq=freq)
    df = pd.DataFrame([['A', 10], ['B', 15]], columns=[
        'metric', 'values'
    ], index=index)
    result = df.groupby([pd.Grouper(level=0, freq=freq), 'metric']).mean()
    expected = df.set_index([df.index, 'metric'])

    assert_frame_equal(result, expected)
def test_groupby_one_row():
    # GH 11741: a missing key raises KeyError regardless of frame length
    msg = r"^'Z'$"
    df1 = pd.DataFrame(np.random.randn(1, 4), columns=list('ABCD'))
    with pytest.raises(KeyError, match=msg):
        df1.groupby('Z')
    df2 = pd.DataFrame(np.random.randn(2, 4), columns=list('ABCD'))
    with pytest.raises(KeyError, match=msg):
        df2.groupby('Z')
def test_groupby_nat_exclude():
    # GH 6992: rows whose group key is NaT/NaN are excluded from the groups
    df = pd.DataFrame(
        {'values': np.random.randn(8),
         'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp(
             '2013-02-01'), np.nan, pd.Timestamp('2013-02-01'), np.nan,
             pd.Timestamp('2013-01-01')],
         'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']})
    grouped = df.groupby('dt')

    expected = [pd.Index([1, 7]), pd.Index([3, 5])]
    keys = sorted(grouped.groups.keys())
    assert len(keys) == 2
    for k, e in zip(keys, expected):
        # grouped.groups keys are np.datetime64 with system tz
        # not to be affected by tz, only compare values
        tm.assert_index_equal(grouped.groups[k], e)

    # confirm obj is not filtered
    tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
    assert grouped.ngroups == 2

    expected = {
        Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64),
        Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64)
    }

    for k in grouped.indices:
        tm.assert_numpy_array_equal(grouped.indices[k], expected[k])

    tm.assert_frame_equal(
        grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]])
    tm.assert_frame_equal(
        grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]])

    # NaT itself is never a retrievable group
    with pytest.raises(KeyError, match=r"^NaT$"):
        grouped.get_group(pd.NaT)

    # an all-null key column produces no groups at all
    nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan],
                        'nat': [pd.NaT, pd.NaT, pd.NaT]})
    assert nan_df['nan'].dtype == 'float64'
    assert nan_df['nat'].dtype == 'datetime64[ns]'

    for key in ['nan', 'nat']:
        grouped = nan_df.groupby(key)
        assert grouped.groups == {}
        assert grouped.ngroups == 0
        assert grouped.indices == {}
        with pytest.raises(KeyError, match=r"^nan$"):
            grouped.get_group(np.nan)
        with pytest.raises(KeyError, match=r"^NaT$"):
            grouped.get_group(pd.NaT)
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
def test_sparse_friendly(df):
    # smoke test: groupby operations should run on a SparseDataFrame
    sdf = df[['C', 'D']].to_sparse()
    panel = tm.makePanel()
    tm.add_nans(panel)

    def _check_work(gp):
        # exercise the main groupby entry points without asserting values
        gp.mean()
        gp.agg(np.mean)
        dict(iter(gp))

    # it works!
    _check_work(sdf.groupby(lambda x: x // 2))
    _check_work(sdf['C'].groupby(lambda x: x // 2))
    _check_work(sdf.groupby(df['A']))

    # do this someday
    # _check_work(panel.groupby(lambda x: x.month, axis=1))
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
def test_panel_groupby():
    # group a (deprecated) Panel along each of its three axes
    panel = tm.makePanel()
    tm.add_nans(panel)
    grouped = panel.groupby({'ItemA': 0, 'ItemB': 0, 'ItemC': 1},
                            axis='items')
    agged = grouped.mean()
    agged2 = grouped.agg(lambda x: x.mean('items'))

    tm.assert_panel_equal(agged, agged2)

    tm.assert_index_equal(agged.items, Index([0, 1]))

    grouped = panel.groupby(lambda x: x.month, axis='major')
    agged = grouped.mean()

    exp = Index(sorted(list(set(panel.major_axis.month))))
    tm.assert_index_equal(agged.major_axis, exp)

    grouped = panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
                            axis='minor')
    agged = grouped.mean()
    tm.assert_index_equal(agged.minor_axis, Index([0, 1]))
def test_groupby_2d_malformed():
    # mean() drops the non-numeric 'label' column instead of failing
    d = DataFrame(index=lrange(2))
    d['group'] = ['g1', 'g2']
    d['zeros'] = [0, 0]
    d['ones'] = [1, 1]
    d['label'] = ['l1', 'l2']
    tmp = d.groupby(['group']).mean()
    res_values = np.array([[0, 1], [0, 1]], dtype=np.int64)
    tm.assert_index_equal(tmp.columns, Index(['zeros', 'ones']))
    tm.assert_numpy_array_equal(tmp.values, res_values)
def test_int32_overflow():
    # group-index computation with many multi-key combinations must not
    # overflow 32-bit ints (result lengths must agree in both key orders)
    B = np.concatenate((np.arange(10000), np.arange(10000),
                        np.arange(5000)))
    A = np.arange(25000)
    df = DataFrame({'A': A,
                    'B': B,
                    'C': A,
                    'D': B,
                    'E': np.random.randn(25000)})

    left = df.groupby(['A', 'B', 'C', 'D']).sum()
    right = df.groupby(['D', 'C', 'B', 'A']).sum()

    assert len(left) == len(right)
def test_groupby_sort_multi():
    # sort=True orders the result by the group keys, whatever order the
    # key columns are given in
    df = DataFrame({'a': ['foo', 'bar', 'baz'],
                    'b': [3, 2, 1],
                    'c': [0, 1, 2],
                    'd': np.random.randn(3)})

    tups = lmap(tuple, df[['a', 'b', 'c']].values)
    tups = com.asarray_tuplesafe(tups)
    result = df.groupby(['a', 'b', 'c'], sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]])

    tups = lmap(tuple, df[['c', 'a', 'b']].values)
    tups = com.asarray_tuplesafe(tups)
    result = df.groupby(['c', 'a', 'b'], sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups)

    tups = lmap(tuple, df[['b', 'c', 'a']].values)
    tups = com.asarray_tuplesafe(tups)
    result = df.groupby(['b', 'c', 'a'], sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]])

    df = DataFrame({'a': [0, 1, 2, 0, 1, 2],
                    'b': [0, 0, 0, 1, 1, 1],
                    'd': np.random.randn(6)})
    grouped = df.groupby(['a', 'b'])['d']
    result = grouped.sum()

    def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
        # compare against grouping by an explicit array of key-tuples
        tups = lmap(tuple, df[keys].values)
        tups = com.asarray_tuplesafe(tups)
        expected = f(df.groupby(tups)[field])
        for k, v in compat.iteritems(expected):
            assert (result[k] == v)

    _check_groupby(df, result, ['a', 'b'], 'd')
- def test_dont_clobber_name_column():
- df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'],
- 'name': ['foo', 'bar', 'baz'] * 2})
- result = df.groupby('key').apply(lambda x: x)
- assert_frame_equal(result, df)
def test_skip_group_keys():
    # group_keys=False: apply's result keeps the original index and does
    # not get the group keys prepended
    tsf = tm.makeTimeDataFrame()

    grouped = tsf.groupby(lambda x: x.month, group_keys=False)
    result = grouped.apply(lambda x: x.sort_values(by='A')[:3])

    pieces = [group.sort_values(by='A')[:3] for key, group in grouped]

    expected = pd.concat(pieces)
    assert_frame_equal(result, expected)

    grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False)
    result = grouped.apply(lambda x: x.sort_values()[:3])

    pieces = [group.sort_values()[:3] for key, group in grouped]

    expected = pd.concat(pieces)
    assert_series_equal(result, expected)
def test_no_nonsense_name(frame):
    # GH #995: a nameless Series must stay nameless after groupby-agg
    s = frame['C'].copy()
    s.name = None

    result = s.groupby(frame['A']).agg(np.sum)
    assert result.name is None
def test_multifunc_sum_bug():
    # GH #1065: a mixed agg spec must preserve the float column's dtype
    x = DataFrame(np.arange(9).reshape(3, 3))
    x['test'] = 0
    x['fl'] = [1.3, 1.5, 1.6]

    grouped = x.groupby('test')
    result = grouped.agg({'fl': 'sum', 2: 'size'})
    assert result['fl'].dtype == np.float64
def test_handle_dict_return_value(df):
    # an apply func returning a dict behaves like one returning a Series
    def f(group):
        return {'max': group.max(), 'min': group.min()}

    def g(group):
        return Series({'max': group.max(), 'min': group.min()})

    result = df.groupby('A')['C'].apply(f)
    expected = df.groupby('A')['C'].apply(g)
    assert isinstance(result, Series)
    assert_series_equal(result, expected)
@pytest.mark.parametrize('grouper', ['A', ['A', 'B']])
def test_set_group_name(df, grouper):
    # every groupby entry point should set group.name on the pieces
    def f(group):
        assert group.name is not None
        return group

    def freduce(group):
        assert group.name is not None
        return group.sum()

    def foo(x):
        return freduce(x)

    grouped = df.groupby(grouper)

    # make sure all these work
    grouped.apply(f)
    grouped.aggregate(freduce)
    grouped.aggregate({'C': freduce, 'D': freduce})
    grouped.transform(f)

    grouped['C'].apply(f)
    grouped['C'].aggregate(freduce)
    grouped['C'].aggregate([freduce, foo])
    grouped['C'].transform(f)
def test_group_name_available_in_inference_pass():
    # gh-15062: group.name must be set even during the fast-path probe call
    df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})

    names = []

    def f(group):
        names.append(group.name)
        return group.copy()

    df.groupby('a', sort=False, group_keys=False).apply(f)
    # we expect 2 zeros because we call ``f`` once to see if a faster route
    # can be used.
    expected_names = [0, 0, 1, 2]
    assert names == expected_names
def test_no_dummy_key_names(df):
    # see gh-1291: grouping by raw arrays yields an unnamed result index
    result = df.groupby(df['A'].values).sum()
    assert result.index.name is None

    result = df.groupby([df['A'].values, df['B'].values]).sum()
    assert result.index.names == (None, None)
def test_groupby_sort_multiindex_series():
    # series multiindex groupby sort argument was not being passed through
    # _compress_group_index
    # GH 9444
    index = MultiIndex(levels=[[1, 2], [1, 2]],
                       codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
                       names=['a', 'b'])
    mseries = Series([0, 1, 2, 3, 4, 5], index=index)
    index = MultiIndex(levels=[[1, 2], [1, 2]],
                       codes=[[0, 0, 1], [1, 0, 0]], names=['a', 'b'])
    mseries_result = Series([0, 2, 4], index=index)

    # sort=False keeps first-seen key order; sort=True orders by key
    result = mseries.groupby(level=['a', 'b'], sort=False).first()
    assert_series_equal(result, mseries_result)

    result = mseries.groupby(level=['a', 'b'], sort=True).first()
    assert_series_equal(result, mseries_result.sort_index())
def test_groupby_reindex_inside_function():
    # An agg func that indexes into the group (the `fix` branch) must give
    # the same answer as one that does not; removed the unused `afunc`
    # helper, which was never called and relied on the deprecated
    # DataFrame.select API.
    periods = 1000
    ind = date_range(start='2012/1/1', freq='5min', periods=periods)
    df = DataFrame({'high': np.arange(
        periods), 'low': np.arange(periods)}, index=ind)

    def agg_before(hour, func, fix=False):
        """
        Run an aggregate func on the subset of data.
        """

        def _func(data):
            d = data.loc[data.index.map(
                lambda x: x.hour < 11)].dropna()
            if fix:
                # touch the group before aggregating (closure behavior)
                data[data.index[0]]
            if len(d) == 0:
                return None
            return func(d)

        return _func

    grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
    closure_bad = grouped.agg({'high': agg_before(11, np.max)})
    closure_good = grouped.agg({'high': agg_before(11, np.max, True)})

    assert_frame_equal(closure_bad, closure_good)
def test_groupby_multiindex_missing_pair():
    # GH9049: missing level combinations must not break the aggregation
    df = DataFrame({'group1': ['a', 'a', 'a', 'b'],
                    'group2': ['c', 'c', 'd', 'c'],
                    'value': [1, 1, 1, 5]})
    df = df.set_index(['group1', 'group2'])
    df_grouped = df.groupby(level=['group1', 'group2'], sort=True)

    res = df_grouped.agg('sum')
    idx = MultiIndex.from_tuples(
        [('a', 'c'), ('a', 'd'), ('b', 'c')], names=['group1', 'group2'])
    exp = DataFrame([[2], [1], [5]], index=idx, columns=['value'])

    tm.assert_frame_equal(res, exp)
def test_groupby_multiindex_not_lexsorted():
    # GH 11640: groupby must give the same answer on a non-lexsorted
    # MultiIndex (with a PerformanceWarning) as on the lexsorted one

    # define the lexsorted version
    lexsorted_mi = MultiIndex.from_tuples(
        [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
    lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
    assert lexsorted_df.columns.is_lexsorted()

    # define the non-lexsorted version
    not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
                                 data=[[1, 'b1', 'c1', 3],
                                       [1, 'b2', 'c2', 4]])
    not_lexsorted_df = not_lexsorted_df.pivot_table(
        index='a', columns=['b', 'c'], values='d')
    not_lexsorted_df = not_lexsorted_df.reset_index()
    assert not not_lexsorted_df.columns.is_lexsorted()

    # compare the results
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    expected = lexsorted_df.groupby('a').mean()
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.groupby('a').mean()
    tm.assert_frame_equal(expected, result)

    # a transforming function should work regardless of sort
    # GH 14776
    df = DataFrame({'x': ['a', 'a', 'b', 'a'],
                    'y': [1, 1, 2, 2],
                    'z': [1, 2, 3, 4]}).set_index(['x', 'y'])
    assert not df.index.is_lexsorted()

    for level in [0, 1, [0, 1]]:
        for sort in [False, True]:
            result = df.groupby(level=level, sort=sort).apply(
                DataFrame.drop_duplicates)
            expected = df
            tm.assert_frame_equal(expected, result)

            result = df.sort_index().groupby(level=level, sort=sort).apply(
                DataFrame.drop_duplicates)
            expected = df.sort_index()
            tm.assert_frame_equal(expected, result)
def test_index_label_overlaps_location():
    # checking we don't have any label/location confusion in the
    # the wake of GH5375
    df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1])
    g = df.groupby(list('ababb'))
    actual = g.filter(lambda x: len(x) > 2)
    expected = df.iloc[[1, 3, 4]]
    assert_frame_equal(actual, expected)

    ser = df[0]
    g = ser.groupby(list('ababb'))
    actual = g.filter(lambda x: len(x) > 2)
    expected = ser.take([1, 3, 4])
    assert_series_equal(actual, expected)

    # ... and again, with a generic Index of floats
    df.index = df.index.astype(float)
    g = df.groupby(list('ababb'))
    actual = g.filter(lambda x: len(x) > 2)
    expected = df.iloc[[1, 3, 4]]
    assert_frame_equal(actual, expected)

    ser = df[0]
    g = ser.groupby(list('ababb'))
    actual = g.filter(lambda x: len(x) > 2)
    expected = ser.take([1, 3, 4])
    assert_series_equal(actual, expected)
def test_transform_doesnt_clobber_ints():
    # GH 7972: transform with int group keys must match the float-key
    # equivalent (the int keys must not corrupt the result)
    n = 6
    x = np.arange(n)
    df = DataFrame({'a': x // 2, 'b': 2.0 * x, 'c': 3.0 * x})
    df2 = DataFrame({'a': x // 2 * 1.0, 'b': 2.0 * x, 'c': 3.0 * x})

    gb = df.groupby('a')
    result = gb.transform('mean')

    gb2 = df2.groupby('a')
    expected = gb2.transform('mean')
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize('sort_column', ['ints', 'floats', 'strings',
                                         ['ints', 'floats'],
                                         ['ints', 'strings']])
@pytest.mark.parametrize('group_column', ['int_groups', 'string_groups',
                                          ['int_groups', 'string_groups']])
def test_groupby_preserves_sort(sort_column, group_column):
    # Test to ensure that groupby always preserves sort order of original
    # object. Issue #8588 and #9651

    df = DataFrame(
        {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3],
         'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'],
         'ints': [8, 7, 4, 5, 2, 9, 1, 1],
         'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
         'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']})

    # Try sorting on different types and with different group types
    df = df.sort_values(by=sort_column)
    g = df.groupby(group_column)

    def test_sort(x):
        # each group must already be in the pre-sorted row order
        assert_frame_equal(x, x.sort_values(by=sort_column))

    g.apply(test_sort)
def test_group_shift_with_null_key():
    # This test is designed to replicate the segfault in issue #13813.
    n_rows = 1200

    # Generate a moderately large dataframe with occasional missing
    # values in column `B`, and then group by [`A`, `B`]. This should
    # force `-1` in `labels` array of `g.grouper.group_info` exactly
    # at those places, where the group-by key is partially missing.
    df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i)
                    for i in range(n_rows)], dtype=float,
                   columns=["A", "B", "Z"], index=None)
    g = df.groupby(["A", "B"])

    expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12
                           else np.nan)
                          for i in range(n_rows)], dtype=float,
                         columns=["Z"], index=None)
    result = g.shift(-1)

    assert_frame_equal(result, expected)
def test_group_shift_with_fill_value():
    # GH #24128: fill_value fills the holes shift would leave as NaN
    n_rows = 24
    df = DataFrame([(i % 12, i % 3, i)
                    for i in range(n_rows)], dtype=float,
                   columns=["A", "B", "Z"], index=None)
    g = df.groupby(["A", "B"])

    expected = DataFrame([(i + 12 if i < n_rows - 12
                           else 0)
                          for i in range(n_rows)], dtype=float,
                         columns=["Z"], index=None)
    result = g.shift(-1, fill_value=0)[["Z"]]

    assert_frame_equal(result, expected)
def test_pivot_table_values_key_error():
    # This test is designed to replicate the error in issue #14938
    df = pd.DataFrame({'eventDate':
                       pd.date_range(pd.datetime.today(),
                                     periods=20, freq='M').tolist(),
                       'thename': range(0, 20)})

    df['year'] = df.set_index('eventDate').index.year
    df['month'] = df.set_index('eventDate').index.month

    # a bad `values` label must surface as KeyError, not something obscure
    with pytest.raises(KeyError, match="'badname'"):
        df.reset_index().pivot_table(index='year', columns='month',
                                     values='badname', aggfunc='count')
def test_empty_dataframe_groupby():
    # GH8093: groupby on an empty frame returns an empty, named result
    df = DataFrame(columns=['A', 'B', 'C'])

    result = df.groupby('A').sum()
    expected = DataFrame(columns=['B', 'C'], dtype=np.float64)
    expected.index.name = 'A'

    assert_frame_equal(result, expected)
def test_tuple_warns():
    # https://github.com/pandas-dev/pandas/issues/18314
    df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2],
                       'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]})
    # a tuple that is not a column label is interpreted as a list of keys,
    # with a FutureWarning
    with tm.assert_produces_warning(FutureWarning) as w:
        df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean()

    assert "Interpreting tuple 'by' as a list" in str(w[0].message)

    # a tuple matching an actual column label does not warn
    with tm.assert_produces_warning(None):
        df.groupby(('a', 'b')).c.mean()
def test_tuple_warns_unhashable():
    # https://github.com/pandas-dev/pandas/issues/18314
    business_dates = date_range(start='4/1/2014', end='6/30/2014',
                                freq='B')
    df = DataFrame(1, index=business_dates, columns=['a', 'b'])

    # a tuple of array-likes is likewise interpreted as a list, warning
    with tm.assert_produces_warning(FutureWarning) as w:
        df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])

    assert "Interpreting tuple 'by' as a list" in str(w[0].message)
def test_tuple_correct_keyerror():
    # https://github.com/pandas-dev/pandas/issues/18798
    # a missing tuple key on MultiIndex columns raises KeyError on the
    # full tuple, not a confusing partial failure
    df = pd.DataFrame(1, index=range(3),
                      columns=pd.MultiIndex.from_product([[1, 2],
                                                          [3, 4]]))
    with pytest.raises(KeyError, match=r"^\(7, 8\)$"):
        df.groupby((7, 8)).mean()
def test_groupby_agg_ohlc_non_first():
    # GH 21716: 'ohlc' must work even when it is not the first agg listed
    df = pd.DataFrame([[1], [1]], columns=['foo'],
                      index=pd.date_range('2018-01-01', periods=2,
                                          freq='D'))

    expected = pd.DataFrame([
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]
    ], columns=pd.MultiIndex.from_tuples((
        ('foo', 'ohlc', 'open'), ('foo', 'ohlc', 'high'),
        ('foo', 'ohlc', 'low'), ('foo', 'ohlc', 'close'),
        ('foo', 'sum', 'foo'))), index=pd.date_range(
            '2018-01-01', periods=2, freq='D'))

    result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc'])

    tm.assert_frame_equal(result, expected)
|