12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143 |
- from string import ascii_lowercase
- import numpy as np
- import pytest
- from pandas.compat import product as cart_product
- from pandas.errors import UnsupportedFunctionCall
- import pandas as pd
- from pandas import (
- DataFrame, Index, MultiIndex, Series, Timestamp, compat, date_range, isna)
- import pandas.core.nanops as nanops
- from pandas.util import testing as tm
@pytest.mark.parametrize("agg_func", ['any', 'all'])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("vals", [
    ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''],
    [1, 2, 3], [1, 0, 0], [0, 0, 0],
    [1., 2., 3.], [1., 0., 0.], [0., 0., 0.],
    [True, True, True], [True, False, False], [False, False, False],
    [np.nan, np.nan, np.nan]
])
def test_groupby_bool_aggs(agg_func, skipna, vals):
    """Compare groupby ``any``/``all`` with the Python builtin of that name.

    ``vals`` is repeated for the two groups 'a' and 'b', so both groups
    are expected to reduce to the same value.
    """
    frame = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2})

    if skipna and all(isna(vals)) and agg_func == 'any':
        # edge case for missing data with skipna and 'any'
        reference = False
    else:
        # figure out the expectation using the Python builtin
        reference = getattr(compat.builtins, agg_func)(vals)

    group_index = Index(['a', 'b'], name='key')
    expected = DataFrame([reference] * 2, columns=['val'],
                         index=group_index)

    reducer = getattr(frame.groupby('key'), agg_func)
    tm.assert_frame_equal(reducer(skipna=skipna), expected)
def test_max_min_non_numeric():
    """``max``/``min`` keep the string column 'ss' in the result (#2700).

    Each reduction is checked with its default arguments and with
    ``numeric_only=False``.
    """
    frame = DataFrame({'nn': [11, 11, 22, 22],
                       'ii': [1, 2, 3, 4],
                       'ss': 4 * ['mama']})

    for reducer_name in ('max', 'min'):
        for kwargs in ({}, {'numeric_only': False}):
            reduced = getattr(frame.groupby('nn'), reducer_name)(**kwargs)
            assert 'ss' in reduced
def test_intercept_builtin_sum():
    """Builtin ``sum`` given to ``agg``/``apply`` must equal ``.sum()``."""
    values = Series([1., 2., np.nan, 3.])
    grouped = values.groupby([0, 1, 2, 2])
    builtin_sum = compat.builtins.sum

    via_agg = grouped.agg(builtin_sum)
    via_apply = grouped.apply(builtin_sum)
    expected = grouped.sum()

    for outcome in (via_agg, via_apply):
        tm.assert_series_equal(outcome, expected)
@pytest.mark.parametrize("f", [max, min, sum])
@pytest.mark.parametrize('keys', [
    "jim",  # Single key
    ["jim", "joe"]  # Multi-key
])
def test_builtins_apply(keys, f):
    """Apply the builtins ``max``/``min``/``sum`` per group (see gh-8155).

    The result of ``groupby(keys).apply(f)`` is compared with the numpy
    function of the same name, with the named aggregation (except for
    ``sum``), and finally reduced again and compared with reducing the
    whole frame.
    """
    df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)),
                      columns=["jim", "joe"])
    df["jolie"] = np.random.randn(1000)

    fname = f.__name__
    result = df.groupby(keys).apply(f)
    ngroups = len(df.drop_duplicates(subset=keys))

    assert_msg = ("invalid frame shape: {} "
                  "(expected ({}, 3))".format(result.shape, ngroups))
    assert result.shape == (ngroups, 3), assert_msg

    tm.assert_frame_equal(result,  # numpy's equivalent function
                          df.groupby(keys).apply(getattr(np, fname)))

    if f != sum:
        # the key columns are kept both as index and as columns so the
        # aggregated frame has the same layout as ``result``
        expected = df.groupby(keys).agg(fname).reset_index()
        expected.set_index(keys, inplace=True, drop=False)
        tm.assert_frame_equal(result, expected, check_dtype=False)

    tm.assert_series_equal(getattr(result, fname)(),
                           getattr(df, fname)())
def test_arg_passthru():
    """Make sure kwargs are passed through to the agg functions.

    For every reduction, the columns produced by the default call and by
    ``numeric_only=False`` are compared with hard-coded expectations.
    See GH3668 and GH5724.
    """
    df = pd.DataFrame(
        {'group': [1, 1, 2],
         'int': [1, 2, 3],
         'float': [4., 5., 6.],
         'string': list('abc'),
         'category_string': pd.Series(list('abc')).astype('category'),
         'category_int': [7, 8, 9],
         'datetime': pd.date_range('20130101', periods=3),
         'datetimetz': pd.date_range('20130101',
                                     periods=3,
                                     tz='US/Eastern'),
         'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')},
        columns=['group', 'int', 'float', 'string',
                 'category_string', 'category_int',
                 'datetime', 'datetimetz',
                 'timedelta'])

    numeric_cols = Index(['int', 'float', 'category_int'])

    def check_columns(attrs, default_cols, explicit_cols):
        # one bound method per attr serves both the default call and the
        # ``numeric_only=False`` call
        for attr in attrs:
            method = getattr(df.groupby('group'), attr)
            tm.assert_index_equal(method().columns, default_cols)
            tm.assert_index_equal(
                method(numeric_only=False).columns, explicit_cols)

    # mean / median
    expected = pd.DataFrame(
        {'category_int': [7.5, 9],
         'float': [4.5, 6.],
         'timedelta': [pd.Timedelta('1.5s'),
                       pd.Timedelta('3s')],
         'int': [1.5, 3],
         'datetime': [pd.Timestamp('2013-01-01 12:00:00'),
                      pd.Timestamp('2013-01-03 00:00:00')],
         'datetimetz': [
             pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'),
             pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]},
        index=Index([1, 2], name='group'),
        columns=['int', 'float', 'category_int',
                 'datetime', 'datetimetz', 'timedelta'])
    for attr in ['mean', 'median']:
        method = getattr(df.groupby('group'), attr)
        tm.assert_index_equal(method().columns, numeric_cols)
        unrestricted = method(numeric_only=False)
        tm.assert_frame_equal(unrestricted.reindex_like(expected),
                              expected)

    # TODO: min, max *should* handle
    # categorical (ordered) dtype
    min_max_cols = Index(['int', 'float', 'string',
                          'category_int',
                          'datetime', 'datetimetz',
                          'timedelta'])
    check_columns(['min', 'max'], min_max_cols, min_max_cols)

    first_last_cols = Index(['int', 'float', 'string',
                             'category_string', 'category_int',
                             'datetime', 'datetimetz',
                             'timedelta'])
    check_columns(['first', 'last'], first_last_cols, first_last_cols)

    sum_cols = Index(['int', 'float', 'string',
                      'category_int', 'timedelta'])
    check_columns(['sum'], numeric_cols, sum_cols)

    prod_cols = Index(['int', 'float', 'category_int'])
    check_columns(['prod', 'cumprod'], numeric_cols, prod_cols)

    # like min, max, but don't include strings
    # GH 15561: numeric_only=False set by default like min/max
    cum_min_max_cols = Index(['int', 'float',
                              'category_int',
                              'datetime', 'datetimetz',
                              'timedelta'])
    check_columns(['cummin', 'cummax'], cum_min_max_cols, cum_min_max_cols)

    cumsum_cols = Index(['int', 'float', 'category_int',
                         'timedelta'])
    check_columns(['cumsum'], numeric_cols, cumsum_cols)
def test_non_cython_api():
    """Non-cython calls should not include the grouper (GH5610).

    Checks ``mad`` and ``describe`` with and without ``as_index=False``,
    then ``any`` and ``idxmax`` on the default grouping.
    """
    df = DataFrame(
        [[1, 2, 'foo'],
         [1, np.nan, 'bar'],
         [3, np.nan, 'baz']],
        columns=['A', 'B', 'C'])
    by_a = df.groupby('A')
    by_a_flat = df.groupby('A', as_index=False)

    # mad
    mad_expected = DataFrame([[0], [np.nan]], columns=['B'],
                             index=Index([1, 3], name='A'))
    tm.assert_frame_equal(by_a.mad(), mad_expected)

    mad_flat_expected = DataFrame([[0., 0.], [0, np.nan]],
                                  columns=['A', 'B'], index=[0, 1])
    tm.assert_frame_equal(by_a_flat.mad(), mad_flat_expected)

    # describe
    stat_names = ['count', 'mean', 'std', 'min',
                  '25%', '50%', '75%', 'max']
    describe_cols = pd.MultiIndex(levels=[['B'], stat_names],
                                  codes=[[0] * 8, list(range(8))])
    describe_expected = pd.DataFrame(
        [[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
         [0.0, np.nan, np.nan, np.nan, np.nan, np.nan,
          np.nan, np.nan]],
        index=pd.Index([1, 3], name='A'),
        columns=describe_cols)
    tm.assert_frame_equal(by_a.describe(), describe_expected)

    # one flattened describe row per value of A
    per_group_rows = [
        df[df.A == key].describe().unstack().to_frame().T
        for key in (1, 3)]
    describe_flat_expected = pd.concat(per_group_rows)
    describe_flat_expected.index = pd.Index([0, 1])
    tm.assert_frame_equal(by_a_flat.describe(), describe_flat_expected)

    # any
    any_expected = DataFrame([[True, True], [False, True]],
                             columns=['B', 'C'],
                             index=Index([1, 3], name='A'))
    tm.assert_frame_equal(by_a.any(), any_expected)

    # idxmax
    idxmax_expected = DataFrame([[0.0], [np.nan]], columns=['B'],
                                index=Index([1, 3], name='A'))
    tm.assert_frame_equal(by_a.idxmax(), idxmax_expected)
def test_cython_api2():
    """Check groupby ``cumsum``/``cumprod`` (GH5614, GH 5755, GH 13994)."""
    # this takes the fast apply path
    df = DataFrame(
        [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]],
        columns=['A', 'B', 'C'])
    cumsum_expected = DataFrame(
        [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C'])

    # cumsum (GH5614)
    tm.assert_frame_equal(df.groupby('A').cumsum(), cumsum_expected)

    # GH 5755 - cumsum is a transformer and should ignore as_index
    tm.assert_frame_equal(df.groupby('A', as_index=False).cumsum(),
                          cumsum_expected)

    # GH 13994
    for op_name in ('cumsum', 'cumprod'):
        grouped_result = getattr(df.groupby('A'), op_name)(axis=1)
        frame_result = getattr(df, op_name)(axis=1)
        tm.assert_frame_equal(grouped_result, frame_result)
def test_cython_median():
    """Groupby ``median`` must match ``agg`` with a median function.

    The first frame has NaN in every other row and the float labels
    contain NaN as well; the second frame has five columns.
    """
    single_col = DataFrame(np.random.randn(1000))
    single_col.values[::2] = np.nan

    labels = np.random.randint(0, 50, size=1000).astype(float)
    labels[::17] = np.nan

    tm.assert_frame_equal(single_col.groupby(labels).median(),
                          single_col.groupby(labels).agg(nanops.nanmedian))

    multi_col = DataFrame(np.random.randn(1000, 5))
    via_numpy = multi_col.groupby(labels).agg(np.median)
    via_method = multi_col.groupby(labels).median()
    tm.assert_frame_equal(via_numpy, via_method)
def test_median_empty_bins(observed):
    """``median`` on binned groups must match ``agg`` with a lambda."""
    df = pd.DataFrame(np.random.randint(0, 44, 500))
    bins = pd.cut(df[0], range(0, 55, 5))

    direct = df.groupby(bins, observed=observed).median()
    via_agg = df.groupby(bins, observed=observed).agg(
        lambda x: x.median())
    tm.assert_frame_equal(direct, via_agg)
@pytest.mark.parametrize("dtype", [
    'int8', 'int16', 'int32', 'int64', 'float32', 'float64'])
@pytest.mark.parametrize("method,data", [
    ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
    ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
    ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
    ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
    ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}],
             'args': [1]}),
    ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}],
               'out_type': 'int64'})
])
def test_groupby_non_arithmetic_agg_types(dtype, method, data):
    """Non-arithmetic aggregations keep the column dtype (GH9311, GH6620).

    ``data['df']`` holds the expected rows, ``data['args']`` (optional)
    the positional arguments for ``method`` and ``data['out_type']``
    (optional) the expected dtype of column 'b'; it defaults to ``dtype``.
    """
    df = pd.DataFrame(
        [{'a': 1, 'b': 1},
         {'a': 1, 'b': 2},
         {'a': 2, 'b': 3},
         {'a': 2, 'b': 4}])
    df['b'] = df.b.astype(dtype)

    # ``data`` is the same dict object for every ``dtype`` run of a given
    # ``method``, so read the optional keys without writing defaults back.
    args = data.get('args', [])
    out_type = data.get('out_type', dtype)

    df_out = pd.DataFrame(data['df'])
    df_out['b'] = df_out.b.astype(out_type)
    df_out.set_index('a', inplace=True)

    grpd = df.groupby('a')
    t = getattr(grpd, method)(*args)
    tm.assert_frame_equal(t, df_out)
@pytest.mark.parametrize("i", [
    (Timestamp("2011-01-15 12:50:28.502376"),
     Timestamp("2011-01-20 12:50:28.593448")),
    (24650000000000001, 24650000000000002)
])
def test_groupby_non_arithmetic_agg_int_like_precision(i):
    """Non-arithmetic aggregations return the exact input values.

    See gh-6620 and gh-9311. ``i`` is a pair of values placed in column
    'b' of a single group.
    """
    lower, upper = i[0], i[1]
    df = pd.DataFrame([{"a": 1, "b": lower}, {"a": 1, "b": upper}])

    grp_exp = {"first": {"expected": lower},
               "last": {"expected": upper},
               "min": {"expected": lower},
               "max": {"expected": upper},
               "nth": {"expected": upper,
                       "args": [1]},
               "count": {"expected": 2}}

    for method, spec in compat.iteritems(grp_exp):
        call_args = spec.get("args", [])
        outcome = getattr(df.groupby("a"), method)(*call_args)
        assert outcome.iloc[0].b == spec["expected"]
def test_fill_consistency():
    """``fillna`` must agree when grouping along axis 0 or axis 1.

    GH9221: pass thru keyword arguments to the generated wrapper
    are set if the passed kw is None (only).
    """
    row_index = pd.MultiIndex.from_product(
        [['value1', 'value2'], date_range('2014-01-01', '2014-01-06')])
    df = DataFrame(index=row_index,
                   columns=Index(['1', '2'], name='id'))
    df['1'] = [np.nan, 1, np.nan, np.nan, 11, np.nan, np.nan, 2, np.nan,
               np.nan, 22, np.nan]
    df['2'] = [np.nan, 3, np.nan, np.nan, 33, np.nan, np.nan, 4, np.nan,
               np.nan, 44, np.nan]

    expected = df.groupby(level=0, axis=0).fillna(method='ffill')

    # same fill performed on the transposed frame, then transposed back
    transposed_fill = df.T.groupby(level=0, axis=1).fillna(method='ffill')
    tm.assert_frame_equal(transposed_fill.T, expected)
def test_groupby_cumprod():
    """Groupby ``cumprod`` must match applying ``cumprod`` per group.

    GH 4095. Checked for 10 rows and for 100 rows of the value 2.
    """
    def cumprod_via_apply(frame):
        reference = frame.groupby('key')['value'].apply(
            lambda x: x.cumprod())
        reference.name = 'value'
        return reference

    short = pd.DataFrame({'key': ['b'] * 10, 'value': 2})
    actual = short.groupby('key')['value'].cumprod()
    tm.assert_series_equal(actual, cumprod_via_apply(short))

    long = pd.DataFrame({'key': ['b'] * 100, 'value': 2})
    actual = long.groupby('key')['value'].cumprod()
    # if overflows, groupby product casts to float
    # while numpy passes back invalid values
    long['value'] = long['value'].astype(float)
    tm.assert_series_equal(actual, cumprod_via_apply(long))
def test_ops_general():
    """Named groupby reductions must match ``agg`` with a target function.

    ``sem`` is only checked when scipy can be imported. A failing
    comparison is re-raised with the operation name appended to the
    exception arguments.
    """
    name_to_target = [('mean', np.mean),
                      ('median', np.median),
                      ('std', np.std),
                      ('var', np.var),
                      ('sum', np.sum),
                      ('prod', np.prod),
                      ('min', np.min),
                      ('max', np.max),
                      ('first', lambda x: x.iloc[0]),
                      ('last', lambda x: x.iloc[-1]),
                      ('count', np.size)]

    try:
        from scipy.stats import sem
    except ImportError:
        pass
    else:
        name_to_target.append(('sem', sem))

    df = DataFrame(np.random.randn(1000))
    labels = np.random.randint(0, 50, size=1000).astype(float)

    for op, targop in name_to_target:
        by_name = getattr(df.groupby(labels), op)().astype(float)
        by_target = df.groupby(labels).agg(targop)
        try:
            tm.assert_frame_equal(by_name, by_target)
        except BaseException as exc:
            exc.args += ('operation: %s' % op, )
            raise
def test_max_nan_bug():
    """``max`` on the 'File' column must not produce missing values."""
    raw = """,Date,app,File
-04-23,2013-04-23 00:00:00,,log080001.log
-05-06,2013-05-06 00:00:00,,log.log
-05-07,2013-05-07 00:00:00,OE,xlsx"""

    df = pd.read_csv(compat.StringIO(raw), parse_dates=[0])
    by_date = df.groupby('Date')

    frame_max = by_date[['File']].max()
    series_max = by_date['File'].max().to_frame()
    tm.assert_frame_equal(frame_max, series_max)
    assert not frame_max['File'].isna().any()
def test_nlargest():
    """Check ``nlargest(3)`` per group, including ``keep='last'`` on ties."""
    group_keys = Series(list('a' * 5 + 'b' * 5))

    values = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    expected_index = MultiIndex.from_arrays(
        [list('aaabbb'), [3, 2, 1, 9, 5, 8]])
    expected = Series([7, 5, 3, 10, 9, 6], index=expected_index)
    tm.assert_series_equal(values.groupby(group_keys).nlargest(3),
                           expected)

    tied_values = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    expected_index = MultiIndex.from_arrays(
        [list('aaabbb'), [2, 3, 1, 6, 5, 7]])
    expected = Series([3, 2, 1, 3, 3, 2], index=expected_index)
    top_three = tied_values.groupby(group_keys).nlargest(3, keep='last')
    tm.assert_series_equal(top_three, expected)
def test_nsmallest():
    """Check ``nsmallest(3)`` per group, including ``keep='last'`` on ties."""
    group_keys = Series(list('a' * 5 + 'b' * 5))

    values = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    expected_index = MultiIndex.from_arrays(
        [list('aaabbb'), [0, 4, 1, 6, 7, 8]])
    expected = Series([1, 2, 3, 0, 4, 6], index=expected_index)
    tm.assert_series_equal(values.groupby(group_keys).nsmallest(3),
                           expected)

    tied_values = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    expected_index = MultiIndex.from_arrays(
        [list('aaabbb'), [4, 1, 0, 9, 8, 7]])
    expected = Series([0, 1, 1, 0, 1, 2], index=expected_index)
    bottom_three = tied_values.groupby(group_keys).nsmallest(
        3, keep='last')
    tm.assert_series_equal(bottom_three, expected)
@pytest.mark.parametrize("func", [
    'mean', 'var', 'std', 'cumprod', 'cumsum'
])
def test_numpy_compat(func):
    """Extra positional or keyword arguments must raise (see gh-12811)."""
    frame = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]})
    grouped = frame.groupby('A')

    msg = "numpy operations are not valid with groupby"
    bad_calls = [((1, 2, 3), {}), ((), {'foo': 1})]

    for args, kwargs in bad_calls:
        with pytest.raises(UnsupportedFunctionCall, match=msg):
            getattr(grouped, func)(*args, **kwargs)
def test_cummin_cummax():
    """Exercise groupby ``cummin``/``cummax`` (GH 15048, 15561, 15635).

    Covers four numeric dtypes including their extreme values, NaN in
    some and in all rows of 'B', a datetime column, and two small frames
    with explicit expected results.
    """
    # GH 15048
    dtype_limits = [
        (np.int32, np.iinfo(np.int32).min, np.iinfo(np.int32).max),
        (np.int64, np.iinfo(np.int64).min, np.iinfo(np.int64).max),
        (np.float32, np.finfo(np.float32).min, np.finfo(np.float32).max),
        (np.float64, np.finfo(np.float64).min, np.finfo(np.float64).max),
    ]
    base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2],
                            'B': [3, 4, 3, 2, 2, 3, 2, 1]})
    plain_expectations = [('cummin', [3, 3, 3, 2, 2, 2, 2, 1]),
                          ('cummax', [3, 4, 4, 4, 2, 3, 3, 3])]

    def direct(frame, op):
        # cumulative op called on the groupby object itself
        return getattr(frame.groupby('A'), op)()

    def via_apply(frame, op):
        # same op applied to column 'B' of each group
        return (frame.groupby('A')
                .B
                .apply(lambda x: getattr(x, op)())
                .to_frame())

    for dtype, min_val, max_val in dtype_limits:
        # ``df`` is shared by the cummin and cummax passes, so cummax
        # sees the min values written during the cummin pass
        df = base_df.astype(dtype)
        extreme_for = {'cummin': min_val, 'cummax': max_val}

        for op, values in plain_expectations:
            expected = pd.DataFrame({'B': values}).astype(dtype)
            result = direct(df, op)
            tm.assert_frame_equal(result, expected)
            tm.assert_frame_equal(via_apply(df, op), expected)

            # Test w/ the min (cummin) or max (cummax) value for dtype
            extreme = extreme_for[op]
            df.loc[[2, 6], 'B'] = extreme
            expected.loc[[2, 3, 6, 7], 'B'] = extreme
            result = direct(df, op)
            tm.assert_frame_equal(result, expected)
            tm.assert_frame_equal(result, via_apply(df, op))

    # Test nan in some values
    base_df.loc[[0, 2, 4, 6], 'B'] = np.nan
    nan_expectations = [
        ('cummin', [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]),
        ('cummax', [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]),
    ]
    for op, values in nan_expectations:
        expected = pd.DataFrame({'B': values})
        result = direct(base_df, op)
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(result, via_apply(base_df, op))

    # Test nan in entire column
    base_df['B'] = np.nan
    expected = pd.DataFrame({'B': [np.nan] * 8})
    for op in ('cummin', 'cummax'):
        tm.assert_frame_equal(expected, direct(base_df, op))
        tm.assert_frame_equal(expected, via_apply(base_df, op))

    # GH 15561
    df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(['2001'])))
    expected = pd.Series(pd.to_datetime('2001'), index=[0], name='b')
    for method in ['cummax', 'cummin']:
        result = getattr(df.groupby('a')['b'], method)()
        tm.assert_series_equal(expected, result)

    # GH 15635
    df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1]))
    result = df.groupby('a').b.cummax()
    expected = pd.Series([2, 1, 2], name='b')
    tm.assert_series_equal(result, expected)

    df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
    result = df.groupby('a').b.cummin()
    expected = pd.Series([1, 2, 1], name='b')
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize('in_vals, out_vals', [

    # Basics: strictly increasing (T), strictly decreasing (F),
    # abs val increasing (F), non-strictly increasing (T)
    ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1],
     [True, False, False, True]),

    # Test with inf vals
    ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
     [True, False, True, False]),

    # Test with nan vals; should always be False
    ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
     [False, False, False, False]),
])
def test_is_monotonic_increasing(in_vals, out_vals):
    """Check ``is_monotonic_increasing`` per group (GH 17015).

    ``in_vals`` fills column 'C'; ``out_vals`` holds the expected flag
    for each of the groups 'a' to 'd' of column 'B'.
    """
    frame = pd.DataFrame({
        'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'],
        'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'],
        'C': in_vals})
    result = frame.groupby('B').C.is_monotonic_increasing

    group_index = Index(list('abcd'), name='B')
    from_params = pd.Series(index=group_index, data=out_vals, name='C')
    tm.assert_series_equal(result, from_params)

    # Also check result equal to manually taking x.is_monotonic_increasing.
    manual = frame.groupby(['B']).C.apply(
        lambda x: x.is_monotonic_increasing)
    tm.assert_series_equal(result, manual)
@pytest.mark.parametrize('in_vals, out_vals', [

    # Basics: strictly decreasing (T), strictly increasing (F),
    # abs val decreasing (F), non-strictly decreasing (T)
    ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1],
     [True, False, False, True]),

    # Test with inf vals
    ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
     [True, True, False, True]),

    # Test with nan vals; should always be False
    ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
     [False, False, False, False]),
])
def test_is_monotonic_decreasing(in_vals, out_vals):
    """Check ``is_monotonic_decreasing`` per group (GH 17015).

    ``in_vals`` fills column 'C'; ``out_vals`` holds the expected flag
    for each of the groups 'a' to 'd' of column 'B'.
    """
    source_dict = {
        'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'],
        'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'],
        'C': in_vals}

    df = pd.DataFrame(source_dict)
    result = df.groupby('B').C.is_monotonic_decreasing
    index = Index(list('abcd'), name='B')
    expected = pd.Series(index=index, data=out_vals, name='C')
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_decreasing,
    # mirroring the check in test_is_monotonic_increasing.
    expected = (
        df.groupby(['B']).C.apply(lambda x: x.is_monotonic_decreasing))
    tm.assert_series_equal(result, expected)
- # describe
- # --------------------------------
def test_apply_describe_bug(mframe):
    """``describe`` on a frame grouped by level 'first' must not raise."""
    by_first_level = mframe.groupby(level='first')
    # it works!
    by_first_level.describe()
def test_series_describe_multikey():
    """``describe`` columns must match the separate groupby reductions.

    A time series is grouped by (year, month); 'mean', 'std' and 'min'
    are compared, ignoring names.
    """
    ts = tm.makeTimeSeries()
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
    described = grouped.describe()

    for stat in ('mean', 'std', 'min'):
        tm.assert_series_equal(described[stat],
                               getattr(grouped, stat)(),
                               check_names=False)
def test_series_describe_single():
    """Applying ``describe`` per group must equal stacked ``describe()``."""
    ts = tm.makeTimeSeries()
    by_month = ts.groupby(lambda x: x.month)

    via_apply = by_month.apply(lambda x: x.describe())
    stacked = by_month.describe().stack()
    tm.assert_series_equal(via_apply, stacked)
def test_series_index_name(df):
    """Aggregating by the Series ``df['A']`` names the result index 'A'."""
    only_c = df.loc[:, ['C']]
    aggregated = only_c.groupby(df['A']).agg(lambda x: x.mean())
    assert aggregated.index.name == 'A'
def test_frame_describe_multikey(tsframe):
    """Frame ``describe`` must equal the per-column describes combined.

    Also checks grouping the columns (axis=1) with a dict mapping.
    """
    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()

    def described_column(col):
        # describe one column and put it under a (col, stat) MultiIndex
        group = grouped[col].describe()
        # GH 17464 - Remove duplicate MultiIndex levels
        n_stats = len(group.columns)
        group_col = pd.MultiIndex(
            levels=[[col], group.columns],
            codes=[[0] * n_stats, range(n_stats)])
        return pd.DataFrame(group.values,
                            columns=group_col,
                            index=group.index)

    expected = pd.concat([described_column(col) for col in tsframe],
                         axis=1)
    tm.assert_frame_equal(result, expected)

    column_groups = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
    result = tsframe.groupby(column_groups, axis=1).describe()
    expected = tsframe.describe().T
    expected.index = pd.MultiIndex(
        levels=[[0, 1], expected.index],
        codes=[[0, 0, 1, 1], range(len(expected.index))])
    tm.assert_frame_equal(result, expected)
def test_frame_describe_tupleindex():
    """``describe`` on a tuple-valued group key raises ``ValueError``.

    GH 14848 - regression from 0.19.0 to 0.19.1. The same frame is tried
    with the key column named 'k' and renamed to 'key'.
    """
    keyed = DataFrame({'x': [1, 2, 3, 4, 5] * 3,
                       'y': [10, 20, 30, 40, 50] * 3,
                       'z': [100, 200, 300, 400, 500] * 3})
    keyed['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    renamed = keyed.rename(columns={'k': 'key'})

    msg = "Names should be list-like for a MultiIndex"
    for frame, key_col in ((keyed, 'k'), (renamed, 'key')):
        with pytest.raises(ValueError, match=msg):
            frame.groupby(key_col).describe()
def test_frame_describe_unstacked_format():
    """Series groupby ``describe`` returns one row per group (GH 4792)."""
    stamps = [pd.Timestamp('2011-01-06 10:59:05', tz=None),
              pd.Timestamp('2011-01-06 12:43:33', tz=None),
              pd.Timestamp('2011-01-06 12:54:09', tz=None)]
    prices = dict(zip(stamps, [24990, 25499, 25499]))
    volumes = dict(zip(stamps, [1500000000, 5000000000, 100000000]))
    df = pd.DataFrame({'PRICE': prices,
                       'VOLUME': volumes})

    result = df.groupby('PRICE').VOLUME.describe()

    # describe each price level separately to build the expected rows
    rows = [df[df.PRICE == price].VOLUME.describe().values.tolist()
            for price in (24990, 25499)]
    expected = pd.DataFrame(rows,
                            index=pd.Index([24990, 25499], name='PRICE'),
                            columns=['count', 'mean', 'std', 'min',
                                     '25%', '50%', '75%', 'max'])
    tm.assert_frame_equal(result, expected)
- # nunique
- # --------------------------------
@pytest.mark.parametrize('n', 10 ** np.arange(2, 6))
@pytest.mark.parametrize('m', [10, 100, 1000])
@pytest.mark.parametrize('sort', [False, True])
@pytest.mark.parametrize('dropna', [False, True])
def test_series_groupby_nunique(n, m, sort, dropna):
    """Groupby ``nunique`` must match applying ``Series.nunique``.

    Random frames of ``n`` rows are checked before and after ``None`` is
    written into the key and value columns, for one and two keys.
    """

    def check_nunique(df, keys, as_index=True):
        direct = df.groupby(keys, as_index=as_index, sort=sort)
        left = direct['julie'].nunique(dropna=dropna)

        applied = df.groupby(keys, as_index=as_index, sort=sort)
        right = applied['julie'].apply(Series.nunique, dropna=dropna)
        if not as_index:
            right = right.reset_index(drop=True)

        tm.assert_series_equal(left, right, check_names=False)

    days = date_range('2015-08-23', periods=10)

    frame = DataFrame({'jim': np.random.choice(list(ascii_lowercase), n),
                       'joe': np.random.choice(days, n),
                       'julie': np.random.randint(0, m, n)})
    key_sets = (['jim'], ['jim', 'joe'])

    for keys in key_sets:
        check_nunique(frame, keys)

    frame.loc[1::17, 'jim'] = None
    frame.loc[3::37, 'joe'] = None
    for start in (7, 8, 9):
        frame.loc[start::19, 'julie'] = None

    for as_index in (True, False):
        for keys in key_sets:
            check_nunique(frame, keys, as_index=as_index)
def test_nunique():
    """Check frame groupby ``nunique`` with ``as_index`` and ``dropna``."""
    df = DataFrame({
        'A': list('abbacc'),
        'B': list('abxacc'),
        'C': list('abbacx'),
    })
    with_missing = {'x': None}

    counts = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
    tm.assert_frame_equal(df.groupby('A', as_index=False).nunique(),
                          counts)

    # as_index
    counts.index = list('abc')
    counts.index.name = 'A'
    tm.assert_frame_equal(df.groupby('A').nunique(), counts)

    # with na
    kept_na = df.replace(with_missing).groupby('A').nunique(dropna=False)
    tm.assert_frame_equal(kept_na, counts)

    # dropna
    all_ones = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
                         index=list('abc'))
    all_ones.index.name = 'A'
    dropped_na = df.replace(with_missing).groupby('A').nunique()
    tm.assert_frame_equal(dropped_na, all_ones)
def test_nunique_with_object():
    """``nunique`` on a string column grouped by two keys (GH 11077)."""
    records = pd.DataFrame(
        [[100, 1, 'Alice'],
         [200, 2, 'Bob'],
         [300, 3, 'Charlie'],
         [-400, 4, 'Dan'],
         [500, 5, 'Edith']],
        columns=['amount', 'id', 'name']
    )

    observed_counts = records.groupby(['id', 'amount'])['name'].nunique()

    expected_index = MultiIndex.from_arrays([records.id, records.amount])
    expected = pd.Series([1] * 5, name='name', index=expected_index)
    tm.assert_series_equal(observed_counts, expected)
def test_nunique_with_empty_series():
    """``nunique`` on an empty Series gives an empty int64 Series.

    See GH 12553.
    """
    empty = pd.Series(name='name')
    counted = empty.groupby(level=0).nunique()
    tm.assert_series_equal(counted, pd.Series(name='name', dtype='int64'))
def test_nunique_with_timegrouper():
    """``nunique`` with an hourly ``Grouper`` must match ``apply``.

    See GH 13453.
    """
    frame = pd.DataFrame({
        'time': [Timestamp('2016-06-28 09:35:35'),
                 Timestamp('2016-06-28 16:09:30'),
                 Timestamp('2016-06-28 16:46:28')],
        'data': ['1', '2', '3']})
    indexed = frame.set_index('time')

    direct = indexed.groupby(pd.Grouper(freq='h'))['data'].nunique()
    hourly_data = indexed.groupby(pd.Grouper(freq='h'))['data']
    via_apply = hourly_data.apply(pd.Series.nunique)
    tm.assert_series_equal(direct, via_apply)
- # count
- # --------------------------------
def test_groupby_timedelta_cython_count():
    """``count`` on a timedelta64 column gives two rows per group."""
    frame = DataFrame({'g': list('ab' * 2),
                       'delt': np.arange(4).astype('timedelta64[ns]')})

    group_index = pd.Index(['a', 'b'], name='g')
    expected = Series([2, 2], index=group_index, name='delt')

    counted = frame.groupby('g').delt.count()
    tm.assert_series_equal(expected, counted)
def test_count():
    """Groupby ``count`` must match ``DataFrame.count`` applied per group.

    A random frame of mixed column types gets NaN written into most
    columns first. The second half (GH5610) checks on a small frame that
    ``count`` counts non-nulls, with and without ``as_index=False``.
    """
    n = 1 << 15
    stamps = date_range('2015-08-30', periods=n // 10, freq='T')
    letters = list(ascii_lowercase)

    # the dict literal keeps the random draws in column order
    df = DataFrame({
        '1st': np.random.choice(letters, n),
        '2nd': np.random.randint(0, 5, n),
        '3rd': np.random.randn(n).round(3),
        '4th': np.random.randint(-10, 10, n),
        '5th': np.random.choice(stamps, n),
        '6th': np.random.randn(n).round(3),
        '7th': np.random.randn(n).round(3),
        '8th': np.random.choice(stamps, n) - np.random.choice(stamps, 1),
        '9th': np.random.choice(letters, n)
    })

    for col in df.columns.drop(['1st', '2nd', '4th']):
        df.loc[np.random.choice(n, n // 10), col] = np.nan

    df['9th'] = df['9th'].astype('category')

    for key in ('1st', '2nd', ['1st', '2nd']):
        direct = df.groupby(key).count()
        applied = df.groupby(key).apply(DataFrame.count)
        tm.assert_frame_equal(direct, applied.drop(key, axis=1))

    # GH5610
    # count counts non-nulls
    small = pd.DataFrame([[1, 2, 'foo'],
                          [1, np.nan, 'bar'],
                          [3, np.nan, np.nan]],
                         columns=['A', 'B', 'C'])

    count_as = small.groupby('A').count()
    count_not_as = small.groupby('A', as_index=False).count()

    expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'],
                         index=[1, 3])
    expected.index.name = 'A'
    tm.assert_frame_equal(count_not_as, expected.reset_index())
    tm.assert_frame_equal(count_as, expected)

    count_b = small.groupby('A')['B'].count()
    tm.assert_series_equal(count_b, expected['B'])
def test_count_object():
    """Counting an object column per group leaves out missing values."""
    group_index = pd.Index([2, 3], name='c')
    group_keys = [2] * 3 + [3] * 3

    # No missing values: all three rows of each group are counted.
    complete = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3,
                             'c': list(group_keys)})
    result = complete.groupby('c').a.count()
    expected = pd.Series([3, 3], index=group_index, name='a')
    tm.assert_series_equal(result, expected)

    # Two of the three values in group 2 are NaN and are not counted.
    with_gaps = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3,
                              'c': list(group_keys)})
    result = with_gaps.groupby('c').a.count()
    expected = pd.Series([1, 3], index=group_index, name='a')
    tm.assert_series_equal(result, expected)
def test_count_cross_type():
    """Group counts do not depend on the dtype of the counted columns.

    Regression test for GH8169: columns 'a' and 'b' are recast to
    float32 and to object, and the counts per ('c', 'd') group must stay
    equal to the counts computed before the recast.
    """
    # Seeded generator so that a failing run can be reproduced.
    rng = np.random.RandomState(1234567890)

    # Cast to float64 up front: NaN is written into the frame below, and
    # doing that on integer columns would need an implicit upcast.
    vals = np.hstack((rng.randint(0, 5, (100, 2)),
                      rng.randint(0, 2, (100, 2)))).astype('float64')

    df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd'])
    # Only 'a' and 'b' can hold the value 2; the keys 'c' and 'd' are
    # drawn from {0, 1} and therefore stay free of NaN.
    df[df == 2] = np.nan
    expected = df.groupby(['c', 'd']).count()

    for t in ['float32', 'object']:
        df['a'] = df['a'].astype(t)
        df['b'] = df['b'].astype(t)
        result = df.groupby(['c', 'd']).count()
        tm.assert_frame_equal(result, expected)
def test_lower_int_prec_count():
    """``count`` works on int8, uint32 and int16 columns."""
    columns = {
        'a': np.array([0, 1, 2, 100], np.int8),
        'b': np.array([1, 2, 3, 6], np.uint32),
        'c': np.array([4, 5, 6, 8], np.int16),
        'grp': list('ab' * 2),
    }
    df = DataFrame(columns)

    result = df.groupby('grp').count()

    # Every value column has two entries in each of the two groups.
    group_index = pd.Index(list('ab'), name='grp')
    expected = DataFrame({'a': [2, 2],
                          'b': [2, 2],
                          'c': [2, 2]},
                         index=group_index)
    tm.assert_frame_equal(result, expected)
def test_count_uses_size_on_exception():
    """``count`` succeeds for objects whose ``__eq__`` always raises."""

    class RaisingObjectException(Exception):
        pass

    class RaisingObject(object):
        def __init__(self, msg='I will raise inside Cython'):
            super(RaisingObject, self).__init__()
            self.msg = msg

        def __eq__(self, other):
            # gets called in Cython to check that raising calls the method
            raise RaisingObjectException(self.msg)

    raisers = [RaisingObject() for _ in range(4)]
    df = DataFrame({'a': raisers, 'grp': list('ab' * 2)})

    result = df.groupby('grp').count()

    # Two objects land in each group and all of them are counted.
    group_index = pd.Index(list('ab'), name='grp')
    expected = DataFrame({'a': [2, 2]}, index=group_index)
    tm.assert_frame_equal(result, expected)
- # size
- # --------------------------------
- def test_size(df):
- grouped = df.groupby(['A', 'B'])
- result = grouped.size()
- for key, group in grouped:
- assert result[key] == len(group)
- grouped = df.groupby('A')
- result = grouped.size()
- for key, group in grouped:
- assert result[key] == len(group)
- grouped = df.groupby('B')
- result = grouped.size()
- for key, group in grouped:
- assert result[key] == len(group)
- df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc'))
- for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])):
- left = df.groupby(key, sort=sort).size()
- right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0])
- tm.assert_series_equal(left, right, check_names=False)
- # GH11699
- df = DataFrame([], columns=['A', 'B'])
- out = Series([], dtype='int64', index=Index([], name='A'))
- tm.assert_series_equal(df.groupby('A').size(), out)
- # pipe
- # --------------------------------
def test_pipe():
    """Chain ``DataFrameGroupBy.pipe`` into ``Series.pipe``.

    Test for issue #17871.
    """
    random_state = np.random.RandomState(1234567890)

    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': random_state.randn(8),
                    'C': random_state.randn(8)})

    def f(dfgb):
        # per-group maximum of B minus the smallest per-group minimum of C
        return dfgb.B.max() - dfgb.C.min().min()

    def square(srs):
        return srs ** 2

    # Note that the transformations are
    # GroupBy -> Series
    # Series -> Series
    # This then chains the GroupBy.pipe and the
    # NDFrame.pipe methods
    result = df.groupby('A').pipe(f).pipe(square)

    # The index dtype is left to inference so that the expectation uses
    # whatever dtype pandas picks for string labels, like the result does.
    index = Index(['bar', 'foo'], name='A')
    expected = pd.Series([8.99110003361, 8.17516964785], name='B',
                         index=index)

    tm.assert_series_equal(expected, result)
def test_pipe_args():
    """Pass extra positional arguments through ``GroupBy.pipe``.

    Test for issue #17871. Covers ``DataFrameGroupBy.pipe`` chained into
    ``DataFrame.pipe`` and, at the end, ``SeriesGroupBy.pipe``.
    """
    df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'],
                       'x': [1.0, 2.0, 3.0, 2.0, 5.0],
                       'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]})

    def f(dfgb, arg1):
        # Keep the groups whose mean of y exceeds arg1, then regroup.
        # Regrouping by column name avoids the ``grouper`` attribute of
        # the GroupBy object, which is deprecated in recent pandas.
        # With dropna=False the rows of rejected groups are blanked out,
        # key included, so those groups are absent after regrouping.
        filtered = dfgb.filter(lambda grp: grp.y.mean() > arg1,
                               dropna=False)
        return filtered.groupby('group')

    def g(dfgb, arg2):
        # share of each group in the column totals, shifted by arg2
        return dfgb.sum() / dfgb.sum().sum() + arg2

    def h(df, arg3):
        return df.x + df.y - arg3

    result = (df
              .groupby('group')
              .pipe(f, 0)
              .pipe(g, 10)
              .pipe(h, 100))

    # Group 'C' (mean of y is -1000) does not pass the filter in ``f``.
    index = pd.Index(['A', 'B'], name='group')
    expected = pd.Series([-79.5160891089, -78.4839108911],
                         index=index)

    tm.assert_series_equal(expected, result)

    # test SeriesGroupby.pipe
    ser = pd.Series([1, 1, 2, 2, 3, 3])
    result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count())

    # ``pd.Int64Index`` no longer exists in pandas 2.x; an int64 ``Index``
    # is the equivalent on every version.
    expected = pd.Series([4, 8, 12], index=pd.Index([1, 2, 3],
                                                    dtype='int64'))

    tm.assert_series_equal(result, expected)
def test_groupby_mean_no_overflow():
    """The grouped mean of very large integers is computed correctly.

    Regression test for #22487.
    """
    connections = [4970, 4749, 4719, 4704, 18446744073699999744]
    df = pd.DataFrame({
        "user": ["A"] * len(connections),
        "connections": connections,
    })

    mean_by_user = df.groupby('user')['connections'].mean()

    assert mean_by_user['A'] == 3689348814740003840
|