test_aggregate.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289
  1. # -*- coding: utf-8 -*-
  2. """
  3. test .agg behavior / note that .apply is tested generally in test_groupby.py
  4. """
  5. import numpy as np
  6. import pytest
  7. from pandas.compat import OrderedDict
  8. import pandas as pd
  9. from pandas import DataFrame, Index, MultiIndex, Series, concat
  10. from pandas.core.base import SpecificationError
  11. from pandas.core.groupby.grouper import Grouping
  12. import pandas.util.testing as tm
  13. def test_agg_regression1(tsframe):
  14. grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
  15. result = grouped.agg(np.mean)
  16. expected = grouped.mean()
  17. tm.assert_frame_equal(result, expected)
  18. def test_agg_must_agg(df):
  19. grouped = df.groupby('A')['C']
  20. msg = "Must produce aggregated value"
  21. with pytest.raises(Exception, match=msg):
  22. grouped.agg(lambda x: x.describe())
  23. with pytest.raises(Exception, match=msg):
  24. grouped.agg(lambda x: x.index[:2])
  25. def test_agg_ser_multi_key(df):
  26. # TODO(wesm): unused
  27. ser = df.C # noqa
  28. f = lambda x: x.sum()
  29. results = df.C.groupby([df.A, df.B]).aggregate(f)
  30. expected = df.groupby(['A', 'B']).sum()['C']
  31. tm.assert_series_equal(results, expected)
  32. def test_groupby_aggregation_mixed_dtype():
  33. # GH 6212
  34. expected = DataFrame({
  35. 'v1': [5, 5, 7, np.nan, 3, 3, 4, 1],
  36. 'v2': [55, 55, 77, np.nan, 33, 33, 44, 11]},
  37. index=MultiIndex.from_tuples([(1, 95), (1, 99), (2, 95), (2, 99),
  38. ('big', 'damp'),
  39. ('blue', 'dry'),
  40. ('red', 'red'), ('red', 'wet')],
  41. names=['by1', 'by2']))
  42. df = DataFrame({
  43. 'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
  44. 'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
  45. 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan,
  46. 12],
  47. 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99,
  48. np.nan, np.nan]
  49. })
  50. g = df.groupby(['by1', 'by2'])
  51. result = g[['v1', 'v2']].mean()
  52. tm.assert_frame_equal(result, expected)
  53. def test_agg_apply_corner(ts, tsframe):
  54. # nothing to group, all NA
  55. grouped = ts.groupby(ts * np.nan)
  56. assert ts.dtype == np.float64
  57. # groupby float64 values results in Float64Index
  58. exp = Series([], dtype=np.float64,
  59. index=pd.Index([], dtype=np.float64))
  60. tm.assert_series_equal(grouped.sum(), exp)
  61. tm.assert_series_equal(grouped.agg(np.sum), exp)
  62. tm.assert_series_equal(grouped.apply(np.sum), exp,
  63. check_index_type=False)
  64. # DataFrame
  65. grouped = tsframe.groupby(tsframe['A'] * np.nan)
  66. exp_df = DataFrame(columns=tsframe.columns, dtype=float,
  67. index=pd.Index([], dtype=np.float64))
  68. tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False)
  69. tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
  70. tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0],
  71. check_names=False)
  72. def test_agg_grouping_is_list_tuple(ts):
  73. df = tm.makeTimeDataFrame()
  74. grouped = df.groupby(lambda x: x.year)
  75. grouper = grouped.grouper.groupings[0].grouper
  76. grouped.grouper.groupings[0] = Grouping(ts.index, list(grouper))
  77. result = grouped.agg(np.mean)
  78. expected = grouped.mean()
  79. tm.assert_frame_equal(result, expected)
  80. grouped.grouper.groupings[0] = Grouping(ts.index, tuple(grouper))
  81. result = grouped.agg(np.mean)
  82. expected = grouped.mean()
  83. tm.assert_frame_equal(result, expected)
  84. def test_agg_python_multiindex(mframe):
  85. grouped = mframe.groupby(['A', 'B'])
  86. result = grouped.agg(np.mean)
  87. expected = grouped.mean()
  88. tm.assert_frame_equal(result, expected)
  89. @pytest.mark.parametrize('groupbyfunc', [
  90. lambda x: x.weekday(),
  91. [lambda x: x.month, lambda x: x.weekday()],
  92. ])
  93. def test_aggregate_str_func(tsframe, groupbyfunc):
  94. grouped = tsframe.groupby(groupbyfunc)
  95. # single series
  96. result = grouped['A'].agg('std')
  97. expected = grouped['A'].std()
  98. tm.assert_series_equal(result, expected)
  99. # group frame by function name
  100. result = grouped.aggregate('var')
  101. expected = grouped.var()
  102. tm.assert_frame_equal(result, expected)
  103. # group frame by function dict
  104. result = grouped.agg(OrderedDict([['A', 'var'],
  105. ['B', 'std'],
  106. ['C', 'mean'],
  107. ['D', 'sem']]))
  108. expected = DataFrame(OrderedDict([['A', grouped['A'].var()],
  109. ['B', grouped['B'].std()],
  110. ['C', grouped['C'].mean()],
  111. ['D', grouped['D'].sem()]]))
  112. tm.assert_frame_equal(result, expected)
  113. def test_aggregate_item_by_item(df):
  114. grouped = df.groupby('A')
  115. aggfun = lambda ser: ser.size
  116. result = grouped.agg(aggfun)
  117. foo = (df.A == 'foo').sum()
  118. bar = (df.A == 'bar').sum()
  119. K = len(result.columns)
  120. # GH5782
  121. # odd comparisons can result here, so cast to make easy
  122. exp = pd.Series(np.array([foo] * K), index=list('BCD'),
  123. dtype=np.float64, name='foo')
  124. tm.assert_series_equal(result.xs('foo'), exp)
  125. exp = pd.Series(np.array([bar] * K), index=list('BCD'),
  126. dtype=np.float64, name='bar')
  127. tm.assert_almost_equal(result.xs('bar'), exp)
  128. def aggfun(ser):
  129. return ser.size
  130. result = DataFrame().groupby(df.A).agg(aggfun)
  131. assert isinstance(result, DataFrame)
  132. assert len(result) == 0
  133. def test_wrap_agg_out(three_group):
  134. grouped = three_group.groupby(['A', 'B'])
  135. def func(ser):
  136. if ser.dtype == np.object:
  137. raise TypeError
  138. else:
  139. return ser.sum()
  140. result = grouped.aggregate(func)
  141. exp_grouped = three_group.loc[:, three_group.columns != 'C']
  142. expected = exp_grouped.groupby(['A', 'B']).aggregate(func)
  143. tm.assert_frame_equal(result, expected)
  144. def test_agg_multiple_functions_maintain_order(df):
  145. # GH #610
  146. funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)]
  147. result = df.groupby('A')['C'].agg(funcs)
  148. exp_cols = Index(['mean', 'max', 'min'])
  149. tm.assert_index_equal(result.columns, exp_cols)
  150. def test_multiple_functions_tuples_and_non_tuples(df):
  151. # #1359
  152. funcs = [('foo', 'mean'), 'std']
  153. ex_funcs = [('foo', 'mean'), ('std', 'std')]
  154. result = df.groupby('A')['C'].agg(funcs)
  155. expected = df.groupby('A')['C'].agg(ex_funcs)
  156. tm.assert_frame_equal(result, expected)
  157. result = df.groupby('A').agg(funcs)
  158. expected = df.groupby('A').agg(ex_funcs)
  159. tm.assert_frame_equal(result, expected)
  160. def test_agg_multiple_functions_too_many_lambdas(df):
  161. grouped = df.groupby('A')
  162. funcs = ['mean', lambda x: x.mean(), lambda x: x.std()]
  163. msg = 'Function names must be unique, found multiple named <lambda>'
  164. with pytest.raises(SpecificationError, match=msg):
  165. grouped.agg(funcs)
  166. def test_more_flexible_frame_multi_function(df):
  167. grouped = df.groupby('A')
  168. exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]]))
  169. exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]]))
  170. expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1)
  171. expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)
  172. d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]])
  173. result = grouped.aggregate(d)
  174. tm.assert_frame_equal(result, expected)
  175. # be careful
  176. result = grouped.aggregate(OrderedDict([['C', np.mean],
  177. ['D', [np.mean, np.std]]]))
  178. expected = grouped.aggregate(OrderedDict([['C', np.mean],
  179. ['D', [np.mean, np.std]]]))
  180. tm.assert_frame_equal(result, expected)
  181. def foo(x):
  182. return np.mean(x)
  183. def bar(x):
  184. return np.std(x, ddof=1)
  185. # this uses column selection & renaming
  186. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  187. d = OrderedDict([['C', np.mean],
  188. ['D', OrderedDict([['foo', np.mean],
  189. ['bar', np.std]])]])
  190. result = grouped.aggregate(d)
  191. d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]])
  192. expected = grouped.aggregate(d)
  193. tm.assert_frame_equal(result, expected)
  194. def test_multi_function_flexible_mix(df):
  195. # GH #1268
  196. grouped = df.groupby('A')
  197. # Expected
  198. d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
  199. ['D', {'sum': 'sum'}]])
  200. # this uses column selection & renaming
  201. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  202. expected = grouped.aggregate(d)
  203. # Test 1
  204. d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
  205. ['D', 'sum']])
  206. # this uses column selection & renaming
  207. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  208. result = grouped.aggregate(d)
  209. tm.assert_frame_equal(result, expected)
  210. # Test 2
  211. d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
  212. ['D', ['sum']]])
  213. # this uses column selection & renaming
  214. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  215. result = grouped.aggregate(d)
  216. tm.assert_frame_equal(result, expected)