test_other.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514
  1. # -*- coding: utf-8 -*-
  2. """
  3. test all other .agg behavior
  4. """
  5. from __future__ import print_function
  6. from collections import OrderedDict
  7. import datetime as dt
  8. from functools import partial
  9. import numpy as np
  10. import pytest
  11. import pandas as pd
  12. from pandas import (
  13. DataFrame, Index, MultiIndex, PeriodIndex, Series, date_range,
  14. period_range)
  15. from pandas.core.groupby.groupby import SpecificationError
  16. import pandas.util.testing as tm
  17. from pandas.io.formats.printing import pprint_thing
  18. def test_agg_api():
  19. # GH 6337
  20. # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
  21. # different api for agg when passed custom function with mixed frame
  22. df = DataFrame({'data1': np.random.randn(5),
  23. 'data2': np.random.randn(5),
  24. 'key1': ['a', 'a', 'b', 'b', 'a'],
  25. 'key2': ['one', 'two', 'one', 'two', 'one']})
  26. grouped = df.groupby('key1')
  27. def peak_to_peak(arr):
  28. return arr.max() - arr.min()
  29. expected = grouped.agg([peak_to_peak])
  30. expected.columns = ['data1', 'data2']
  31. result = grouped.agg(peak_to_peak)
  32. tm.assert_frame_equal(result, expected)
  33. def test_agg_datetimes_mixed():
  34. data = [[1, '2012-01-01', 1.0],
  35. [2, '2012-01-02', 2.0],
  36. [3, None, 3.0]]
  37. df1 = DataFrame({'key': [x[0] for x in data],
  38. 'date': [x[1] for x in data],
  39. 'value': [x[2] for x in data]})
  40. data = [[row[0],
  41. (dt.datetime.strptime(row[1], '%Y-%m-%d').date()
  42. if row[1] else None),
  43. row[2]]
  44. for row in data]
  45. df2 = DataFrame({'key': [x[0] for x in data],
  46. 'date': [x[1] for x in data],
  47. 'value': [x[2] for x in data]})
  48. df1['weights'] = df1['value'] / df1['value'].sum()
  49. gb1 = df1.groupby('date').aggregate(np.sum)
  50. df2['weights'] = df1['value'] / df1['value'].sum()
  51. gb2 = df2.groupby('date').aggregate(np.sum)
  52. assert (len(gb1) == len(gb2))
  53. def test_agg_period_index():
  54. prng = period_range('2012-1-1', freq='M', periods=3)
  55. df = DataFrame(np.random.randn(3, 2), index=prng)
  56. rs = df.groupby(level=0).sum()
  57. assert isinstance(rs.index, PeriodIndex)
  58. # GH 3579
  59. index = period_range(start='1999-01', periods=5, freq='M')
  60. s1 = Series(np.random.rand(len(index)), index=index)
  61. s2 = Series(np.random.rand(len(index)), index=index)
  62. series = [('s1', s1), ('s2', s2)]
  63. df = DataFrame.from_dict(OrderedDict(series))
  64. grouped = df.groupby(df.index.month)
  65. list(grouped)
  66. def test_agg_dict_parameter_cast_result_dtypes():
  67. # GH 12821
  68. df = DataFrame({'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'],
  69. 'time': date_range('1/1/2011', periods=8, freq='H')})
  70. df.loc[[0, 1, 2, 5], 'time'] = None
  71. # test for `first` function
  72. exp = df.loc[[0, 3, 4, 6]].set_index('class')
  73. grouped = df.groupby('class')
  74. tm.assert_frame_equal(grouped.first(), exp)
  75. tm.assert_frame_equal(grouped.agg('first'), exp)
  76. tm.assert_frame_equal(grouped.agg({'time': 'first'}), exp)
  77. tm.assert_series_equal(grouped.time.first(), exp['time'])
  78. tm.assert_series_equal(grouped.time.agg('first'), exp['time'])
  79. # test for `last` function
  80. exp = df.loc[[0, 3, 4, 7]].set_index('class')
  81. grouped = df.groupby('class')
  82. tm.assert_frame_equal(grouped.last(), exp)
  83. tm.assert_frame_equal(grouped.agg('last'), exp)
  84. tm.assert_frame_equal(grouped.agg({'time': 'last'}), exp)
  85. tm.assert_series_equal(grouped.time.last(), exp['time'])
  86. tm.assert_series_equal(grouped.time.agg('last'), exp['time'])
  87. # count
  88. exp = pd.Series([2, 2, 2, 2],
  89. index=Index(list('ABCD'), name='class'),
  90. name='time')
  91. tm.assert_series_equal(grouped.time.agg(len), exp)
  92. tm.assert_series_equal(grouped.time.size(), exp)
  93. exp = pd.Series([0, 1, 1, 2],
  94. index=Index(list('ABCD'), name='class'),
  95. name='time')
  96. tm.assert_series_equal(grouped.time.count(), exp)
  97. def test_agg_cast_results_dtypes():
  98. # similar to GH12821
  99. # xref #11444
  100. u = [dt.datetime(2015, x + 1, 1) for x in range(12)]
  101. v = list('aaabbbbbbccd')
  102. df = pd.DataFrame({'X': v, 'Y': u})
  103. result = df.groupby('X')['Y'].agg(len)
  104. expected = df.groupby('X')['Y'].count()
  105. tm.assert_series_equal(result, expected)
  106. def test_aggregate_float64_no_int64():
  107. # see gh-11199
  108. df = DataFrame({"a": [1, 2, 3, 4, 5],
  109. "b": [1, 2, 2, 4, 5],
  110. "c": [1, 2, 3, 4, 5]})
  111. expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
  112. expected.index.name = "b"
  113. result = df.groupby("b")[["a"]].mean()
  114. tm.assert_frame_equal(result, expected)
  115. expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]},
  116. index=[1, 2, 4, 5])
  117. expected.index.name = "b"
  118. result = df.groupby("b")[["a", "c"]].mean()
  119. tm.assert_frame_equal(result, expected)
  120. def test_aggregate_api_consistency():
  121. # GH 9052
  122. # make sure that the aggregates via dict
  123. # are consistent
  124. df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
  125. 'foo', 'bar', 'foo', 'foo'],
  126. 'B': ['one', 'one', 'two', 'two',
  127. 'two', 'two', 'one', 'two'],
  128. 'C': np.random.randn(8) + 1.0,
  129. 'D': np.arange(8)})
  130. grouped = df.groupby(['A', 'B'])
  131. c_mean = grouped['C'].mean()
  132. c_sum = grouped['C'].sum()
  133. d_mean = grouped['D'].mean()
  134. d_sum = grouped['D'].sum()
  135. result = grouped['D'].agg(['sum', 'mean'])
  136. expected = pd.concat([d_sum, d_mean], axis=1)
  137. expected.columns = ['sum', 'mean']
  138. tm.assert_frame_equal(result, expected, check_like=True)
  139. result = grouped.agg([np.sum, np.mean])
  140. expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
  141. expected.columns = MultiIndex.from_product([['C', 'D'],
  142. ['sum', 'mean']])
  143. tm.assert_frame_equal(result, expected, check_like=True)
  144. result = grouped[['D', 'C']].agg([np.sum, np.mean])
  145. expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
  146. expected.columns = MultiIndex.from_product([['D', 'C'],
  147. ['sum', 'mean']])
  148. tm.assert_frame_equal(result, expected, check_like=True)
  149. result = grouped.agg({'C': 'mean', 'D': 'sum'})
  150. expected = pd.concat([d_sum, c_mean], axis=1)
  151. tm.assert_frame_equal(result, expected, check_like=True)
  152. result = grouped.agg({'C': ['mean', 'sum'],
  153. 'D': ['mean', 'sum']})
  154. expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
  155. expected.columns = MultiIndex.from_product([['C', 'D'],
  156. ['mean', 'sum']])
  157. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  158. result = grouped[['D', 'C']].agg({'r': np.sum,
  159. 'r2': np.mean})
  160. expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1)
  161. expected.columns = MultiIndex.from_product([['r', 'r2'],
  162. ['D', 'C']])
  163. tm.assert_frame_equal(result, expected, check_like=True)
  164. def test_agg_dict_renaming_deprecation():
  165. # 15931
  166. df = pd.DataFrame({'A': [1, 1, 1, 2, 2],
  167. 'B': range(5),
  168. 'C': range(5)})
  169. with tm.assert_produces_warning(FutureWarning,
  170. check_stacklevel=False) as w:
  171. df.groupby('A').agg({'B': {'foo': ['sum', 'max']},
  172. 'C': {'bar': ['count', 'min']}})
  173. assert "using a dict with renaming" in str(w[0].message)
  174. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  175. df.groupby('A')[['B', 'C']].agg({'ma': 'max'})
  176. with tm.assert_produces_warning(FutureWarning) as w:
  177. df.groupby('A').B.agg({'foo': 'count'})
  178. assert "using a dict on a Series for aggregation" in str(w[0].message)
  179. def test_agg_compat():
  180. # GH 12334
  181. df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
  182. 'foo', 'bar', 'foo', 'foo'],
  183. 'B': ['one', 'one', 'two', 'two',
  184. 'two', 'two', 'one', 'two'],
  185. 'C': np.random.randn(8) + 1.0,
  186. 'D': np.arange(8)})
  187. g = df.groupby(['A', 'B'])
  188. expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
  189. expected.columns = MultiIndex.from_tuples([('C', 'sum'),
  190. ('C', 'std')])
  191. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  192. result = g['D'].agg({'C': ['sum', 'std']})
  193. tm.assert_frame_equal(result, expected, check_like=True)
  194. expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
  195. expected.columns = ['C', 'D']
  196. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  197. result = g['D'].agg({'C': 'sum', 'D': 'std'})
  198. tm.assert_frame_equal(result, expected, check_like=True)
  199. def test_agg_nested_dicts():
  200. # API change for disallowing these types of nested dicts
  201. df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
  202. 'foo', 'bar', 'foo', 'foo'],
  203. 'B': ['one', 'one', 'two', 'two',
  204. 'two', 'two', 'one', 'two'],
  205. 'C': np.random.randn(8) + 1.0,
  206. 'D': np.arange(8)})
  207. g = df.groupby(['A', 'B'])
  208. msg = r'cannot perform renaming for r[1-2] with a nested dictionary'
  209. with pytest.raises(SpecificationError, match=msg):
  210. g.aggregate({'r1': {'C': ['mean', 'sum']},
  211. 'r2': {'D': ['mean', 'sum']}})
  212. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  213. result = g.agg({'C': {'ra': ['mean', 'std']},
  214. 'D': {'rb': ['mean', 'std']}})
  215. expected = pd.concat([g['C'].mean(), g['C'].std(),
  216. g['D'].mean(), g['D'].std()],
  217. axis=1)
  218. expected.columns = pd.MultiIndex.from_tuples(
  219. [('ra', 'mean'), ('ra', 'std'),
  220. ('rb', 'mean'), ('rb', 'std')])
  221. tm.assert_frame_equal(result, expected, check_like=True)
  222. # same name as the original column
  223. # GH9052
  224. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  225. expected = g['D'].agg({'result1': np.sum, 'result2': np.mean})
  226. expected = expected.rename(columns={'result1': 'D'})
  227. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  228. result = g['D'].agg({'D': np.sum, 'result2': np.mean})
  229. tm.assert_frame_equal(result, expected, check_like=True)
  230. def test_agg_item_by_item_raise_typeerror():
  231. df = DataFrame(np.random.randint(10, size=(20, 10)))
  232. def raiseException(df):
  233. pprint_thing('----------------------------------------')
  234. pprint_thing(df.to_string())
  235. raise TypeError('test')
  236. with pytest.raises(TypeError, match='test'):
  237. df.groupby(0).agg(raiseException)
  238. def test_series_agg_multikey():
  239. ts = tm.makeTimeSeries()
  240. grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
  241. result = grouped.agg(np.sum)
  242. expected = grouped.sum()
  243. tm.assert_series_equal(result, expected)
  244. def test_series_agg_multi_pure_python():
  245. data = DataFrame(
  246. {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
  247. 'foo', 'foo', 'foo'],
  248. 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
  249. 'two', 'two', 'one'],
  250. 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
  251. 'dull', 'shiny', 'shiny', 'shiny'],
  252. 'D': np.random.randn(11),
  253. 'E': np.random.randn(11),
  254. 'F': np.random.randn(11)})
  255. def bad(x):
  256. assert (len(x.values.base) > 0)
  257. return 'foo'
  258. result = data.groupby(['A', 'B']).agg(bad)
  259. expected = data.groupby(['A', 'B']).agg(lambda x: 'foo')
  260. tm.assert_frame_equal(result, expected)
  261. def test_agg_consistency():
  262. # agg with ([]) and () not consistent
  263. # GH 6715
  264. def P1(a):
  265. try:
  266. return np.percentile(a.dropna(), q=1)
  267. except Exception:
  268. return np.nan
  269. df = DataFrame({'col1': [1, 2, 3, 4],
  270. 'col2': [10, 25, 26, 31],
  271. 'date': [dt.date(2013, 2, 10), dt.date(2013, 2, 10),
  272. dt.date(2013, 2, 11), dt.date(2013, 2, 11)]})
  273. g = df.groupby('date')
  274. expected = g.agg([P1])
  275. expected.columns = expected.columns.levels[0]
  276. result = g.agg(P1)
  277. tm.assert_frame_equal(result, expected)
  278. def test_agg_callables():
  279. # GH 7929
  280. df = DataFrame({'foo': [1, 2], 'bar': [3, 4]}).astype(np.int64)
  281. class fn_class(object):
  282. def __call__(self, x):
  283. return sum(x)
  284. equiv_callables = [sum,
  285. np.sum,
  286. lambda x: sum(x),
  287. lambda x: x.sum(),
  288. partial(sum),
  289. fn_class(), ]
  290. expected = df.groupby("foo").agg(sum)
  291. for ecall in equiv_callables:
  292. result = df.groupby('foo').agg(ecall)
  293. tm.assert_frame_equal(result, expected)
  294. def test_agg_over_numpy_arrays():
  295. # GH 3788
  296. df = pd.DataFrame([[1, np.array([10, 20, 30])],
  297. [1, np.array([40, 50, 60])],
  298. [2, np.array([20, 30, 40])]],
  299. columns=['category', 'arraydata'])
  300. result = df.groupby('category').agg(sum)
  301. expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
  302. expected_index = pd.Index([1, 2], name='category')
  303. expected_column = ['arraydata']
  304. expected = pd.DataFrame(expected_data,
  305. index=expected_index,
  306. columns=expected_column)
  307. tm.assert_frame_equal(result, expected)
  308. def test_agg_timezone_round_trip():
  309. # GH 15426
  310. ts = pd.Timestamp("2016-01-01 12:00:00", tz='US/Pacific')
  311. df = pd.DataFrame({'a': 1,
  312. 'b': [ts + dt.timedelta(minutes=nn)
  313. for nn in range(10)]})
  314. result1 = df.groupby('a')['b'].agg(np.min).iloc[0]
  315. result2 = df.groupby('a')['b'].agg(lambda x: np.min(x)).iloc[0]
  316. result3 = df.groupby('a')['b'].min().iloc[0]
  317. assert result1 == ts
  318. assert result2 == ts
  319. assert result3 == ts
  320. dates = [pd.Timestamp("2016-01-0%d 12:00:00" % i, tz='US/Pacific')
  321. for i in range(1, 5)]
  322. df = pd.DataFrame({'A': ['a', 'b'] * 2, 'B': dates})
  323. grouped = df.groupby('A')
  324. ts = df['B'].iloc[0]
  325. assert ts == grouped.nth(0)['B'].iloc[0]
  326. assert ts == grouped.head(1)['B'].iloc[0]
  327. assert ts == grouped.first()['B'].iloc[0]
  328. assert ts == grouped.apply(lambda x: x.iloc[0])[0]
  329. ts = df['B'].iloc[2]
  330. assert ts == grouped.last()['B'].iloc[0]
  331. assert ts == grouped.apply(lambda x: x.iloc[-1])[0]
  332. def test_sum_uint64_overflow():
  333. # see gh-14758
  334. # Convert to uint64 and don't overflow
  335. df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
  336. df = df + 9223372036854775807
  337. index = pd.Index([9223372036854775808,
  338. 9223372036854775810,
  339. 9223372036854775812],
  340. dtype=np.uint64)
  341. expected = pd.DataFrame({1: [9223372036854775809,
  342. 9223372036854775811,
  343. 9223372036854775813]},
  344. index=index)
  345. expected.index.name = 0
  346. result = df.groupby(0).sum()
  347. tm.assert_frame_equal(result, expected)
  348. @pytest.mark.parametrize("structure, expected", [
  349. (tuple, pd.DataFrame({'C': {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
  350. (list, pd.DataFrame({'C': {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
  351. (lambda x: tuple(x), pd.DataFrame({'C': {(1, 1): (1, 1, 1),
  352. (3, 4): (3, 4, 4)}})),
  353. (lambda x: list(x), pd.DataFrame({'C': {(1, 1): [1, 1, 1],
  354. (3, 4): [3, 4, 4]}}))
  355. ])
  356. def test_agg_structs_dataframe(structure, expected):
  357. df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3],
  358. 'B': [1, 1, 1, 4, 4, 4],
  359. 'C': [1, 1, 1, 3, 4, 4]})
  360. result = df.groupby(['A', 'B']).aggregate(structure)
  361. expected.index.names = ['A', 'B']
  362. tm.assert_frame_equal(result, expected)
  363. @pytest.mark.parametrize("structure, expected", [
  364. (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name='C')),
  365. (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name='C')),
  366. (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)],
  367. index=[1, 3], name='C')),
  368. (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]],
  369. index=[1, 3], name='C'))
  370. ])
  371. def test_agg_structs_series(structure, expected):
  372. # Issue #18079
  373. df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3],
  374. 'B': [1, 1, 1, 4, 4, 4],
  375. 'C': [1, 1, 1, 3, 4, 4]})
  376. result = df.groupby('A')['C'].aggregate(structure)
  377. expected.index.name = 'A'
  378. tm.assert_series_equal(result, expected)
  379. def test_agg_category_nansum(observed):
  380. categories = ['a', 'b', 'c']
  381. df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
  382. categories=categories),
  383. 'B': [1, 2, 3]})
  384. result = df.groupby("A", observed=observed).B.agg(np.nansum)
  385. expected = pd.Series([3, 3, 0],
  386. index=pd.CategoricalIndex(['a', 'b', 'c'],
  387. categories=categories,
  388. name='A'),
  389. name='B')
  390. if observed:
  391. expected = expected[expected != 0]
  392. tm.assert_series_equal(result, expected)
  393. def test_agg_list_like_func():
  394. # GH 18473
  395. df = pd.DataFrame({'A': [str(x) for x in range(3)],
  396. 'B': [str(x) for x in range(3)]})
  397. grouped = df.groupby('A', as_index=False, sort=False)
  398. result = grouped.agg({'B': lambda x: list(x)})
  399. expected = pd.DataFrame({'A': [str(x) for x in range(3)],
  400. 'B': [[str(x)] for x in range(3)]})
  401. tm.assert_frame_equal(result, expected)