test_function.py 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143
  1. from string import ascii_lowercase
  2. import numpy as np
  3. import pytest
  4. from pandas.compat import product as cart_product
  5. from pandas.errors import UnsupportedFunctionCall
  6. import pandas as pd
  7. from pandas import (
  8. DataFrame, Index, MultiIndex, Series, Timestamp, compat, date_range, isna)
  9. import pandas.core.nanops as nanops
  10. from pandas.util import testing as tm
@pytest.mark.parametrize("agg_func", ['any', 'all'])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("vals", [
    ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''],
    [1, 2, 3], [1, 0, 0], [0, 0, 0],
    [1., 2., 3.], [1., 0., 0.], [0., 0., 0.],
    [True, True, True], [True, False, False], [False, False, False],
    [np.nan, np.nan, np.nan]
])
def test_groupby_bool_aggs(agg_func, skipna, vals):
    """Groupby any/all must agree with the Python builtins of the same
    name, across string/int/float/bool/NaN value sets."""
    df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2})

    # Figure out expectation using Python builtin
    exp = getattr(compat.builtins, agg_func)(vals)

    # edge case for missing data with skipna and 'any'
    if skipna and all(isna(vals)) and agg_func == 'any':
        exp = False

    # both groups hold the same values, so both share the expectation
    exp_df = DataFrame([exp] * 2, columns=['val'], index=Index(
        ['a', 'b'], name='key'))
    result = getattr(df.groupby('key'), agg_func)(skipna=skipna)
    tm.assert_frame_equal(result, exp_df)
  31. def test_max_min_non_numeric():
  32. # #2700
  33. aa = DataFrame({'nn': [11, 11, 22, 22],
  34. 'ii': [1, 2, 3, 4],
  35. 'ss': 4 * ['mama']})
  36. result = aa.groupby('nn').max()
  37. assert 'ss' in result
  38. result = aa.groupby('nn').max(numeric_only=False)
  39. assert 'ss' in result
  40. result = aa.groupby('nn').min()
  41. assert 'ss' in result
  42. result = aa.groupby('nn').min(numeric_only=False)
  43. assert 'ss' in result
def test_intercept_builtin_sum():
    """The builtin ``sum`` passed to agg/apply is intercepted and routed
    to the groupby sum (so NaN handling matches ``grouped.sum()``)."""
    s = Series([1., 2., np.nan, 3.])
    grouped = s.groupby([0, 1, 2, 2])

    result = grouped.agg(compat.builtins.sum)
    result2 = grouped.apply(compat.builtins.sum)
    expected = grouped.sum()
    tm.assert_series_equal(result, expected)
    tm.assert_series_equal(result2, expected)
  52. # @pytest.mark.parametrize("f", [max, min, sum])
  53. # def test_builtins_apply(f):
  54. @pytest.mark.parametrize("f", [max, min, sum])
  55. @pytest.mark.parametrize('keys', [
  56. "jim", # Single key
  57. ["jim", "joe"] # Multi-key
  58. ])
  59. def test_builtins_apply(keys, f):
  60. # see gh-8155
  61. df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)),
  62. columns=["jim", "joe"])
  63. df["jolie"] = np.random.randn(1000)
  64. fname = f.__name__
  65. result = df.groupby(keys).apply(f)
  66. ngroups = len(df.drop_duplicates(subset=keys))
  67. assert_msg = ("invalid frame shape: {} "
  68. "(expected ({}, 3))".format(result.shape, ngroups))
  69. assert result.shape == (ngroups, 3), assert_msg
  70. tm.assert_frame_equal(result, # numpy's equivalent function
  71. df.groupby(keys).apply(getattr(np, fname)))
  72. if f != sum:
  73. expected = df.groupby(keys).agg(fname).reset_index()
  74. expected.set_index(keys, inplace=True, drop=False)
  75. tm.assert_frame_equal(result, expected, check_dtype=False)
  76. tm.assert_series_equal(getattr(result, fname)(),
  77. getattr(df, fname)())
def test_arg_passthru():
    """Keyword arguments (numeric_only in particular) are passed through
    the generated groupby aggregation wrappers, and each reduction keeps
    the expected subset of column dtypes.

    GH3668, GH5724.
    """
    # make sure that we are passing thru kwargs
    # to our agg functions
    # GH3668
    # GH5724
    df = pd.DataFrame(
        {'group': [1, 1, 2],
         'int': [1, 2, 3],
         'float': [4., 5., 6.],
         'string': list('abc'),
         'category_string': pd.Series(list('abc')).astype('category'),
         'category_int': [7, 8, 9],
         'datetime': pd.date_range('20130101', periods=3),
         'datetimetz': pd.date_range('20130101',
                                     periods=3,
                                     tz='US/Eastern'),
         'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')},
        columns=['group', 'int', 'float', 'string',
                 'category_string', 'category_int',
                 'datetime', 'datetimetz',
                 'timedelta'])

    # columns that survive a numeric_only reduction
    expected_columns_numeric = Index(['int', 'float', 'category_int'])

    # mean / median
    expected = pd.DataFrame(
        {'category_int': [7.5, 9],
         'float': [4.5, 6.],
         'timedelta': [pd.Timedelta('1.5s'),
                       pd.Timedelta('3s')],
         'int': [1.5, 3],
         'datetime': [pd.Timestamp('2013-01-01 12:00:00'),
                      pd.Timestamp('2013-01-03 00:00:00')],
         'datetimetz': [
             pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'),
             pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]},
        index=Index([1, 2], name='group'),
        columns=['int', 'float', 'category_int',
                 'datetime', 'datetimetz', 'timedelta'])

    for attr in ['mean', 'median']:
        f = getattr(df.groupby('group'), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns_numeric)

        result = f(numeric_only=False)
        tm.assert_frame_equal(result.reindex_like(expected), expected)

    # TODO: min, max *should* handle
    # categorical (ordered) dtype
    expected_columns = Index(['int', 'float', 'string',
                              'category_int',
                              'datetime', 'datetimetz',
                              'timedelta'])
    for attr in ['min', 'max']:
        f = getattr(df.groupby('group'), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)

    # first/last additionally keep category_string
    expected_columns = Index(['int', 'float', 'string',
                              'category_string', 'category_int',
                              'datetime', 'datetimetz',
                              'timedelta'])
    for attr in ['first', 'last']:
        f = getattr(df.groupby('group'), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)

    # sum works on strings (concatenation) and timedeltas
    expected_columns = Index(['int', 'float', 'string',
                              'category_int', 'timedelta'])
    for attr in ['sum']:
        f = getattr(df.groupby('group'), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns_numeric)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)

    expected_columns = Index(['int', 'float', 'category_int'])
    for attr in ['prod', 'cumprod']:
        f = getattr(df.groupby('group'), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns_numeric)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)

    # like min, max, but don't include strings
    expected_columns = Index(['int', 'float',
                              'category_int',
                              'datetime', 'datetimetz',
                              'timedelta'])
    for attr in ['cummin', 'cummax']:
        f = getattr(df.groupby('group'), attr)
        result = f()
        # GH 15561: numeric_only=False set by default like min/max
        tm.assert_index_equal(result.columns, expected_columns)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)

    expected_columns = Index(['int', 'float', 'category_int',
                              'timedelta'])
    for attr in ['cumsum']:
        f = getattr(df.groupby('group'), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns_numeric)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)
def test_non_cython_api():
    """Aggregations without a cython fast path (mad/describe/any/idxmax)
    must not include the grouping column in the result (GH5610)."""
    # GH5610
    # non-cython calls should not include the grouper
    df = DataFrame(
        [[1, 2, 'foo'],
         [1, np.nan, 'bar'],
         [3, np.nan, 'baz']],
        columns=['A', 'B', 'C'])
    g = df.groupby('A')
    gni = df.groupby('A', as_index=False)

    # mad
    expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3])
    expected.index.name = 'A'
    result = g.mad()
    tm.assert_frame_equal(result, expected)

    # with as_index=False the grouper stays a regular column
    expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'],
                         index=[0, 1])
    result = gni.mad()
    tm.assert_frame_equal(result, expected)

    # describe
    expected_index = pd.Index([1, 3], name='A')
    expected_col = pd.MultiIndex(levels=[['B'],
                                         ['count', 'mean', 'std', 'min',
                                          '25%', '50%', '75%', 'max']],
                                 codes=[[0] * 8, list(range(8))])
    expected = pd.DataFrame([[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
                             [0.0, np.nan, np.nan, np.nan, np.nan, np.nan,
                              np.nan, np.nan]],
                            index=expected_index,
                            columns=expected_col)
    result = g.describe()
    tm.assert_frame_equal(result, expected)

    # as_index=False describe matches per-group describe, re-stacked
    expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T,
                          df[df.A == 3].describe().unstack().to_frame().T])
    expected.index = pd.Index([0, 1])
    result = gni.describe()
    tm.assert_frame_equal(result, expected)

    # any
    expected = DataFrame([[True, True], [False, True]], columns=['B', 'C'],
                         index=[1, 3])
    expected.index.name = 'A'
    result = g.any()
    tm.assert_frame_equal(result, expected)

    # idxmax
    expected = DataFrame([[0.0], [np.nan]], columns=['B'], index=[1, 3])
    expected.index.name = 'A'
    result = g.idxmax()
    tm.assert_frame_equal(result, expected)
def test_cython_api2():
    """cumsum/cumprod take the fast apply path: the grouper column is
    dropped, ``as_index`` is ignored, and ``axis=1`` is honoured."""
    # this takes the fast apply path

    # cumsum (GH5614)
    df = DataFrame(
        [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]
         ], columns=['A', 'B', 'C'])
    expected = DataFrame(
        [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C'])
    result = df.groupby('A').cumsum()
    tm.assert_frame_equal(result, expected)

    # GH 5755 - cumsum is a transformer and should ignore as_index
    result = df.groupby('A', as_index=False).cumsum()
    tm.assert_frame_equal(result, expected)

    # GH 13994: axis=1 cumulates across columns, so grouping is moot
    result = df.groupby('A').cumsum(axis=1)
    expected = df.cumsum(axis=1)
    tm.assert_frame_equal(result, expected)
    result = df.groupby('A').cumprod(axis=1)
    expected = df.cumprod(axis=1)
    tm.assert_frame_equal(result, expected)
def test_cython_median():
    """The cython groupby median matches nanmedian / np.median aggregation,
    including with NaN values and NaN group labels."""
    df = DataFrame(np.random.randn(1000))
    df.values[::2] = np.nan  # every other value missing

    labels = np.random.randint(0, 50, size=1000).astype(float)
    labels[::17] = np.nan  # some missing group labels too

    result = df.groupby(labels).median()
    exp = df.groupby(labels).agg(nanops.nanmedian)
    tm.assert_frame_equal(result, exp)

    df = DataFrame(np.random.randn(1000, 5))
    rs = df.groupby(labels).agg(np.median)
    xp = df.groupby(labels).median()
    tm.assert_frame_equal(rs, xp)
def test_median_empty_bins(observed):
    """Median over categorical bins (some of which are empty) matches a
    python-level median aggregation.

    ``observed`` is a fixture toggling inclusion of empty categories.
    """
    df = pd.DataFrame(np.random.randint(0, 44, 500))
    grps = range(0, 55, 5)
    bins = pd.cut(df[0], grps)  # data stops at 44, so top bins are empty

    result = df.groupby(bins, observed=observed).median()
    expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
    tm.assert_frame_equal(result, expected)
  265. @pytest.mark.parametrize("dtype", [
  266. 'int8', 'int16', 'int32', 'int64', 'float32', 'float64'])
  267. @pytest.mark.parametrize("method,data", [
  268. ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
  269. ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
  270. ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
  271. ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
  272. ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}],
  273. 'args': [1]}),
  274. ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}],
  275. 'out_type': 'int64'})
  276. ])
  277. def test_groupby_non_arithmetic_agg_types(dtype, method, data):
  278. # GH9311, GH6620
  279. df = pd.DataFrame(
  280. [{'a': 1, 'b': 1},
  281. {'a': 1, 'b': 2},
  282. {'a': 2, 'b': 3},
  283. {'a': 2, 'b': 4}])
  284. df['b'] = df.b.astype(dtype)
  285. if 'args' not in data:
  286. data['args'] = []
  287. if 'out_type' in data:
  288. out_type = data['out_type']
  289. else:
  290. out_type = dtype
  291. exp = data['df']
  292. df_out = pd.DataFrame(exp)
  293. df_out['b'] = df_out.b.astype(out_type)
  294. df_out.set_index('a', inplace=True)
  295. grpd = df.groupby('a')
  296. t = getattr(grpd, method)(*data['args'])
  297. tm.assert_frame_equal(t, df_out)
  298. @pytest.mark.parametrize("i", [
  299. (Timestamp("2011-01-15 12:50:28.502376"),
  300. Timestamp("2011-01-20 12:50:28.593448")),
  301. (24650000000000001, 24650000000000002)
  302. ])
  303. def test_groupby_non_arithmetic_agg_int_like_precision(i):
  304. # see gh-6620, gh-9311
  305. df = pd.DataFrame([{"a": 1, "b": i[0]}, {"a": 1, "b": i[1]}])
  306. grp_exp = {"first": {"expected": i[0]},
  307. "last": {"expected": i[1]},
  308. "min": {"expected": i[0]},
  309. "max": {"expected": i[1]},
  310. "nth": {"expected": i[1],
  311. "args": [1]},
  312. "count": {"expected": 2}}
  313. for method, data in compat.iteritems(grp_exp):
  314. if "args" not in data:
  315. data["args"] = []
  316. grouped = df.groupby("a")
  317. res = getattr(grouped, method)(*data["args"])
  318. assert res.iloc[0].b == data["expected"]
def test_fill_consistency():
    """Groupby ffill along axis=0 agrees with ffill on the transpose
    along axis=1 (GH9221)."""
    # GH9221
    # pass thru keyword arguments to the generated wrapper
    # are set if the passed kw is None (only)
    df = DataFrame(index=pd.MultiIndex.from_product(
        [['value1', 'value2'], date_range('2014-01-01', '2014-01-06')]),
        columns=Index(
            ['1', '2'], name='id'))
    df['1'] = [np.nan, 1, np.nan, np.nan, 11, np.nan, np.nan, 2, np.nan,
               np.nan, 22, np.nan]
    df['2'] = [np.nan, 3, np.nan, np.nan, 33, np.nan, np.nan, 4, np.nan,
               np.nan, 44, np.nan]

    expected = df.groupby(level=0, axis=0).fillna(method='ffill')
    result = df.T.groupby(level=0, axis=1).fillna(method='ffill').T
    tm.assert_frame_equal(result, expected)
def test_groupby_cumprod():
    """Cython groupby cumprod matches the per-group Series.cumprod,
    including the overflow-to-float case (GH 4095)."""
    # GH 4095
    df = pd.DataFrame({'key': ['b'] * 10, 'value': 2})

    actual = df.groupby('key')['value'].cumprod()
    expected = df.groupby('key')['value'].apply(lambda x: x.cumprod())
    expected.name = 'value'
    tm.assert_series_equal(actual, expected)

    df = pd.DataFrame({'key': ['b'] * 100, 'value': 2})
    actual = df.groupby('key')['value'].cumprod()
    # if overflows, groupby product casts to float
    # while numpy passes back invalid values
    df['value'] = df['value'].astype(float)
    expected = df.groupby('key')['value'].apply(lambda x: x.cumprod())
    expected.name = 'value'
    tm.assert_series_equal(actual, expected)
def test_ops_general():
    """Each cython groupby reduction matches the equivalent python/numpy
    aggregation applied per group via .agg."""
    ops = [('mean', np.mean),
           ('median', np.median),
           ('std', np.std),
           ('var', np.var),
           ('sum', np.sum),
           ('prod', np.prod),
           ('min', np.min),
           ('max', np.max),
           ('first', lambda x: x.iloc[0]),
           ('last', lambda x: x.iloc[-1]),
           ('count', np.size), ]
    try:
        from scipy.stats import sem
    except ImportError:
        # scipy is an optional dependency; skip sem when absent
        pass
    else:
        ops.append(('sem', sem))
    df = DataFrame(np.random.randn(1000))
    labels = np.random.randint(0, 50, size=1000).astype(float)

    for op, targop in ops:
        result = getattr(df.groupby(labels), op)().astype(float)
        expected = df.groupby(labels).agg(targop)
        try:
            tm.assert_frame_equal(result, expected)
        except BaseException as exc:
            # tag the failing op name onto the error for easier debugging
            exc.args += ('operation: %s' % op, )
            raise
def test_max_nan_bug():
    """gb[['col']].max must agree with gb['col'].max().to_frame() and
    must not produce spurious NaN for groups that have values."""
    raw = """,Date,app,File
-04-23,2013-04-23 00:00:00,,log080001.log
-05-06,2013-05-06 00:00:00,,log.log
-05-07,2013-05-07 00:00:00,OE,xlsx"""
    df = pd.read_csv(compat.StringIO(raw), parse_dates=[0])
    gb = df.groupby('Date')
    r = gb[['File']].max()
    e = gb['File'].max().to_frame()
    tm.assert_frame_equal(r, e)
    assert not r['File'].isna().any()
def test_nlargest():
    """SeriesGroupBy.nlargest returns the top-n values per group with a
    MultiIndex of (group label, original position)."""
    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    b = Series(list('a' * 5 + 'b' * 5))
    gb = a.groupby(b)
    r = gb.nlargest(3)
    e = Series([
        7, 5, 3, 10, 9, 6
    ], index=MultiIndex.from_arrays([list('aaabbb'), [3, 2, 1, 9, 5, 8]]))
    tm.assert_series_equal(r, e)

    # with ties, keep='last' picks the later occurrence
    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    gb = a.groupby(b)
    e = Series([
        3, 2, 1, 3, 3, 2
    ], index=MultiIndex.from_arrays([list('aaabbb'), [2, 3, 1, 6, 5, 7]]))
    tm.assert_series_equal(gb.nlargest(3, keep='last'), e)
def test_nsmallest():
    """SeriesGroupBy.nsmallest returns the bottom-n values per group with
    a MultiIndex of (group label, original position)."""
    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    b = Series(list('a' * 5 + 'b' * 5))
    gb = a.groupby(b)
    r = gb.nsmallest(3)
    e = Series([
        1, 2, 3, 0, 4, 6
    ], index=MultiIndex.from_arrays([list('aaabbb'), [0, 4, 1, 6, 7, 8]]))
    tm.assert_series_equal(r, e)

    # with ties, keep='last' picks the later occurrence
    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    gb = a.groupby(b)
    e = Series([
        0, 1, 1, 0, 1, 2
    ], index=MultiIndex.from_arrays([list('aaabbb'), [4, 1, 0, 9, 8, 7]]))
    tm.assert_series_equal(gb.nsmallest(3, keep='last'), e)
@pytest.mark.parametrize("func", [
    'mean', 'var', 'std', 'cumprod', 'cumsum'
])
def test_numpy_compat(func):
    """numpy-style positional/keyword arguments to groupby reductions
    raise UnsupportedFunctionCall (see gh-12811)."""
    # see gh-12811
    df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]})
    g = df.groupby('A')

    msg = "numpy operations are not valid with groupby"

    with pytest.raises(UnsupportedFunctionCall, match=msg):
        getattr(g, func)(1, 2, 3)
    with pytest.raises(UnsupportedFunctionCall, match=msg):
        getattr(g, func)(foo=1)
def test_cummin_cummax():
    """Groupby cummin/cummax across numeric dtypes, dtype extreme values,
    NaN propagation, datetimes, and ordering within groups.

    GH 15048, GH 15561, GH 15635.
    """
    # GH 15048
    num_types = [np.int32, np.int64, np.float32, np.float64]
    num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min,
                np.finfo(np.float32).min, np.finfo(np.float64).min]
    num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max,
               np.finfo(np.float32).max, np.finfo(np.float64).max]
    base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2],
                            'B': [3, 4, 3, 2, 2, 3, 2, 1]})
    expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
    expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

    for dtype, min_val, max_val in zip(num_types, num_mins, num_max):
        df = base_df.astype(dtype)

        # cummin
        expected = pd.DataFrame({'B': expected_mins}).astype(dtype)
        result = df.groupby('A').cummin()
        tm.assert_frame_equal(result, expected)
        result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
        tm.assert_frame_equal(result, expected)

        # Test cummin w/ min value for dtype
        df.loc[[2, 6], 'B'] = min_val
        expected.loc[[2, 3, 6, 7], 'B'] = min_val
        result = df.groupby('A').cummin()
        tm.assert_frame_equal(result, expected)
        expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
        tm.assert_frame_equal(result, expected)

        # cummax
        expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
        result = df.groupby('A').cummax()
        tm.assert_frame_equal(result, expected)
        result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
        tm.assert_frame_equal(result, expected)

        # Test cummax w/ max value for dtype
        df.loc[[2, 6], 'B'] = max_val
        expected.loc[[2, 3, 6, 7], 'B'] = max_val
        result = df.groupby('A').cummax()
        tm.assert_frame_equal(result, expected)
        expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
        tm.assert_frame_equal(result, expected)

    # Test nan in some values
    base_df.loc[[0, 2, 4, 6], 'B'] = np.nan
    expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2,
                                   np.nan, 3, np.nan, 1]})
    result = base_df.groupby('A').cummin()
    tm.assert_frame_equal(result, expected)
    expected = (base_df.groupby('A')
                       .B
                       .apply(lambda x: x.cummin())
                       .to_frame())
    tm.assert_frame_equal(result, expected)

    expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4,
                                   np.nan, 3, np.nan, 3]})
    result = base_df.groupby('A').cummax()
    tm.assert_frame_equal(result, expected)
    expected = (base_df.groupby('A')
                       .B
                       .apply(lambda x: x.cummax())
                       .to_frame())
    tm.assert_frame_equal(result, expected)

    # Test nan in entire column
    base_df['B'] = np.nan
    expected = pd.DataFrame({'B': [np.nan] * 8})
    result = base_df.groupby('A').cummin()
    tm.assert_frame_equal(expected, result)
    result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
    tm.assert_frame_equal(expected, result)
    result = base_df.groupby('A').cummax()
    tm.assert_frame_equal(expected, result)
    result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
    tm.assert_frame_equal(expected, result)

    # GH 15561: datetime64 values survive cummin/cummax
    df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(['2001'])))
    expected = pd.Series(pd.to_datetime('2001'), index=[0], name='b')
    for method in ['cummax', 'cummin']:
        result = getattr(df.groupby('a')['b'], method)()
        tm.assert_series_equal(expected, result)

    # GH 15635: result stays in original row order
    df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1]))
    result = df.groupby('a').b.cummax()
    expected = pd.Series([2, 1, 2], name='b')
    tm.assert_series_equal(result, expected)

    df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
    result = df.groupby('a').b.cummin()
    expected = pd.Series([1, 2, 1], name='b')
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize('in_vals, out_vals', [
    # Basics: strictly increasing (T), strictly decreasing (F),
    # abs val increasing (F), non-strictly increasing (T)
    ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1],
     [True, False, False, True]),
    # Test with inf vals
    ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
     [True, False, True, False]),
    # Test with nan vals; should always be False
    ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
     [False, False, False, False]),
])
def test_is_monotonic_increasing(in_vals, out_vals):
    """SeriesGroupBy.is_monotonic_increasing evaluates monotonicity per
    group (GH 17015)."""
    # GH 17015
    source_dict = {
        'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'],
        'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'],
        'C': in_vals}
    df = pd.DataFrame(source_dict)
    result = df.groupby('B').C.is_monotonic_increasing
    index = Index(list('abcd'), name='B')
    expected = pd.Series(index=index, data=out_vals, name='C')
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_increasing.
    expected = (
        df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing))
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize('in_vals, out_vals', [
    # Basics: strictly decreasing (T), strictly increasing (F),
    # abs val decreasing (F), non-strictly increasing (T)
    ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1],
     [True, False, False, True]),
    # Test with inf vals
    ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
     [True, True, False, True]),
    # Test with nan vals; should always be False
    ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
     [False, False, False, False]),
])
def test_is_monotonic_decreasing(in_vals, out_vals):
    """SeriesGroupBy.is_monotonic_decreasing evaluates monotonicity per
    group (GH 17015)."""
    # GH 17015
    source_dict = {
        'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'],
        'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'],
        'C': in_vals}

    df = pd.DataFrame(source_dict)
    result = df.groupby('B').C.is_monotonic_decreasing
    index = Index(list('abcd'), name='B')
    expected = pd.Series(index=index, data=out_vals, name='C')
    tm.assert_series_equal(result, expected)
  565. # describe
  566. # --------------------------------
def test_apply_describe_bug(mframe):
    """describe() on a level-grouped MultiIndex frame must not raise.

    ``mframe`` is a conftest fixture.
    """
    grouped = mframe.groupby(level='first')
    grouped.describe()  # it works!
def test_series_describe_multikey():
    """Series describe under a two-key groupby exposes the summary stats
    as columns matching the direct reductions."""
    ts = tm.makeTimeSeries()
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()
    tm.assert_series_equal(result['mean'], grouped.mean(),
                           check_names=False)
    tm.assert_series_equal(result['std'], grouped.std(), check_names=False)
    tm.assert_series_equal(result['min'], grouped.min(), check_names=False)
def test_series_describe_single():
    """apply(describe) on a single-key series groupby equals the stacked
    groupby.describe() result."""
    ts = tm.makeTimeSeries()
    grouped = ts.groupby(lambda x: x.month)
    result = grouped.apply(lambda x: x.describe())
    expected = grouped.describe().stack()
    tm.assert_series_equal(result, expected)
def test_series_index_name(df):
    """Aggregation keeps the grouping key as the result index name.

    ``df`` is a conftest fixture.
    """
    grouped = df.loc[:, ['C']].groupby(df['A'])
    result = grouped.agg(lambda x: x.mean())
    assert result.index.name == 'A'
def test_frame_describe_multikey(tsframe):
    """Frame describe under multi-key / axis=1 groupbys builds the
    expected MultiIndex columns without duplicate levels (GH 17464)."""
    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()
    desc_groups = []
    for col in tsframe:
        group = grouped[col].describe()
        # GH 17464 - Remove duplicate MultiIndex levels
        group_col = pd.MultiIndex(
            levels=[[col], group.columns],
            codes=[[0] * len(group.columns), range(len(group.columns))])
        group = pd.DataFrame(group.values,
                             columns=group_col,
                             index=group.index)
        desc_groups.append(group)
    expected = pd.concat(desc_groups, axis=1)
    tm.assert_frame_equal(result, expected)

    # grouping the columns (axis=1) describes each column bucket;
    # expectation is the plain describe, transposed and re-keyed
    groupedT = tsframe.groupby({'A': 0, 'B': 0,
                                'C': 1, 'D': 1}, axis=1)
    result = groupedT.describe()
    expected = tsframe.describe().T
    expected.index = pd.MultiIndex(
        levels=[[0, 1], expected.index],
        codes=[[0, 0, 1, 1], range(len(expected.index))])
    tm.assert_frame_equal(result, expected)
def test_frame_describe_tupleindex():
    """describe() on groups keyed by tuples raises a clear error rather
    than building a broken MultiIndex (GH 14848, regression 0.19.1)."""
    # GH 14848 - regression from 0.19.0 to 0.19.1
    df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3,
                     'y': [10, 20, 30, 40, 50] * 3,
                     'z': [100, 200, 300, 400, 500] * 3})
    df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    df2 = df1.rename(columns={'k': 'key'})
    msg = "Names should be list-like for a MultiIndex"
    with pytest.raises(ValueError, match=msg):
        df1.groupby('k').describe()
    with pytest.raises(ValueError, match=msg):
        df2.groupby('key').describe()
def test_frame_describe_unstacked_format():
    """SeriesGroupBy.describe output has groups as rows and the summary
    statistics as columns (GH 4792)."""
    # GH 4792
    prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990,
              pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499,
              pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499}
    volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000,
               pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000,
               pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000}
    df = pd.DataFrame({'PRICE': prices,
                       'VOLUME': volumes})
    result = df.groupby('PRICE').VOLUME.describe()
    data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
            df[df.PRICE == 25499].VOLUME.describe().values.tolist()]
    expected = pd.DataFrame(data,
                            index=pd.Index([24990, 25499], name='PRICE'),
                            columns=['count', 'mean', 'std', 'min',
                                     '25%', '50%', '75%', 'max'])
    tm.assert_frame_equal(result, expected)
  642. # nunique
  643. # --------------------------------
@pytest.mark.parametrize('n', 10 ** np.arange(2, 6))
@pytest.mark.parametrize('m', [10, 100, 1000])
@pytest.mark.parametrize('sort', [False, True])
@pytest.mark.parametrize('dropna', [False, True])
def test_series_groupby_nunique(n, m, sort, dropna):
    """SeriesGroupBy.nunique matches apply(Series.nunique) over random
    data, with and without missing values, across sort/dropna/as_index."""

    def check_nunique(df, keys, as_index=True):
        # compare the cython nunique path against the generic apply path
        gr = df.groupby(keys, as_index=as_index, sort=sort)
        left = gr['julie'].nunique(dropna=dropna)

        gr = df.groupby(keys, as_index=as_index, sort=sort)
        right = gr['julie'].apply(Series.nunique, dropna=dropna)
        if not as_index:
            right = right.reset_index(drop=True)

        tm.assert_series_equal(left, right, check_names=False)

    days = date_range('2015-08-23', periods=10)

    frame = DataFrame({'jim': np.random.choice(list(ascii_lowercase), n),
                       'joe': np.random.choice(days, n),
                       'julie': np.random.randint(0, m, n)})

    check_nunique(frame, ['jim'])
    check_nunique(frame, ['jim', 'joe'])

    # sprinkle NaNs into every column, then re-check
    frame.loc[1::17, 'jim'] = None
    frame.loc[3::37, 'joe'] = None
    frame.loc[7::19, 'julie'] = None
    frame.loc[8::19, 'julie'] = None
    frame.loc[9::19, 'julie'] = None

    check_nunique(frame, ['jim'])
    check_nunique(frame, ['jim', 'joe'])
    check_nunique(frame, ['jim'], as_index=False)
    check_nunique(frame, ['jim', 'joe'], as_index=False)
  672. def test_nunique():
  673. df = DataFrame({
  674. 'A': list('abbacc'),
  675. 'B': list('abxacc'),
  676. 'C': list('abbacx'),
  677. })
  678. expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
  679. result = df.groupby('A', as_index=False).nunique()
  680. tm.assert_frame_equal(result, expected)
  681. # as_index
  682. expected.index = list('abc')
  683. expected.index.name = 'A'
  684. result = df.groupby('A').nunique()
  685. tm.assert_frame_equal(result, expected)
  686. # with na
  687. result = df.replace({'x': None}).groupby('A').nunique(dropna=False)
  688. tm.assert_frame_equal(result, expected)
  689. # dropna
  690. expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
  691. index=list('abc'))
  692. expected.index.name = 'A'
  693. result = df.replace({'x': None}).groupby('A').nunique()
  694. tm.assert_frame_equal(result, expected)
  695. def test_nunique_with_object():
  696. # GH 11077
  697. data = pd.DataFrame(
  698. [[100, 1, 'Alice'],
  699. [200, 2, 'Bob'],
  700. [300, 3, 'Charlie'],
  701. [-400, 4, 'Dan'],
  702. [500, 5, 'Edith']],
  703. columns=['amount', 'id', 'name']
  704. )
  705. result = data.groupby(['id', 'amount'])['name'].nunique()
  706. index = MultiIndex.from_arrays([data.id, data.amount])
  707. expected = pd.Series([1] * 5, name='name', index=index)
  708. tm.assert_series_equal(result, expected)
  709. def test_nunique_with_empty_series():
  710. # GH 12553
  711. data = pd.Series(name='name')
  712. result = data.groupby(level=0).nunique()
  713. expected = pd.Series(name='name', dtype='int64')
  714. tm.assert_series_equal(result, expected)
  715. def test_nunique_with_timegrouper():
  716. # GH 13453
  717. test = pd.DataFrame({
  718. 'time': [Timestamp('2016-06-28 09:35:35'),
  719. Timestamp('2016-06-28 16:09:30'),
  720. Timestamp('2016-06-28 16:46:28')],
  721. 'data': ['1', '2', '3']}).set_index('time')
  722. result = test.groupby(pd.Grouper(freq='h'))['data'].nunique()
  723. expected = test.groupby(
  724. pd.Grouper(freq='h')
  725. )['data'].apply(pd.Series.nunique)
  726. tm.assert_series_equal(result, expected)
# count
# --------------------------------
  729. def test_groupby_timedelta_cython_count():
  730. df = DataFrame({'g': list('ab' * 2),
  731. 'delt': np.arange(4).astype('timedelta64[ns]')})
  732. expected = Series([
  733. 2, 2
  734. ], index=pd.Index(['a', 'b'], name='g'), name='delt')
  735. result = df.groupby('g').delt.count()
  736. tm.assert_series_equal(expected, result)
  737. def test_count():
  738. n = 1 << 15
  739. dr = date_range('2015-08-30', periods=n // 10, freq='T')
  740. df = DataFrame({
  741. '1st': np.random.choice(
  742. list(ascii_lowercase), n),
  743. '2nd': np.random.randint(0, 5, n),
  744. '3rd': np.random.randn(n).round(3),
  745. '4th': np.random.randint(-10, 10, n),
  746. '5th': np.random.choice(dr, n),
  747. '6th': np.random.randn(n).round(3),
  748. '7th': np.random.randn(n).round(3),
  749. '8th': np.random.choice(dr, n) - np.random.choice(dr, 1),
  750. '9th': np.random.choice(
  751. list(ascii_lowercase), n)
  752. })
  753. for col in df.columns.drop(['1st', '2nd', '4th']):
  754. df.loc[np.random.choice(n, n // 10), col] = np.nan
  755. df['9th'] = df['9th'].astype('category')
  756. for key in '1st', '2nd', ['1st', '2nd']:
  757. left = df.groupby(key).count()
  758. right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
  759. tm.assert_frame_equal(left, right)
  760. # GH5610
  761. # count counts non-nulls
  762. df = pd.DataFrame([[1, 2, 'foo'],
  763. [1, np.nan, 'bar'],
  764. [3, np.nan, np.nan]],
  765. columns=['A', 'B', 'C'])
  766. count_as = df.groupby('A').count()
  767. count_not_as = df.groupby('A', as_index=False).count()
  768. expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'],
  769. index=[1, 3])
  770. expected.index.name = 'A'
  771. tm.assert_frame_equal(count_not_as, expected.reset_index())
  772. tm.assert_frame_equal(count_as, expected)
  773. count_B = df.groupby('A')['B'].count()
  774. tm.assert_series_equal(count_B, expected['B'])
  775. def test_count_object():
  776. df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3})
  777. result = df.groupby('c').a.count()
  778. expected = pd.Series([
  779. 3, 3
  780. ], index=pd.Index([2, 3], name='c'), name='a')
  781. tm.assert_series_equal(result, expected)
  782. df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3,
  783. 'c': [2] * 3 + [3] * 3})
  784. result = df.groupby('c').a.count()
  785. expected = pd.Series([
  786. 1, 3
  787. ], index=pd.Index([2, 3], name='c'), name='a')
  788. tm.assert_series_equal(result, expected)
  789. def test_count_cross_type():
  790. # GH8169
  791. vals = np.hstack((np.random.randint(0, 5, (100, 2)), np.random.randint(
  792. 0, 2, (100, 2))))
  793. df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd'])
  794. df[df == 2] = np.nan
  795. expected = df.groupby(['c', 'd']).count()
  796. for t in ['float32', 'object']:
  797. df['a'] = df['a'].astype(t)
  798. df['b'] = df['b'].astype(t)
  799. result = df.groupby(['c', 'd']).count()
  800. tm.assert_frame_equal(result, expected)
  801. def test_lower_int_prec_count():
  802. df = DataFrame({'a': np.array(
  803. [0, 1, 2, 100], np.int8),
  804. 'b': np.array(
  805. [1, 2, 3, 6], np.uint32),
  806. 'c': np.array(
  807. [4, 5, 6, 8], np.int16),
  808. 'grp': list('ab' * 2)})
  809. result = df.groupby('grp').count()
  810. expected = DataFrame({'a': [2, 2],
  811. 'b': [2, 2],
  812. 'c': [2, 2]}, index=pd.Index(list('ab'),
  813. name='grp'))
  814. tm.assert_frame_equal(result, expected)
  815. def test_count_uses_size_on_exception():
  816. class RaisingObjectException(Exception):
  817. pass
  818. class RaisingObject(object):
  819. def __init__(self, msg='I will raise inside Cython'):
  820. super(RaisingObject, self).__init__()
  821. self.msg = msg
  822. def __eq__(self, other):
  823. # gets called in Cython to check that raising calls the method
  824. raise RaisingObjectException(self.msg)
  825. df = DataFrame({'a': [RaisingObject() for _ in range(4)],
  826. 'grp': list('ab' * 2)})
  827. result = df.groupby('grp').count()
  828. expected = DataFrame({'a': [2, 2]}, index=pd.Index(
  829. list('ab'), name='grp'))
  830. tm.assert_frame_equal(result, expected)
# size
# --------------------------------
  833. def test_size(df):
  834. grouped = df.groupby(['A', 'B'])
  835. result = grouped.size()
  836. for key, group in grouped:
  837. assert result[key] == len(group)
  838. grouped = df.groupby('A')
  839. result = grouped.size()
  840. for key, group in grouped:
  841. assert result[key] == len(group)
  842. grouped = df.groupby('B')
  843. result = grouped.size()
  844. for key, group in grouped:
  845. assert result[key] == len(group)
  846. df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc'))
  847. for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])):
  848. left = df.groupby(key, sort=sort).size()
  849. right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0])
  850. tm.assert_series_equal(left, right, check_names=False)
  851. # GH11699
  852. df = DataFrame([], columns=['A', 'B'])
  853. out = Series([], dtype='int64', index=Index([], name='A'))
  854. tm.assert_series_equal(df.groupby('A').size(), out)
# pipe
# --------------------------------
  857. def test_pipe():
  858. # Test the pipe method of DataFrameGroupBy.
  859. # Issue #17871
  860. random_state = np.random.RandomState(1234567890)
  861. df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
  862. 'foo', 'bar', 'foo', 'foo'],
  863. 'B': random_state.randn(8),
  864. 'C': random_state.randn(8)})
  865. def f(dfgb):
  866. return dfgb.B.max() - dfgb.C.min().min()
  867. def square(srs):
  868. return srs ** 2
  869. # Note that the transformations are
  870. # GroupBy -> Series
  871. # Series -> Series
  872. # This then chains the GroupBy.pipe and the
  873. # NDFrame.pipe methods
  874. result = df.groupby('A').pipe(f).pipe(square)
  875. index = Index([u'bar', u'foo'], dtype='object', name=u'A')
  876. expected = pd.Series([8.99110003361, 8.17516964785], name='B',
  877. index=index)
  878. tm.assert_series_equal(expected, result)
  879. def test_pipe_args():
  880. # Test passing args to the pipe method of DataFrameGroupBy.
  881. # Issue #17871
  882. df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'],
  883. 'x': [1.0, 2.0, 3.0, 2.0, 5.0],
  884. 'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]})
  885. def f(dfgb, arg1):
  886. return (dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False)
  887. .groupby(dfgb.grouper))
  888. def g(dfgb, arg2):
  889. return dfgb.sum() / dfgb.sum().sum() + arg2
  890. def h(df, arg3):
  891. return df.x + df.y - arg3
  892. result = (df
  893. .groupby('group')
  894. .pipe(f, 0)
  895. .pipe(g, 10)
  896. .pipe(h, 100))
  897. # Assert the results here
  898. index = pd.Index(['A', 'B', 'C'], name='group')
  899. expected = pd.Series([-79.5160891089, -78.4839108911, -80],
  900. index=index)
  901. tm.assert_series_equal(expected, result)
  902. # test SeriesGroupby.pipe
  903. ser = pd.Series([1, 1, 2, 2, 3, 3])
  904. result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count())
  905. expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3]))
  906. tm.assert_series_equal(result, expected)
  907. def test_groupby_mean_no_overflow():
  908. # Regression test for (#22487)
  909. df = pd.DataFrame({
  910. "user": ["A", "A", "A", "A", "A"],
  911. "connections": [4970, 4749, 4719, 4704, 18446744073699999744]
  912. })
  913. assert df.groupby('user')['connections'].mean()['A'] == 3689348814740003840