test_groupby.py 55 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746
  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function
  3. from collections import defaultdict
  4. from datetime import datetime
  5. from decimal import Decimal
  6. import numpy as np
  7. import pytest
  8. from pandas.compat import (
  9. OrderedDict, StringIO, lmap, lrange, lzip, map, range, zip)
  10. from pandas.errors import PerformanceWarning
  11. import pandas as pd
  12. from pandas import (
  13. DataFrame, Index, MultiIndex, Panel, Series, Timestamp, compat, date_range,
  14. read_csv)
  15. import pandas.core.common as com
  16. import pandas.util.testing as tm
  17. from pandas.util.testing import (
  18. assert_almost_equal, assert_frame_equal, assert_series_equal)
  19. def test_repr():
  20. # GH18203
  21. result = repr(pd.Grouper(key='A', level='B'))
  22. expected = "Grouper(key='A', level='B', axis=0, sort=False)"
  23. assert result == expected
  24. @pytest.mark.parametrize('dtype', ['int64', 'int32', 'float64', 'float32'])
  25. def test_basic(dtype):
  26. data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)
  27. index = np.arange(9)
  28. np.random.shuffle(index)
  29. data = data.reindex(index)
  30. grouped = data.groupby(lambda x: x // 3)
  31. for k, v in grouped:
  32. assert len(v) == 3
  33. agged = grouped.aggregate(np.mean)
  34. assert agged[1] == 1
  35. assert_series_equal(agged, grouped.agg(np.mean)) # shorthand
  36. assert_series_equal(agged, grouped.mean())
  37. assert_series_equal(grouped.agg(np.sum), grouped.sum())
  38. expected = grouped.apply(lambda x: x * x.sum())
  39. transformed = grouped.transform(lambda x: x * x.sum())
  40. assert transformed[7] == 12
  41. assert_series_equal(transformed, expected)
  42. value_grouped = data.groupby(data)
  43. assert_series_equal(value_grouped.aggregate(np.mean), agged,
  44. check_index_type=False)
  45. # complex agg
  46. agged = grouped.aggregate([np.mean, np.std])
  47. with tm.assert_produces_warning(FutureWarning,
  48. check_stacklevel=False):
  49. agged = grouped.aggregate({'one': np.mean, 'two': np.std})
  50. group_constants = {0: 10, 1: 20, 2: 30}
  51. agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
  52. assert agged[1] == 21
  53. # corner cases
  54. msg = "Must produce aggregated value"
  55. # exception raised is type Exception
  56. with pytest.raises(Exception, match=msg):
  57. grouped.aggregate(lambda x: x * 2)
  58. def test_groupby_nonobject_dtype(mframe, df_mixed_floats):
  59. key = mframe.index.codes[0]
  60. grouped = mframe.groupby(key)
  61. result = grouped.sum()
  62. expected = mframe.groupby(key.astype('O')).sum()
  63. assert_frame_equal(result, expected)
  64. # GH 3911, mixed frame non-conversion
  65. df = df_mixed_floats.copy()
  66. df['value'] = lrange(len(df))
  67. def max_value(group):
  68. return group.loc[group['value'].idxmax()]
  69. applied = df.groupby('A').apply(max_value)
  70. result = applied.get_dtype_counts().sort_values()
  71. expected = Series({'float64': 2,
  72. 'int64': 1,
  73. 'object': 2}).sort_values()
  74. assert_series_equal(result, expected)
  75. def test_groupby_return_type():
  76. # GH2893, return a reduced type
  77. df1 = DataFrame(
  78. [{"val1": 1, "val2": 20},
  79. {"val1": 1, "val2": 19},
  80. {"val1": 2, "val2": 27},
  81. {"val1": 2, "val2": 12}
  82. ])
  83. def func(dataf):
  84. return dataf["val2"] - dataf["val2"].mean()
  85. result = df1.groupby("val1", squeeze=True).apply(func)
  86. assert isinstance(result, Series)
  87. df2 = DataFrame(
  88. [{"val1": 1, "val2": 20},
  89. {"val1": 1, "val2": 19},
  90. {"val1": 1, "val2": 27},
  91. {"val1": 1, "val2": 12}
  92. ])
  93. def func(dataf):
  94. return dataf["val2"] - dataf["val2"].mean()
  95. result = df2.groupby("val1", squeeze=True).apply(func)
  96. assert isinstance(result, Series)
  97. # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
  98. df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y'])
  99. result = df.groupby('X', squeeze=False).count()
  100. assert isinstance(result, DataFrame)
  101. # GH5592
  102. # inconcistent return type
  103. df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb',
  104. 'Pony', 'Pony'], B=Series(
  105. np.arange(7), dtype='int64'), C=date_range(
  106. '20130101', periods=7)))
  107. def f(grp):
  108. return grp.iloc[0]
  109. expected = df.groupby('A').first()[['B']]
  110. result = df.groupby('A').apply(f)[['B']]
  111. assert_frame_equal(result, expected)
  112. def f(grp):
  113. if grp.name == 'Tiger':
  114. return None
  115. return grp.iloc[0]
  116. result = df.groupby('A').apply(f)[['B']]
  117. e = expected.copy()
  118. e.loc['Tiger'] = np.nan
  119. assert_frame_equal(result, e)
  120. def f(grp):
  121. if grp.name == 'Pony':
  122. return None
  123. return grp.iloc[0]
  124. result = df.groupby('A').apply(f)[['B']]
  125. e = expected.copy()
  126. e.loc['Pony'] = np.nan
  127. assert_frame_equal(result, e)
  128. # 5592 revisited, with datetimes
  129. def f(grp):
  130. if grp.name == 'Pony':
  131. return None
  132. return grp.iloc[0]
  133. result = df.groupby('A').apply(f)[['C']]
  134. e = df.groupby('A').first()[['C']]
  135. e.loc['Pony'] = pd.NaT
  136. assert_frame_equal(result, e)
  137. # scalar outputs
  138. def f(grp):
  139. if grp.name == 'Pony':
  140. return None
  141. return grp.iloc[0].loc['C']
  142. result = df.groupby('A').apply(f)
  143. e = df.groupby('A').first()['C'].copy()
  144. e.loc['Pony'] = np.nan
  145. e.name = None
  146. assert_series_equal(result, e)
  147. def test_pass_args_kwargs(ts, tsframe):
  148. def f(x, q=None, axis=0):
  149. return np.percentile(x, q, axis=axis)
  150. g = lambda x: np.percentile(x, 80, axis=0)
  151. # Series
  152. ts_grouped = ts.groupby(lambda x: x.month)
  153. agg_result = ts_grouped.agg(np.percentile, 80, axis=0)
  154. apply_result = ts_grouped.apply(np.percentile, 80, axis=0)
  155. trans_result = ts_grouped.transform(np.percentile, 80, axis=0)
  156. agg_expected = ts_grouped.quantile(.8)
  157. trans_expected = ts_grouped.transform(g)
  158. assert_series_equal(apply_result, agg_expected)
  159. assert_series_equal(agg_result, agg_expected, check_names=False)
  160. assert_series_equal(trans_result, trans_expected)
  161. agg_result = ts_grouped.agg(f, q=80)
  162. apply_result = ts_grouped.apply(f, q=80)
  163. trans_result = ts_grouped.transform(f, q=80)
  164. assert_series_equal(agg_result, agg_expected)
  165. assert_series_equal(apply_result, agg_expected)
  166. assert_series_equal(trans_result, trans_expected)
  167. # DataFrame
  168. df_grouped = tsframe.groupby(lambda x: x.month)
  169. agg_result = df_grouped.agg(np.percentile, 80, axis=0)
  170. apply_result = df_grouped.apply(DataFrame.quantile, .8)
  171. expected = df_grouped.quantile(.8)
  172. assert_frame_equal(apply_result, expected)
  173. assert_frame_equal(agg_result, expected, check_names=False)
  174. agg_result = df_grouped.agg(f, q=80)
  175. apply_result = df_grouped.apply(DataFrame.quantile, q=.8)
  176. assert_frame_equal(agg_result, expected, check_names=False)
  177. assert_frame_equal(apply_result, expected)
  178. def test_len():
  179. df = tm.makeTimeDataFrame()
  180. grouped = df.groupby([lambda x: x.year, lambda x: x.month,
  181. lambda x: x.day])
  182. assert len(grouped) == len(df)
  183. grouped = df.groupby([lambda x: x.year, lambda x: x.month])
  184. expected = len({(x.year, x.month) for x in df.index})
  185. assert len(grouped) == expected
  186. # issue 11016
  187. df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
  188. assert len(df.groupby(('a'))) == 0
  189. assert len(df.groupby(('b'))) == 3
  190. assert len(df.groupby(['a', 'b'])) == 3
  191. def test_basic_regression():
  192. # regression
  193. T = [1.0 * x for x in lrange(1, 10) * 10][:1095]
  194. result = Series(T, lrange(0, len(T)))
  195. groupings = np.random.random((1100, ))
  196. groupings = Series(groupings, lrange(0, len(groupings))) * 10.
  197. grouped = result.groupby(groupings)
  198. grouped.mean()
  199. @pytest.mark.parametrize('dtype', ['float64', 'float32', 'int64',
  200. 'int32', 'int16', 'int8'])
  201. def test_with_na_groups(dtype):
  202. index = Index(np.arange(10))
  203. values = Series(np.ones(10), index, dtype=dtype)
  204. labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan,
  205. 'bar', 'bar', np.nan, 'foo'], index=index)
  206. # this SHOULD be an int
  207. grouped = values.groupby(labels)
  208. agged = grouped.agg(len)
  209. expected = Series([4, 2], index=['bar', 'foo'])
  210. assert_series_equal(agged, expected, check_dtype=False)
  211. # assert issubclass(agged.dtype.type, np.integer)
  212. # explicitly return a float from my function
  213. def f(x):
  214. return float(len(x))
  215. agged = grouped.agg(f)
  216. expected = Series([4, 2], index=['bar', 'foo'])
  217. assert_series_equal(agged, expected, check_dtype=False)
  218. assert issubclass(agged.dtype.type, np.dtype(dtype).type)
  219. def test_indices_concatenation_order():
  220. # GH 2808
  221. def f1(x):
  222. y = x[(x.b % 2) == 1] ** 2
  223. if y.empty:
  224. multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2,
  225. names=['b', 'c'])
  226. res = DataFrame(None, columns=['a'], index=multiindex)
  227. return res
  228. else:
  229. y = y.set_index(['b', 'c'])
  230. return y
  231. def f2(x):
  232. y = x[(x.b % 2) == 1] ** 2
  233. if y.empty:
  234. return DataFrame()
  235. else:
  236. y = y.set_index(['b', 'c'])
  237. return y
  238. def f3(x):
  239. y = x[(x.b % 2) == 1] ** 2
  240. if y.empty:
  241. multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2,
  242. names=['foo', 'bar'])
  243. res = DataFrame(None, columns=['a', 'b'], index=multiindex)
  244. return res
  245. else:
  246. return y
  247. df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})
  248. df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})
  249. # correct result
  250. result1 = df.groupby('a').apply(f1)
  251. result2 = df2.groupby('a').apply(f1)
  252. assert_frame_equal(result1, result2)
  253. # should fail (not the same number of levels)
  254. msg = "Cannot concat indices that do not have the same number of levels"
  255. with pytest.raises(AssertionError, match=msg):
  256. df.groupby('a').apply(f2)
  257. with pytest.raises(AssertionError, match=msg):
  258. df2.groupby('a').apply(f2)
  259. # should fail (incorrect shape)
  260. with pytest.raises(AssertionError, match=msg):
  261. df.groupby('a').apply(f3)
  262. with pytest.raises(AssertionError, match=msg):
  263. df2.groupby('a').apply(f3)
  264. def test_attr_wrapper(ts):
  265. grouped = ts.groupby(lambda x: x.weekday())
  266. result = grouped.std()
  267. expected = grouped.agg(lambda x: np.std(x, ddof=1))
  268. assert_series_equal(result, expected)
  269. # this is pretty cool
  270. result = grouped.describe()
  271. expected = {name: gp.describe() for name, gp in grouped}
  272. expected = DataFrame(expected).T
  273. assert_frame_equal(result, expected)
  274. # get attribute
  275. result = grouped.dtype
  276. expected = grouped.agg(lambda x: x.dtype)
  277. # make sure raises error
  278. msg = "'SeriesGroupBy' object has no attribute 'foo'"
  279. with pytest.raises(AttributeError, match=msg):
  280. getattr(grouped, 'foo')
  281. def test_frame_groupby(tsframe):
  282. grouped = tsframe.groupby(lambda x: x.weekday())
  283. # aggregate
  284. aggregated = grouped.aggregate(np.mean)
  285. assert len(aggregated) == 5
  286. assert len(aggregated.columns) == 4
  287. # by string
  288. tscopy = tsframe.copy()
  289. tscopy['weekday'] = [x.weekday() for x in tscopy.index]
  290. stragged = tscopy.groupby('weekday').aggregate(np.mean)
  291. assert_frame_equal(stragged, aggregated, check_names=False)
  292. # transform
  293. grouped = tsframe.head(30).groupby(lambda x: x.weekday())
  294. transformed = grouped.transform(lambda x: x - x.mean())
  295. assert len(transformed) == 30
  296. assert len(transformed.columns) == 4
  297. # transform propagate
  298. transformed = grouped.transform(lambda x: x.mean())
  299. for name, group in grouped:
  300. mean = group.mean()
  301. for idx in group.index:
  302. tm.assert_series_equal(transformed.xs(idx), mean,
  303. check_names=False)
  304. # iterate
  305. for weekday, group in grouped:
  306. assert group.index[0].weekday() == weekday
  307. # groups / group_indices
  308. groups = grouped.groups
  309. indices = grouped.indices
  310. for k, v in compat.iteritems(groups):
  311. samething = tsframe.index.take(indices[k])
  312. assert (samething == v).all()
  313. def test_frame_groupby_columns(tsframe):
  314. mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
  315. grouped = tsframe.groupby(mapping, axis=1)
  316. # aggregate
  317. aggregated = grouped.aggregate(np.mean)
  318. assert len(aggregated) == len(tsframe)
  319. assert len(aggregated.columns) == 2
  320. # transform
  321. tf = lambda x: x - x.mean()
  322. groupedT = tsframe.T.groupby(mapping, axis=0)
  323. assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf))
  324. # iterate
  325. for k, v in grouped:
  326. assert len(v.columns) == 2
  327. def test_frame_set_name_single(df):
  328. grouped = df.groupby('A')
  329. result = grouped.mean()
  330. assert result.index.name == 'A'
  331. result = df.groupby('A', as_index=False).mean()
  332. assert result.index.name != 'A'
  333. result = grouped.agg(np.mean)
  334. assert result.index.name == 'A'
  335. result = grouped.agg({'C': np.mean, 'D': np.std})
  336. assert result.index.name == 'A'
  337. result = grouped['C'].mean()
  338. assert result.index.name == 'A'
  339. result = grouped['C'].agg(np.mean)
  340. assert result.index.name == 'A'
  341. result = grouped['C'].agg([np.mean, np.std])
  342. assert result.index.name == 'A'
  343. with tm.assert_produces_warning(FutureWarning,
  344. check_stacklevel=False):
  345. result = grouped['C'].agg({'foo': np.mean, 'bar': np.std})
  346. assert result.index.name == 'A'
  347. def test_multi_func(df):
  348. col1 = df['A']
  349. col2 = df['B']
  350. grouped = df.groupby([col1.get, col2.get])
  351. agged = grouped.mean()
  352. expected = df.groupby(['A', 'B']).mean()
  353. # TODO groupby get drops names
  354. assert_frame_equal(agged.loc[:, ['C', 'D']],
  355. expected.loc[:, ['C', 'D']],
  356. check_names=False)
  357. # some "groups" with no data
  358. df = DataFrame({'v1': np.random.randn(6),
  359. 'v2': np.random.randn(6),
  360. 'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
  361. 'k2': np.array(['1', '1', '1', '2', '2', '2'])},
  362. index=['one', 'two', 'three', 'four', 'five', 'six'])
  363. # only verify that it works for now
  364. grouped = df.groupby(['k1', 'k2'])
  365. grouped.agg(np.sum)
  366. def test_multi_key_multiple_functions(df):
  367. grouped = df.groupby(['A', 'B'])['C']
  368. agged = grouped.agg([np.mean, np.std])
  369. expected = DataFrame({'mean': grouped.agg(np.mean),
  370. 'std': grouped.agg(np.std)})
  371. assert_frame_equal(agged, expected)
  372. def test_frame_multi_key_function_list():
  373. data = DataFrame(
  374. {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
  375. 'foo', 'foo', 'foo'],
  376. 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
  377. 'two', 'two', 'one'],
  378. 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
  379. 'dull', 'shiny', 'shiny', 'shiny'],
  380. 'D': np.random.randn(11),
  381. 'E': np.random.randn(11),
  382. 'F': np.random.randn(11)})
  383. grouped = data.groupby(['A', 'B'])
  384. funcs = [np.mean, np.std]
  385. agged = grouped.agg(funcs)
  386. expected = pd.concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs),
  387. grouped['F'].agg(funcs)],
  388. keys=['D', 'E', 'F'], axis=1)
  389. assert (isinstance(agged.index, MultiIndex))
  390. assert (isinstance(expected.index, MultiIndex))
  391. assert_frame_equal(agged, expected)
  392. @pytest.mark.parametrize('op', [lambda x: x.sum(), lambda x: x.mean()])
  393. @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
  394. def test_groupby_multiple_columns(df, op):
  395. data = df
  396. grouped = data.groupby(['A', 'B'])
  397. result1 = op(grouped)
  398. expected = defaultdict(dict)
  399. for n1, gp1 in data.groupby('A'):
  400. for n2, gp2 in gp1.groupby('B'):
  401. expected[n1][n2] = op(gp2.loc[:, ['C', 'D']])
  402. expected = {k: DataFrame(v)
  403. for k, v in compat.iteritems(expected)}
  404. expected = Panel.fromDict(expected).swapaxes(0, 1)
  405. expected.major_axis.name, expected.minor_axis.name = 'A', 'B'
  406. # a little bit crude
  407. for col in ['C', 'D']:
  408. result_col = op(grouped[col])
  409. exp = expected[col]
  410. pivoted = result1[col].unstack()
  411. pivoted2 = result_col.unstack()
  412. assert_frame_equal(pivoted.reindex_like(exp), exp)
  413. assert_frame_equal(pivoted2.reindex_like(exp), exp)
  414. # test single series works the same
  415. result = data['C'].groupby([data['A'], data['B']]).mean()
  416. expected = data.groupby(['A', 'B']).mean()['C']
  417. assert_series_equal(result, expected)
  418. def test_groupby_as_index_agg(df):
  419. grouped = df.groupby('A', as_index=False)
  420. # single-key
  421. result = grouped.agg(np.mean)
  422. expected = grouped.mean()
  423. assert_frame_equal(result, expected)
  424. result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
  425. expected2 = grouped.mean()
  426. expected2['D'] = grouped.sum()['D']
  427. assert_frame_equal(result2, expected2)
  428. grouped = df.groupby('A', as_index=True)
  429. expected3 = grouped['C'].sum()
  430. expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
  431. with tm.assert_produces_warning(FutureWarning,
  432. check_stacklevel=False):
  433. result3 = grouped['C'].agg({'Q': np.sum})
  434. assert_frame_equal(result3, expected3)
  435. # multi-key
  436. grouped = df.groupby(['A', 'B'], as_index=False)
  437. result = grouped.agg(np.mean)
  438. expected = grouped.mean()
  439. assert_frame_equal(result, expected)
  440. result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
  441. expected2 = grouped.mean()
  442. expected2['D'] = grouped.sum()['D']
  443. assert_frame_equal(result2, expected2)
  444. expected3 = grouped['C'].sum()
  445. expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
  446. result3 = grouped['C'].agg({'Q': np.sum})
  447. assert_frame_equal(result3, expected3)
  448. # GH7115 & GH8112 & GH8582
  449. df = DataFrame(np.random.randint(0, 100, (50, 3)),
  450. columns=['jim', 'joe', 'jolie'])
  451. ts = Series(np.random.randint(5, 10, 50), name='jim')
  452. gr = df.groupby(ts)
  453. gr.nth(0) # invokes set_selection_from_grouper internally
  454. assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))
  455. for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']:
  456. gr = df.groupby(ts, as_index=False)
  457. left = getattr(gr, attr)()
  458. gr = df.groupby(ts.values, as_index=True)
  459. right = getattr(gr, attr)().reset_index(drop=True)
  460. assert_frame_equal(left, right)
  461. def test_as_index_series_return_frame(df):
  462. grouped = df.groupby('A', as_index=False)
  463. grouped2 = df.groupby(['A', 'B'], as_index=False)
  464. result = grouped['C'].agg(np.sum)
  465. expected = grouped.agg(np.sum).loc[:, ['A', 'C']]
  466. assert isinstance(result, DataFrame)
  467. assert_frame_equal(result, expected)
  468. result2 = grouped2['C'].agg(np.sum)
  469. expected2 = grouped2.agg(np.sum).loc[:, ['A', 'B', 'C']]
  470. assert isinstance(result2, DataFrame)
  471. assert_frame_equal(result2, expected2)
  472. result = grouped['C'].sum()
  473. expected = grouped.sum().loc[:, ['A', 'C']]
  474. assert isinstance(result, DataFrame)
  475. assert_frame_equal(result, expected)
  476. result2 = grouped2['C'].sum()
  477. expected2 = grouped2.sum().loc[:, ['A', 'B', 'C']]
  478. assert isinstance(result2, DataFrame)
  479. assert_frame_equal(result2, expected2)
  480. def test_as_index_series_column_slice_raises(df):
  481. # GH15072
  482. grouped = df.groupby('A', as_index=False)
  483. msg = r"Column\(s\) C already selected"
  484. with pytest.raises(IndexError, match=msg):
  485. grouped['C'].__getitem__('D')
  486. def test_groupby_as_index_cython(df):
  487. data = df
  488. # single-key
  489. grouped = data.groupby('A', as_index=False)
  490. result = grouped.mean()
  491. expected = data.groupby(['A']).mean()
  492. expected.insert(0, 'A', expected.index)
  493. expected.index = np.arange(len(expected))
  494. assert_frame_equal(result, expected)
  495. # multi-key
  496. grouped = data.groupby(['A', 'B'], as_index=False)
  497. result = grouped.mean()
  498. expected = data.groupby(['A', 'B']).mean()
  499. arrays = lzip(*expected.index.values)
  500. expected.insert(0, 'A', arrays[0])
  501. expected.insert(1, 'B', arrays[1])
  502. expected.index = np.arange(len(expected))
  503. assert_frame_equal(result, expected)
  504. def test_groupby_as_index_series_scalar(df):
  505. grouped = df.groupby(['A', 'B'], as_index=False)
  506. # GH #421
  507. result = grouped['C'].agg(len)
  508. expected = grouped.agg(len).loc[:, ['A', 'B', 'C']]
  509. assert_frame_equal(result, expected)
  510. def test_groupby_as_index_corner(df, ts):
  511. msg = "as_index=False only valid with DataFrame"
  512. with pytest.raises(TypeError, match=msg):
  513. ts.groupby(lambda x: x.weekday(), as_index=False)
  514. msg = "as_index=False only valid for axis=0"
  515. with pytest.raises(ValueError, match=msg):
  516. df.groupby(lambda x: x.lower(), as_index=False, axis=1)
  517. def test_groupby_multiple_key(df):
  518. df = tm.makeTimeDataFrame()
  519. grouped = df.groupby([lambda x: x.year, lambda x: x.month,
  520. lambda x: x.day])
  521. agged = grouped.sum()
  522. assert_almost_equal(df.values, agged.values)
  523. grouped = df.T.groupby([lambda x: x.year,
  524. lambda x: x.month,
  525. lambda x: x.day], axis=1)
  526. agged = grouped.agg(lambda x: x.sum())
  527. tm.assert_index_equal(agged.index, df.columns)
  528. assert_almost_equal(df.T.values, agged.values)
  529. agged = grouped.agg(lambda x: x.sum())
  530. assert_almost_equal(df.T.values, agged.values)
  531. def test_groupby_multi_corner(df):
  532. # test that having an all-NA column doesn't mess you up
  533. df = df.copy()
  534. df['bad'] = np.nan
  535. agged = df.groupby(['A', 'B']).mean()
  536. expected = df.groupby(['A', 'B']).mean()
  537. expected['bad'] = np.nan
  538. assert_frame_equal(agged, expected)
  539. def test_omit_nuisance(df):
  540. grouped = df.groupby('A')
  541. result = grouped.mean()
  542. expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean()
  543. assert_frame_equal(result, expected)
  544. agged = grouped.agg(np.mean)
  545. exp = grouped.mean()
  546. assert_frame_equal(agged, exp)
  547. df = df.loc[:, ['A', 'C', 'D']]
  548. df['E'] = datetime.now()
  549. grouped = df.groupby('A')
  550. result = grouped.agg(np.sum)
  551. expected = grouped.sum()
  552. assert_frame_equal(result, expected)
  553. # won't work with axis = 1
  554. grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1)
  555. msg = (r'\("unsupported operand type\(s\) for \+: '
  556. "'Timestamp' and 'float'\""
  557. r", u?'occurred at index 0'\)")
  558. with pytest.raises(TypeError, match=msg):
  559. grouped.agg(lambda x: x.sum(0, numeric_only=False))
  560. def test_omit_nuisance_python_multiple(three_group):
  561. grouped = three_group.groupby(['A', 'B'])
  562. agged = grouped.agg(np.mean)
  563. exp = grouped.mean()
  564. assert_frame_equal(agged, exp)
  565. def test_empty_groups_corner(mframe):
  566. # handle empty groups
  567. df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
  568. 'k2': np.array(['1', '1', '1', '2', '2', '2']),
  569. 'k3': ['foo', 'bar'] * 3,
  570. 'v1': np.random.randn(6),
  571. 'v2': np.random.randn(6)})
  572. grouped = df.groupby(['k1', 'k2'])
  573. result = grouped.agg(np.mean)
  574. expected = grouped.mean()
  575. assert_frame_equal(result, expected)
  576. grouped = mframe[3:5].groupby(level=0)
  577. agged = grouped.apply(lambda x: x.mean())
  578. agged_A = grouped['A'].apply(np.mean)
  579. assert_series_equal(agged['A'], agged_A)
  580. assert agged.index.name == 'first'
  581. def test_nonsense_func():
  582. df = DataFrame([0])
  583. msg = r"unsupported operand type\(s\) for \+: '(int|long)' and 'str'"
  584. with pytest.raises(TypeError, match=msg):
  585. df.groupby(lambda x: x + 'foo')
  586. def test_wrap_aggregated_output_multindex(mframe):
  587. df = mframe.T
  588. df['baz', 'two'] = 'peekaboo'
  589. keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
  590. agged = df.groupby(keys).agg(np.mean)
  591. assert isinstance(agged.columns, MultiIndex)
  592. def aggfun(ser):
  593. if ser.name == ('foo', 'one'):
  594. raise TypeError
  595. else:
  596. return ser.sum()
  597. agged2 = df.groupby(keys).aggregate(aggfun)
  598. assert len(agged2.columns) + 1 == len(df.columns)
  599. def test_groupby_level_apply(mframe):
  600. result = mframe.groupby(level=0).count()
  601. assert result.index.name == 'first'
  602. result = mframe.groupby(level=1).count()
  603. assert result.index.name == 'second'
  604. result = mframe['A'].groupby(level=0).count()
  605. assert result.index.name == 'first'
def test_groupby_level_mapper(mframe):
    # A dict mapper combined with ``level=`` should be equivalent to
    # grouping by the hand-mapped level values.
    deleveled = mframe.reset_index()

    mapper0 = {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1}
    mapper1 = {'one': 0, 'two': 0, 'three': 1}

    result0 = mframe.groupby(mapper0, level=0).sum()
    result1 = mframe.groupby(mapper1, level=1).sum()

    # map each level's labels manually to build the expected groupings
    mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']])
    mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']])
    expected0 = mframe.groupby(mapped_level0).sum()
    expected1 = mframe.groupby(mapped_level1).sum()
    # the mapped groupby keeps the original level names
    expected0.index.name, expected1.index.name = 'first', 'second'

    assert_frame_equal(result0, expected0)
    assert_frame_equal(result1, expected1)
def test_groupby_level_nonmulti():
    # GH 1313, GH 13901
    s = Series([1, 2, 3, 10, 4, 5, 20, 6],
               Index([1, 2, 3, 1, 4, 5, 2, 6], name='foo'))
    expected = Series([11, 22, 3, 4, 5, 6],
                      Index(range(1, 7), name='foo'))

    # level=0 / level=-1 (scalar or single-element list) on a flat index
    # all group by the index itself
    result = s.groupby(level=0).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=[0]).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=-1).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=[-1]).sum()
    tm.assert_series_equal(result, expected)

    # out-of-range or multiple levels are invalid for a non-MultiIndex
    msg = "level > 0 or level < -1 only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=1)
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=-2)
    msg = "No group keys passed!"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[])
    msg = "multiple levels only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[0, 0])
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[0, 1])
    msg = "level > 0 or level < -1 only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[1])
def test_groupby_complex():
    # GH 12902
    # complex values survive both groupby-sum and level-sum
    a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
    expected = Series((1 + 2j, 5 + 10j))

    result = a.groupby(level=0).sum()
    assert_series_equal(result, expected)

    result = a.sum(level=0)
    assert_series_equal(result, expected)
def test_mutate_groups():
    # GH3380
    # Mutating a group inside apply() must give the same result as
    # operating on a copy.
    df = DataFrame({
        'cat1': ['a'] * 8 + ['b'] * 6,
        'cat2': ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 +
        ['d'] * 2 + ['e'] * 2,
        'cat3': lmap(lambda x: 'g%s' % x, lrange(1, 15)),
        'val': np.random.randint(100, size=14),
    })

    def f_copy(x):
        # work on a defensive copy of the group
        x = x.copy()
        x['rank'] = x.val.rank(method='min')
        return x.groupby('cat2')['rank'].min()

    def f_no_copy(x):
        # mutate the group in place
        x['rank'] = x.val.rank(method='min')
        return x.groupby('cat2')['rank'].min()

    grpby_copy = df.groupby('cat1').apply(f_copy)
    grpby_no_copy = df.groupby('cat1').apply(f_no_copy)
    assert_series_equal(grpby_copy, grpby_no_copy)
def test_no_mutate_but_looks_like():
    # GH 8467
    # first shows mutation indicator
    # second does not, but should yield the same results
    df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'value': range(9)})

    result1 = df.groupby('key', group_keys=True).apply(lambda x: x[:].key)
    result2 = df.groupby('key', group_keys=True).apply(lambda x: x.key)
    assert_series_equal(result1, result2)
  684. def test_groupby_series_indexed_differently():
  685. s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7],
  686. index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g']))
  687. s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0],
  688. index=Index(['a', 'b', 'd', 'f', 'g', 'h']))
  689. grouped = s1.groupby(s2)
  690. agged = grouped.mean()
  691. exp = s1.groupby(s2.reindex(s1.index).get).mean()
  692. assert_series_equal(agged, exp)
def test_groupby_with_hier_columns():
    # Grouping a frame that has MultiIndex rows AND columns should leave
    # the non-grouped axis' MultiIndex intact.
    tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux',
                         'qux'], ['one', 'two', 'one', 'two', 'one', 'two',
                                  'one', 'two']]))
    index = MultiIndex.from_tuples(tuples)
    columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), (
        'B', 'cat'), ('A', 'dog')])
    df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)

    result = df.groupby(level=0).mean()
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0, axis=1).mean()
    tm.assert_index_equal(result.index, df.index)

    result = df.groupby(level=0).agg(np.mean)
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0).apply(lambda x: x.mean())
    tm.assert_index_equal(result.columns, columns)

    # grouping along the column axis collapses columns to the grouped level
    result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
    tm.assert_index_equal(result.columns, Index(['A', 'B']))
    tm.assert_index_equal(result.index, df.index)

    # add a nuisance column
    sorted_columns, _ = columns.sortlevel(0)
    df['A', 'foo'] = 'bar'
    # the non-numeric nuisance column is dropped by mean()
    result = df.groupby(level=0).mean()
    tm.assert_index_equal(result.columns, df.columns[:-1])
  717. def test_grouping_ndarray(df):
  718. grouped = df.groupby(df['A'].values)
  719. result = grouped.sum()
  720. expected = df.groupby('A').sum()
  721. assert_frame_equal(result, expected, check_names=False
  722. ) # Note: no names when grouping by value
  723. def test_groupby_wrong_multi_labels():
  724. data = """index,foo,bar,baz,spam,data
  725. 0,foo1,bar1,baz1,spam2,20
  726. 1,foo1,bar2,baz1,spam3,30
  727. 2,foo2,bar2,baz1,spam2,40
  728. 3,foo1,bar1,baz2,spam1,50
  729. 4,foo3,bar1,baz2,spam1,60"""
  730. data = read_csv(StringIO(data), index_col=0)
  731. grouped = data.groupby(['foo', 'bar', 'baz', 'spam'])
  732. result = grouped.agg(np.mean)
  733. expected = grouped.mean()
  734. assert_frame_equal(result, expected)
  735. def test_groupby_series_with_name(df):
  736. result = df.groupby(df['A']).mean()
  737. result2 = df.groupby(df['A'], as_index=False).mean()
  738. assert result.index.name == 'A'
  739. assert 'A' in result2
  740. result = df.groupby([df['A'], df['B']]).mean()
  741. result2 = df.groupby([df['A'], df['B']],
  742. as_index=False).mean()
  743. assert result.index.names == ('A', 'B')
  744. assert 'A' in result2
  745. assert 'B' in result2
  746. def test_seriesgroupby_name_attr(df):
  747. # GH 6265
  748. result = df.groupby('A')['C']
  749. assert result.count().name == 'C'
  750. assert result.mean().name == 'C'
  751. testFunc = lambda x: np.sum(x) * 2
  752. assert result.agg(testFunc).name == 'C'
  753. def test_consistency_name():
  754. # GH 12363
  755. df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
  756. 'foo', 'bar', 'foo', 'foo'],
  757. 'B': ['one', 'one', 'two', 'two',
  758. 'two', 'two', 'one', 'two'],
  759. 'C': np.random.randn(8) + 1.0,
  760. 'D': np.arange(8)})
  761. expected = df.groupby(['A']).B.count()
  762. result = df.B.groupby(df.A).count()
  763. assert_series_equal(result, expected)
def test_groupby_name_propagation(df):
    # GH 6124
    # apply() propagates a consistent Series name to the result columns,
    # and drops it when the names differ across groups.
    def summarize(df, name=None):
        return Series({'count': 1, 'mean': 2, 'omissions': 3, }, name=name)

    def summarize_random_name(df):
        # Provide a different name for each Series. In this case, groupby
        # should not attempt to propagate the Series name since they are
        # inconsistent.
        return Series({
            'count': 1,
            'mean': 2,
            'omissions': 3,
        }, name=df.iloc[0]['A'])

    metrics = df.groupby('A').apply(summarize)
    assert metrics.columns.name is None
    metrics = df.groupby('A').apply(summarize, 'metrics')
    assert metrics.columns.name == 'metrics'
    metrics = df.groupby('A').apply(summarize_random_name)
    assert metrics.columns.name is None
  783. def test_groupby_nonstring_columns():
  784. df = DataFrame([np.arange(10) for x in range(10)])
  785. grouped = df.groupby(0)
  786. result = grouped.mean()
  787. expected = df.groupby(df[0]).mean()
  788. assert_frame_equal(result, expected)
  789. def test_groupby_mixed_type_columns():
  790. # GH 13432, unorderable types in py3
  791. df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0])
  792. expected = DataFrame([[1, 2]], columns=['B', 0],
  793. index=Index([0], name='A'))
  794. result = df.groupby('A').first()
  795. tm.assert_frame_equal(result, expected)
  796. result = df.groupby('A').sum()
  797. tm.assert_frame_equal(result, expected)
# TODO: Ensure warning isn't emitted in the first place
@pytest.mark.filterwarnings("ignore:Mean of:RuntimeWarning")
def test_cython_grouper_series_bug_noncontig():
    # A non-contiguous (strided) all-NaN series must still aggregate
    # cleanly through the cython grouper.
    arr = np.empty((100, 100))
    arr.fill(np.nan)
    obj = Series(arr[:, 0], index=lrange(100))
    inds = np.tile(lrange(10), 10)

    result = obj.groupby(inds).agg(Series.median)
    assert result.isna().all()
def test_series_grouper_noncontig_index():
    # Regression smoke test: aggregating over a series built on a strided
    # (non-contiguous) index must not crash.
    index = Index(tm.rands_array(10, 100))

    values = Series(np.random.randn(50), index=index[::2])
    labels = np.random.randint(0, 5, 50)

    # it works!
    grouped = values.groupby(labels)

    # accessing the index elements causes segfault
    f = lambda x: len(set(map(id, x.index)))
    grouped.agg(f)
def test_convert_objects_leave_decimal_alone():
    # Aggregations returning Decimal must keep object dtype instead of
    # being coerced to float.
    s = Series(lrange(5))
    labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O')

    def convert_fast(x):
        return Decimal(str(x.mean()))

    def convert_force_pure(x):
        # base will be length 0
        # NOTE(review): the assert below checks base is NON-empty, so the
        # comment above looks stale — confirm intent.
        assert (len(x.values.base) > 0)
        return Decimal(str(x.mean()))

    grouped = s.groupby(labels)

    result = grouped.agg(convert_fast)
    assert result.dtype == np.object_
    assert isinstance(result[0], Decimal)

    result = grouped.agg(convert_force_pure)
    assert result.dtype == np.object_
    assert isinstance(result[0], Decimal)
def test_groupby_dtype_inference_empty():
    # GH 6733
    # grouping an empty frame must not change the value columns' dtypes
    df = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')})
    assert df['x'].dtype == np.float64

    result = df.groupby('x').first()
    exp_index = Index([], name='x', dtype=np.float64)
    expected = DataFrame({'range': Series(
        [], index=exp_index, dtype='int64')})
    assert_frame_equal(result, expected, by_blocks=True)
  841. def test_groupby_list_infer_array_like(df):
  842. result = df.groupby(list(df['A'])).mean()
  843. expected = df.groupby(df['A']).mean()
  844. assert_frame_equal(result, expected, check_names=False)
  845. with pytest.raises(KeyError, match=r"^'foo'$"):
  846. df.groupby(list(df['A'][:-1]))
  847. # pathological case of ambiguity
  848. df = DataFrame({'foo': [0, 1],
  849. 'bar': [3, 4],
  850. 'val': np.random.randn(2)})
  851. result = df.groupby(['foo', 'bar']).mean()
  852. expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
def test_groupby_keys_same_size_as_index():
    # GH 11185
    # a Grouper plus a column key, each the same length as the index,
    # should behave like set_index on both
    freq = 's'
    index = pd.date_range(start=pd.Timestamp('2015-09-29T11:34:44-0700'),
                          periods=2, freq=freq)
    df = pd.DataFrame([['A', 10], ['B', 15]], columns=[
        'metric', 'values'
    ], index=index)
    result = df.groupby([pd.Grouper(level=0, freq=freq), 'metric']).mean()
    expected = df.set_index([df.index, 'metric'])

    assert_frame_equal(result, expected)
  864. def test_groupby_one_row():
  865. # GH 11741
  866. msg = r"^'Z'$"
  867. df1 = pd.DataFrame(np.random.randn(1, 4), columns=list('ABCD'))
  868. with pytest.raises(KeyError, match=msg):
  869. df1.groupby('Z')
  870. df2 = pd.DataFrame(np.random.randn(2, 4), columns=list('ABCD'))
  871. with pytest.raises(KeyError, match=msg):
  872. df2.groupby('Z')
def test_groupby_nat_exclude():
    # GH 6992
    # NaT / NaN group keys are excluded from the groups entirely
    df = pd.DataFrame(
        {'values': np.random.randn(8),
         'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp(
             '2013-02-01'), np.nan, pd.Timestamp('2013-02-01'), np.nan,
             pd.Timestamp('2013-01-01')],
         'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']})
    grouped = df.groupby('dt')

    # only the two non-NaT timestamps form groups
    expected = [pd.Index([1, 7]), pd.Index([3, 5])]
    keys = sorted(grouped.groups.keys())
    assert len(keys) == 2
    for k, e in zip(keys, expected):
        # grouped.groups keys are np.datetime64 with system tz
        # not to be affected by tz, only compare values
        tm.assert_index_equal(grouped.groups[k], e)

    # confirm obj is not filtered
    tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
    assert grouped.ngroups == 2

    expected = {
        Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64),
        Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64)
    }

    for k in grouped.indices:
        tm.assert_numpy_array_equal(grouped.indices[k], expected[k])

    tm.assert_frame_equal(
        grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]])
    tm.assert_frame_equal(
        grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]])

    # NaT itself is never a retrievable group
    with pytest.raises(KeyError, match=r"^NaT$"):
        grouped.get_group(pd.NaT)

    # all-missing keys produce an empty grouping
    nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan],
                        'nat': [pd.NaT, pd.NaT, pd.NaT]})
    assert nan_df['nan'].dtype == 'float64'
    assert nan_df['nat'].dtype == 'datetime64[ns]'

    for key in ['nan', 'nat']:
        grouped = nan_df.groupby(key)
        assert grouped.groups == {}
        assert grouped.ngroups == 0
        assert grouped.indices == {}
        with pytest.raises(KeyError, match=r"^nan$"):
            grouped.get_group(np.nan)
        with pytest.raises(KeyError, match=r"^NaT$"):
            grouped.get_group(pd.NaT)
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
def test_sparse_friendly(df):
    # Sparse frames/series should support the basic groupby entry points.
    sdf = df[['C', 'D']].to_sparse()
    panel = tm.makePanel()
    tm.add_nans(panel)

    def _check_work(gp):
        # smoke-test the common groupby operations
        gp.mean()
        gp.agg(np.mean)
        dict(iter(gp))

    # it works!
    _check_work(sdf.groupby(lambda x: x // 2))
    _check_work(sdf['C'].groupby(lambda x: x // 2))
    _check_work(sdf.groupby(df['A']))

    # do this someday
    # _check_work(panel.groupby(lambda x: x.month, axis=1))
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
def test_panel_groupby():
    # Panel groupby along each of its three axes (items/major/minor).
    panel = tm.makePanel()
    tm.add_nans(panel)
    grouped = panel.groupby({'ItemA': 0, 'ItemB': 0, 'ItemC': 1},
                            axis='items')
    agged = grouped.mean()
    agged2 = grouped.agg(lambda x: x.mean('items'))

    tm.assert_panel_equal(agged, agged2)

    tm.assert_index_equal(agged.items, Index([0, 1]))

    # group the major axis by calendar month
    grouped = panel.groupby(lambda x: x.month, axis='major')
    agged = grouped.mean()

    exp = Index(sorted(list(set(panel.major_axis.month))))
    tm.assert_index_equal(agged.major_axis, exp)

    grouped = panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
                            axis='minor')
    agged = grouped.mean()
    tm.assert_index_equal(agged.minor_axis, Index([0, 1]))
def test_groupby_2d_malformed():
    # Mixed object/numeric frame: mean() keeps only the numeric columns.
    d = DataFrame(index=lrange(2))
    d['group'] = ['g1', 'g2']
    d['zeros'] = [0, 0]
    d['ones'] = [1, 1]
    d['label'] = ['l1', 'l2']
    tmp = d.groupby(['group']).mean()
    res_values = np.array([[0, 1], [0, 1]], dtype=np.int64)
    tm.assert_index_equal(tmp.columns, Index(['zeros', 'ones']))
    tm.assert_numpy_array_equal(tmp.values, res_values)
  960. def test_int32_overflow():
  961. B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000)
  962. ))
  963. A = np.arange(25000)
  964. df = DataFrame({'A': A,
  965. 'B': B,
  966. 'C': A,
  967. 'D': B,
  968. 'E': np.random.randn(25000)})
  969. left = df.groupby(['A', 'B', 'C', 'D']).sum()
  970. right = df.groupby(['D', 'C', 'B', 'A']).sum()
  971. assert len(left) == len(right)
def test_groupby_sort_multi():
    # sort=True orders the groups lexicographically for any key order
    df = DataFrame({'a': ['foo', 'bar', 'baz'],
                    'b': [3, 2, 1],
                    'c': [0, 1, 2],
                    'd': np.random.randn(3)})

    tups = lmap(tuple, df[['a', 'b', 'c']].values)
    tups = com.asarray_tuplesafe(tups)
    result = df.groupby(['a', 'b', 'c'], sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]])

    tups = lmap(tuple, df[['c', 'a', 'b']].values)
    tups = com.asarray_tuplesafe(tups)
    result = df.groupby(['c', 'a', 'b'], sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups)

    tups = lmap(tuple, df[['b', 'c', 'a']].values)
    tups = com.asarray_tuplesafe(tups)
    result = df.groupby(['b', 'c', 'a'], sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]])

    df = DataFrame({'a': [0, 1, 2, 0, 1, 2],
                    'b': [0, 0, 0, 1, 1, 1],
                    'd': np.random.randn(6)})
    grouped = df.groupby(['a', 'b'])['d']
    result = grouped.sum()

    def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
        # compare against grouping by materialized key tuples
        tups = lmap(tuple, df[keys].values)
        tups = com.asarray_tuplesafe(tups)
        expected = f(df.groupby(tups)[field])
        for k, v in compat.iteritems(expected):
            assert (result[k] == v)

    _check_groupby(df, result, ['a', 'b'], 'd')
  1001. def test_dont_clobber_name_column():
  1002. df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'],
  1003. 'name': ['foo', 'bar', 'baz'] * 2})
  1004. result = df.groupby('key').apply(lambda x: x)
  1005. assert_frame_equal(result, df)
def test_skip_group_keys():
    # group_keys=False: apply results are concatenated without prepending
    # the group keys to the index
    tsf = tm.makeTimeDataFrame()

    grouped = tsf.groupby(lambda x: x.month, group_keys=False)
    result = grouped.apply(lambda x: x.sort_values(by='A')[:3])

    pieces = [group.sort_values(by='A')[:3] for key, group in grouped]

    expected = pd.concat(pieces)
    assert_frame_equal(result, expected)

    grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False)
    result = grouped.apply(lambda x: x.sort_values()[:3])

    pieces = [group.sort_values()[:3] for key, group in grouped]

    expected = pd.concat(pieces)
    assert_series_equal(result, expected)
  1018. def test_no_nonsense_name(frame):
  1019. # GH #995
  1020. s = frame['C'].copy()
  1021. s.name = None
  1022. result = s.groupby(frame['A']).agg(np.sum)
  1023. assert result.name is None
  1024. def test_multifunc_sum_bug():
  1025. # GH #1065
  1026. x = DataFrame(np.arange(9).reshape(3, 3))
  1027. x['test'] = 0
  1028. x['fl'] = [1.3, 1.5, 1.6]
  1029. grouped = x.groupby('test')
  1030. result = grouped.agg({'fl': 'sum', 2: 'size'})
  1031. assert result['fl'].dtype == np.float64
def test_handle_dict_return_value(df):
    # apply() returning a dict should behave like returning a Series
    def f(group):
        return {'max': group.max(), 'min': group.min()}

    def g(group):
        return Series({'max': group.max(), 'min': group.min()})

    result = df.groupby('A')['C'].apply(f)
    expected = df.groupby('A')['C'].apply(g)

    assert isinstance(result, Series)
    assert_series_equal(result, expected)
@pytest.mark.parametrize('grouper', ['A', ['A', 'B']])
def test_set_group_name(df, grouper):
    # every groupby entry point should expose ``group.name`` to the UDF
    def f(group):
        assert group.name is not None
        return group

    def freduce(group):
        assert group.name is not None
        return group.sum()

    def foo(x):
        return freduce(x)

    grouped = df.groupby(grouper)

    # make sure all these work
    grouped.apply(f)
    grouped.aggregate(freduce)
    grouped.aggregate({'C': freduce, 'D': freduce})
    grouped.transform(f)

    grouped['C'].apply(f)
    grouped['C'].aggregate(freduce)
    grouped['C'].aggregate([freduce, foo])
    grouped['C'].transform(f)
def test_group_name_available_in_inference_pass():
    # gh-15062
    # ``group.name`` must be set even during the fast-path inference call
    df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})

    names = []

    def f(group):
        names.append(group.name)
        return group.copy()

    df.groupby('a', sort=False, group_keys=False).apply(f)
    # we expect 2 zeros because we call ``f`` once to see if a faster route
    # can be used.
    expected_names = [0, 0, 1, 2]
    assert names == expected_names
  1073. def test_no_dummy_key_names(df):
  1074. # see gh-1291
  1075. result = df.groupby(df['A'].values).sum()
  1076. assert result.index.name is None
  1077. result = df.groupby([df['A'].values, df['B'].values]).sum()
  1078. assert result.index.names == (None, None)
def test_groupby_sort_multiindex_series():
    # series multiindex groupby sort argument was not being passed through
    # _compress_group_index
    # GH 9444
    index = MultiIndex(levels=[[1, 2], [1, 2]],
                       codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
                       names=['a', 'b'])
    mseries = Series([0, 1, 2, 3, 4, 5], index=index)
    index = MultiIndex(levels=[[1, 2], [1, 2]],
                       codes=[[0, 0, 1], [1, 0, 0]], names=['a', 'b'])
    mseries_result = Series([0, 2, 4], index=index)

    # sort=False keeps first-seen group order; sort=True sorts the index
    result = mseries.groupby(level=['a', 'b'], sort=False).first()
    assert_series_equal(result, mseries_result)
    result = mseries.groupby(level=['a', 'b'], sort=True).first()
    assert_series_equal(result, mseries_result.sort_index())
def test_groupby_reindex_inside_function():
    # Aggregation closures that reindex/filter their group must not leak
    # state between groups.
    periods = 1000
    ind = date_range(start='2012/1/1', freq='5min', periods=periods)
    df = DataFrame({'high': np.arange(
        periods), 'low': np.arange(periods)}, index=ind)

    def agg_before(hour, func, fix=False):
        """
        Run an aggregate func on the subset of data.
        """
        def _func(data):
            d = data.loc[data.index.map(
                lambda x: x.hour < 11)].dropna()
            if fix:
                # touch the first label to exercise the alternate path
                data[data.index[0]]
            if len(d) == 0:
                return None
            return func(d)

        return _func

    def afunc(data):
        # NOTE(review): afunc appears unused in this test
        d = data.select(lambda x: x.hour < 11).dropna()
        return np.max(d)

    grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
    closure_bad = grouped.agg({'high': agg_before(11, np.max)})
    closure_good = grouped.agg({'high': agg_before(11, np.max, True)})

    assert_frame_equal(closure_bad, closure_good)
  1119. def test_groupby_multiindex_missing_pair():
  1120. # GH9049
  1121. df = DataFrame({'group1': ['a', 'a', 'a', 'b'],
  1122. 'group2': ['c', 'c', 'd', 'c'],
  1123. 'value': [1, 1, 1, 5]})
  1124. df = df.set_index(['group1', 'group2'])
  1125. df_grouped = df.groupby(level=['group1', 'group2'], sort=True)
  1126. res = df_grouped.agg('sum')
  1127. idx = MultiIndex.from_tuples(
  1128. [('a', 'c'), ('a', 'd'), ('b', 'c')], names=['group1', 'group2'])
  1129. exp = DataFrame([[2], [1], [5]], index=idx, columns=['value'])
  1130. tm.assert_frame_equal(res, exp)
def test_groupby_multiindex_not_lexsorted():
    # GH 11640

    # define the lexsorted version
    lexsorted_mi = MultiIndex.from_tuples(
        [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
    lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
    assert lexsorted_df.columns.is_lexsorted()

    # define the non-lexsorted version
    not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
                                 data=[[1, 'b1', 'c1', 3],
                                       [1, 'b2', 'c2', 4]])
    not_lexsorted_df = not_lexsorted_df.pivot_table(
        index='a', columns=['b', 'c'], values='d')
    not_lexsorted_df = not_lexsorted_df.reset_index()
    assert not not_lexsorted_df.columns.is_lexsorted()

    # compare the results
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    # grouping over unsorted MultiIndex columns works but warns
    expected = lexsorted_df.groupby('a').mean()
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.groupby('a').mean()
    tm.assert_frame_equal(expected, result)

    # a transforming function should work regardless of sort
    # GH 14776
    df = DataFrame({'x': ['a', 'a', 'b', 'a'],
                    'y': [1, 1, 2, 2],
                    'z': [1, 2, 3, 4]}).set_index(['x', 'y'])
    assert not df.index.is_lexsorted()

    for level in [0, 1, [0, 1]]:
        for sort in [False, True]:
            result = df.groupby(level=level, sort=sort).apply(
                DataFrame.drop_duplicates)
            expected = df
            tm.assert_frame_equal(expected, result)

            result = df.sort_index().groupby(level=level, sort=sort).apply(
                DataFrame.drop_duplicates)
            expected = df.sort_index()
            tm.assert_frame_equal(expected, result)
def test_index_label_overlaps_location():
    # checking we don't have any label/location confusion in the
    # the wake of GH5375
    df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1])
    g = df.groupby(list('ababb'))
    actual = g.filter(lambda x: len(x) > 2)
    # positions 1, 3, 4 belong to the size-3 'b' group
    expected = df.iloc[[1, 3, 4]]
    assert_frame_equal(actual, expected)

    ser = df[0]
    g = ser.groupby(list('ababb'))
    actual = g.filter(lambda x: len(x) > 2)
    expected = ser.take([1, 3, 4])
    assert_series_equal(actual, expected)

    # ... and again, with a generic Index of floats
    df.index = df.index.astype(float)
    g = df.groupby(list('ababb'))
    actual = g.filter(lambda x: len(x) > 2)
    expected = df.iloc[[1, 3, 4]]
    assert_frame_equal(actual, expected)

    ser = df[0]
    g = ser.groupby(list('ababb'))
    actual = g.filter(lambda x: len(x) > 2)
    expected = ser.take([1, 3, 4])
    assert_series_equal(actual, expected)
  1192. def test_transform_doesnt_clobber_ints():
  1193. # GH 7972
  1194. n = 6
  1195. x = np.arange(n)
  1196. df = DataFrame({'a': x // 2, 'b': 2.0 * x, 'c': 3.0 * x})
  1197. df2 = DataFrame({'a': x // 2 * 1.0, 'b': 2.0 * x, 'c': 3.0 * x})
  1198. gb = df.groupby('a')
  1199. result = gb.transform('mean')
  1200. gb2 = df2.groupby('a')
  1201. expected = gb2.transform('mean')
  1202. tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize('sort_column', ['ints', 'floats', 'strings',
                                         ['ints', 'floats'],
                                         ['ints', 'strings']])
@pytest.mark.parametrize('group_column', ['int_groups', 'string_groups',
                                          ['int_groups', 'string_groups']])
def test_groupby_preserves_sort(sort_column, group_column):
    # Test to ensure that groupby always preserves sort order of original
    # object. Issue #8588 and #9651
    df = DataFrame(
        {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3],
         'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'],
         'ints': [8, 7, 4, 5, 2, 9, 1, 1],
         'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
         'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']})

    # Try sorting on different types and with different group types
    df = df.sort_values(by=sort_column)
    g = df.groupby(group_column)

    def test_sort(x):
        # each group must still be in the pre-groupby sort order
        assert_frame_equal(x, x.sort_values(by=sort_column))

    g.apply(test_sort)
def test_group_shift_with_null_key():
    # This test is designed to replicate the segfault in issue #13813.
    n_rows = 1200

    # Generate a moderately large dataframe with occasional missing
    # values in column `B`, and then group by [`A`, `B`]. This should
    # force `-1` in `labels` array of `g.grouper.group_info` exactly
    # at those places, where the group-by key is partially missing.
    df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i)
                    for i in range(n_rows)], dtype=float,
                   columns=["A", "B", "Z"], index=None)
    g = df.groupby(["A", "B"])

    # rows with a missing key are NaN in the shifted output
    expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12
                           else np.nan)
                          for i in range(n_rows)], dtype=float,
                         columns=["Z"], index=None)
    result = g.shift(-1)

    assert_frame_equal(result, expected)
  1240. def test_group_shift_with_fill_value():
  1241. # GH #24128
  1242. n_rows = 24
  1243. df = DataFrame([(i % 12, i % 3, i)
  1244. for i in range(n_rows)], dtype=float,
  1245. columns=["A", "B", "Z"], index=None)
  1246. g = df.groupby(["A", "B"])
  1247. expected = DataFrame([(i + 12 if i < n_rows - 12
  1248. else 0)
  1249. for i in range(n_rows)], dtype=float,
  1250. columns=["Z"], index=None)
  1251. result = g.shift(-1, fill_value=0)[["Z"]]
  1252. assert_frame_equal(result, expected)
def test_pivot_table_values_key_error():
    # This test is designed to replicate the error in issue #14938
    # pivot_table with an unknown ``values`` label raises KeyError
    df = pd.DataFrame({'eventDate':
                       pd.date_range(pd.datetime.today(),
                                     periods=20, freq='M').tolist(),
                       'thename': range(0, 20)})

    df['year'] = df.set_index('eventDate').index.year
    df['month'] = df.set_index('eventDate').index.month

    with pytest.raises(KeyError, match="'badname'"):
        df.reset_index().pivot_table(index='year', columns='month',
                                     values='badname', aggfunc='count')
def test_empty_dataframe_groupby():
    # GH8093
    # aggregating an empty frame yields an empty, correctly-named result
    df = DataFrame(columns=['A', 'B', 'C'])

    result = df.groupby('A').sum()
    expected = DataFrame(columns=['B', 'C'], dtype=np.float64)
    expected.index.name = 'A'

    assert_frame_equal(result, expected)
def test_tuple_warns():
    # https://github.com/pandas-dev/pandas/issues/18314
    # a tuple key warns when interpreted as a list of keys, but not when
    # it matches an actual (tuple) column label
    df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2],
                       'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]})
    with tm.assert_produces_warning(FutureWarning) as w:
        df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean()

    assert "Interpreting tuple 'by' as a list" in str(w[0].message)

    with tm.assert_produces_warning(None):
        df.groupby(('a', 'b')).c.mean()
def test_tuple_warns_unhashable():
    # https://github.com/pandas-dev/pandas/issues/18314
    # a tuple of array-likes also triggers the list-interpretation warning
    business_dates = date_range(start='4/1/2014', end='6/30/2014',
                                freq='B')
    df = DataFrame(1, index=business_dates, columns=['a', 'b'])

    with tm.assert_produces_warning(FutureWarning) as w:
        df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])

    assert "Interpreting tuple 'by' as a list" in str(w[0].message)
  1288. def test_tuple_correct_keyerror():
  1289. # https://github.com/pandas-dev/pandas/issues/18798
  1290. df = pd.DataFrame(1, index=range(3),
  1291. columns=pd.MultiIndex.from_product([[1, 2],
  1292. [3, 4]]))
  1293. with pytest.raises(KeyError, match=r"^\(7, 8\)$"):
  1294. df.groupby((7, 8)).mean()
def test_groupby_agg_ohlc_non_first():
    # GH 21716
    # 'ohlc' must work even when it is not the first aggregation listed
    df = pd.DataFrame([[1], [1]], columns=['foo'],
                      index=pd.date_range('2018-01-01', periods=2, freq='D'))

    expected = pd.DataFrame([
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]
    ], columns=pd.MultiIndex.from_tuples((
        ('foo', 'ohlc', 'open'), ('foo', 'ohlc', 'high'),
        ('foo', 'ohlc', 'low'), ('foo', 'ohlc', 'close'),
        ('foo', 'sum', 'foo'))), index=pd.date_range(
            '2018-01-01', periods=2, freq='D'))

    result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc'])

    tm.assert_frame_equal(result, expected)