test_grouping.py 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838
  1. # -*- coding: utf-8 -*-
  2. """ test where we are determining what we are grouping, or getting groups """
  3. import numpy as np
  4. import pytest
  5. from pandas.compat import long, lrange
  6. import pandas as pd
  7. from pandas import (
  8. CategoricalIndex, DataFrame, Index, MultiIndex, Series, Timestamp, compat,
  9. date_range)
  10. from pandas.core.groupby.grouper import Grouping
  11. import pandas.util.testing as tm
  12. from pandas.util.testing import (
  13. assert_almost_equal, assert_frame_equal, assert_panel_equal,
  14. assert_series_equal)
  15. # selection
  16. # --------------------------------
  17. class TestSelection(object):
  18. def test_select_bad_cols(self):
  19. df = DataFrame([[1, 2]], columns=['A', 'B'])
  20. g = df.groupby('A')
  21. with pytest.raises(KeyError, match='"Columns not found: \'C\'"'):
  22. g[['C']]
  23. with pytest.raises(KeyError, match='^[^A]+$'):
  24. # A should not be referenced as a bad column...
  25. # will have to rethink regex if you change message!
  26. g[['A', 'C']]
  27. def test_groupby_duplicated_column_errormsg(self):
  28. # GH7511
  29. df = DataFrame(columns=['A', 'B', 'A', 'C'],
  30. data=[range(4), range(2, 6), range(0, 8, 2)])
  31. msg = "Grouper for 'A' not 1-dimensional"
  32. with pytest.raises(ValueError, match=msg):
  33. df.groupby('A')
  34. with pytest.raises(ValueError, match=msg):
  35. df.groupby(['A', 'B'])
  36. grouped = df.groupby('B')
  37. c = grouped.count()
  38. assert c.columns.nlevels == 1
  39. assert c.columns.size == 3
  40. def test_column_select_via_attr(self, df):
  41. result = df.groupby('A').C.sum()
  42. expected = df.groupby('A')['C'].sum()
  43. assert_series_equal(result, expected)
  44. df['mean'] = 1.5
  45. result = df.groupby('A').mean()
  46. expected = df.groupby('A').agg(np.mean)
  47. assert_frame_equal(result, expected)
  48. def test_getitem_list_of_columns(self):
  49. df = DataFrame(
  50. {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
  51. 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
  52. 'C': np.random.randn(8),
  53. 'D': np.random.randn(8),
  54. 'E': np.random.randn(8)})
  55. result = df.groupby('A')[['C', 'D']].mean()
  56. result2 = df.groupby('A')['C', 'D'].mean()
  57. result3 = df.groupby('A')[df.columns[2:4]].mean()
  58. expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean()
  59. assert_frame_equal(result, expected)
  60. assert_frame_equal(result2, expected)
  61. assert_frame_equal(result3, expected)
  62. def test_getitem_numeric_column_names(self):
  63. # GH #13731
  64. df = DataFrame({0: list('abcd') * 2,
  65. 2: np.random.randn(8),
  66. 4: np.random.randn(8),
  67. 6: np.random.randn(8)})
  68. result = df.groupby(0)[df.columns[1:3]].mean()
  69. result2 = df.groupby(0)[2, 4].mean()
  70. result3 = df.groupby(0)[[2, 4]].mean()
  71. expected = df.loc[:, [0, 2, 4]].groupby(0).mean()
  72. assert_frame_equal(result, expected)
  73. assert_frame_equal(result2, expected)
  74. assert_frame_equal(result3, expected)
  75. # grouping
  76. # --------------------------------
  77. class TestGrouping():
  78. def test_grouper_index_types(self):
  79. # related GH5375
  80. # groupby misbehaving when using a Floatlike index
  81. df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB'))
  82. for index in [tm.makeFloatIndex, tm.makeStringIndex,
  83. tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex,
  84. tm.makePeriodIndex]:
  85. df.index = index(len(df))
  86. df.groupby(list('abcde')).apply(lambda x: x)
  87. df.index = list(reversed(df.index.tolist()))
  88. df.groupby(list('abcde')).apply(lambda x: x)
  89. def test_grouper_multilevel_freq(self):
  90. # GH 7885
  91. # with level and freq specified in a pd.Grouper
  92. from datetime import date, timedelta
  93. d0 = date.today() - timedelta(days=14)
  94. dates = date_range(d0, date.today())
  95. date_index = pd.MultiIndex.from_product(
  96. [dates, dates], names=['foo', 'bar'])
  97. df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index)
  98. # Check string level
  99. expected = df.reset_index().groupby([pd.Grouper(
  100. key='foo', freq='W'), pd.Grouper(key='bar', freq='W')]).sum()
  101. # reset index changes columns dtype to object
  102. expected.columns = pd.Index([0], dtype='int64')
  103. result = df.groupby([pd.Grouper(level='foo', freq='W'), pd.Grouper(
  104. level='bar', freq='W')]).sum()
  105. assert_frame_equal(result, expected)
  106. # Check integer level
  107. result = df.groupby([pd.Grouper(level=0, freq='W'), pd.Grouper(
  108. level=1, freq='W')]).sum()
  109. assert_frame_equal(result, expected)
  110. def test_grouper_creation_bug(self):
  111. # GH 8795
  112. df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]})
  113. g = df.groupby('A')
  114. expected = g.sum()
  115. g = df.groupby(pd.Grouper(key='A'))
  116. result = g.sum()
  117. assert_frame_equal(result, expected)
  118. result = g.apply(lambda x: x.sum())
  119. assert_frame_equal(result, expected)
  120. g = df.groupby(pd.Grouper(key='A', axis=0))
  121. result = g.sum()
  122. assert_frame_equal(result, expected)
  123. # GH14334
  124. # pd.Grouper(key=...) may be passed in a list
  125. df = DataFrame({'A': [0, 0, 0, 1, 1, 1],
  126. 'B': [1, 1, 2, 2, 3, 3],
  127. 'C': [1, 2, 3, 4, 5, 6]})
  128. # Group by single column
  129. expected = df.groupby('A').sum()
  130. g = df.groupby([pd.Grouper(key='A')])
  131. result = g.sum()
  132. assert_frame_equal(result, expected)
  133. # Group by two columns
  134. # using a combination of strings and Grouper objects
  135. expected = df.groupby(['A', 'B']).sum()
  136. # Group with two Grouper objects
  137. g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')])
  138. result = g.sum()
  139. assert_frame_equal(result, expected)
  140. # Group with a string and a Grouper object
  141. g = df.groupby(['A', pd.Grouper(key='B')])
  142. result = g.sum()
  143. assert_frame_equal(result, expected)
  144. # Group with a Grouper object and a string
  145. g = df.groupby([pd.Grouper(key='A'), 'B'])
  146. result = g.sum()
  147. assert_frame_equal(result, expected)
  148. # GH8866
  149. s = Series(np.arange(8, dtype='int64'),
  150. index=pd.MultiIndex.from_product(
  151. [list('ab'), range(2),
  152. date_range('20130101', periods=2)],
  153. names=['one', 'two', 'three']))
  154. result = s.groupby(pd.Grouper(level='three', freq='M')).sum()
  155. expected = Series([28], index=Index(
  156. [Timestamp('2013-01-31')], freq='M', name='three'))
  157. assert_series_equal(result, expected)
  158. # just specifying a level breaks
  159. result = s.groupby(pd.Grouper(level='one')).sum()
  160. expected = s.groupby(level='one').sum()
  161. assert_series_equal(result, expected)
  162. def test_grouper_column_and_index(self):
  163. # GH 14327
  164. # Grouping a multi-index frame by a column and an index level should
  165. # be equivalent to resetting the index and grouping by two columns
  166. idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3),
  167. ('b', 1), ('b', 2), ('b', 3)])
  168. idx.names = ['outer', 'inner']
  169. df_multi = pd.DataFrame({"A": np.arange(6),
  170. 'B': ['one', 'one', 'two',
  171. 'two', 'one', 'one']},
  172. index=idx)
  173. result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean()
  174. expected = df_multi.reset_index().groupby(['B', 'inner']).mean()
  175. assert_frame_equal(result, expected)
  176. # Test the reverse grouping order
  177. result = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean()
  178. expected = df_multi.reset_index().groupby(['inner', 'B']).mean()
  179. assert_frame_equal(result, expected)
  180. # Grouping a single-index frame by a column and the index should
  181. # be equivalent to resetting the index and grouping by two columns
  182. df_single = df_multi.reset_index('outer')
  183. result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean()
  184. expected = df_single.reset_index().groupby(['B', 'inner']).mean()
  185. assert_frame_equal(result, expected)
  186. # Test the reverse grouping order
  187. result = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean()
  188. expected = df_single.reset_index().groupby(['inner', 'B']).mean()
  189. assert_frame_equal(result, expected)
  190. def test_groupby_levels_and_columns(self):
  191. # GH9344, GH9049
  192. idx_names = ['x', 'y']
  193. idx = pd.MultiIndex.from_tuples(
  194. [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names)
  195. df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx)
  196. by_levels = df.groupby(level=idx_names).mean()
  197. # reset_index changes columns dtype to object
  198. by_columns = df.reset_index().groupby(idx_names).mean()
  199. tm.assert_frame_equal(by_levels, by_columns, check_column_type=False)
  200. by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
  201. tm.assert_frame_equal(by_levels, by_columns)
  202. def test_groupby_categorical_index_and_columns(self, observed):
  203. # GH18432
  204. columns = ['A', 'B', 'A', 'B']
  205. categories = ['B', 'A']
  206. data = np.ones((5, 4), int)
  207. cat_columns = CategoricalIndex(columns,
  208. categories=categories,
  209. ordered=True)
  210. df = DataFrame(data=data, columns=cat_columns)
  211. result = df.groupby(axis=1, level=0, observed=observed).sum()
  212. expected_data = 2 * np.ones((5, 2), int)
  213. if observed:
  214. # if we are not-observed we undergo a reindex
  215. # so need to adjust the output as our expected sets us up
  216. # to be non-observed
  217. expected_columns = CategoricalIndex(['A', 'B'],
  218. categories=categories,
  219. ordered=True)
  220. else:
  221. expected_columns = CategoricalIndex(categories,
  222. categories=categories,
  223. ordered=True)
  224. expected = DataFrame(data=expected_data, columns=expected_columns)
  225. assert_frame_equal(result, expected)
  226. # test transposed version
  227. df = DataFrame(data.T, index=cat_columns)
  228. result = df.groupby(axis=0, level=0, observed=observed).sum()
  229. expected = DataFrame(data=expected_data.T, index=expected_columns)
  230. assert_frame_equal(result, expected)
  231. def test_grouper_getting_correct_binner(self):
  232. # GH 10063
  233. # using a non-time-based grouper and a time-based grouper
  234. # and specifying levels
  235. df = DataFrame({'A': 1}, index=pd.MultiIndex.from_product(
  236. [list('ab'), date_range('20130101', periods=80)], names=['one',
  237. 'two']))
  238. result = df.groupby([pd.Grouper(level='one'), pd.Grouper(
  239. level='two', freq='M')]).sum()
  240. expected = DataFrame({'A': [31, 28, 21, 31, 28, 21]},
  241. index=MultiIndex.from_product(
  242. [list('ab'),
  243. date_range('20130101', freq='M', periods=3)],
  244. names=['one', 'two']))
  245. assert_frame_equal(result, expected)
  246. def test_grouper_iter(self, df):
  247. assert sorted(df.groupby('A').grouper) == ['bar', 'foo']
  248. def test_empty_groups(self, df):
  249. # see gh-1048
  250. with pytest.raises(ValueError, match="No group keys passed!"):
  251. df.groupby([])
  252. def test_groupby_grouper(self, df):
  253. grouped = df.groupby('A')
  254. result = df.groupby(grouped.grouper).mean()
  255. expected = grouped.mean()
  256. tm.assert_frame_equal(result, expected)
  257. def test_groupby_dict_mapping(self):
  258. # GH #679
  259. from pandas import Series
  260. s = Series({'T1': 5})
  261. result = s.groupby({'T1': 'T2'}).agg(sum)
  262. expected = s.groupby(['T2']).agg(sum)
  263. assert_series_equal(result, expected)
  264. s = Series([1., 2., 3., 4.], index=list('abcd'))
  265. mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1}
  266. result = s.groupby(mapping).mean()
  267. result2 = s.groupby(mapping).agg(np.mean)
  268. expected = s.groupby([0, 0, 1, 1]).mean()
  269. expected2 = s.groupby([0, 0, 1, 1]).mean()
  270. assert_series_equal(result, expected)
  271. assert_series_equal(result, result2)
  272. assert_series_equal(result, expected2)
  273. def test_groupby_grouper_f_sanity_checked(self):
  274. dates = date_range('01-Jan-2013', periods=12, freq='MS')
  275. ts = Series(np.random.randn(12), index=dates)
  276. # GH3035
  277. # index.map is used to apply grouper to the index
  278. # if it fails on the elements, map tries it on the entire index as
  279. # a sequence. That can yield invalid results that cause trouble
  280. # down the line.
  281. # the surprise comes from using key[0:6] rather then str(key)[0:6]
  282. # when the elements are Timestamp.
  283. # the result is Index[0:6], very confusing.
  284. msg = r"Grouper result violates len\(labels\) == len\(data\)"
  285. with pytest.raises(AssertionError, match=msg):
  286. ts.groupby(lambda key: key[0:6])
  287. def test_grouping_error_on_multidim_input(self, df):
  288. msg = ("Grouper for '<class 'pandas.core.frame.DataFrame'>'"
  289. " not 1-dimensional")
  290. with pytest.raises(ValueError, match=msg):
  291. Grouping(df.index, df[['A', 'A']])
  292. def test_multiindex_passthru(self):
  293. # GH 7997
  294. # regression from 0.14.1
  295. df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
  296. df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)])
  297. result = df.groupby(axis=1, level=[0, 1]).first()
  298. assert_frame_equal(result, df)
  299. def test_multiindex_negative_level(self, mframe):
  300. # GH 13901
  301. result = mframe.groupby(level=-1).sum()
  302. expected = mframe.groupby(level='second').sum()
  303. assert_frame_equal(result, expected)
  304. result = mframe.groupby(level=-2).sum()
  305. expected = mframe.groupby(level='first').sum()
  306. assert_frame_equal(result, expected)
  307. result = mframe.groupby(level=[-2, -1]).sum()
  308. expected = mframe
  309. assert_frame_equal(result, expected)
  310. result = mframe.groupby(level=[-1, 'first']).sum()
  311. expected = mframe.groupby(level=['second', 'first']).sum()
  312. assert_frame_equal(result, expected)
  313. def test_multifunc_select_col_integer_cols(self, df):
  314. df.columns = np.arange(len(df.columns))
  315. # it works!
  316. df.groupby(1, as_index=False)[2].agg({'Q': np.mean})
  317. def test_multiindex_columns_empty_level(self):
  318. lst = [['count', 'values'], ['to filter', '']]
  319. midx = MultiIndex.from_tuples(lst)
  320. df = DataFrame([[long(1), 'A']], columns=midx)
  321. grouped = df.groupby('to filter').groups
  322. assert grouped['A'] == [0]
  323. grouped = df.groupby([('to filter', '')]).groups
  324. assert grouped['A'] == [0]
  325. df = DataFrame([[long(1), 'A'], [long(2), 'B']], columns=midx)
  326. expected = df.groupby('to filter').groups
  327. result = df.groupby([('to filter', '')]).groups
  328. assert result == expected
  329. df = DataFrame([[long(1), 'A'], [long(2), 'A']], columns=midx)
  330. expected = df.groupby('to filter').groups
  331. result = df.groupby([('to filter', '')]).groups
  332. tm.assert_dict_equal(result, expected)
  333. def test_groupby_multiindex_tuple(self):
  334. # GH 17979
  335. df = pd.DataFrame([[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
  336. columns=pd.MultiIndex.from_arrays(
  337. [['a', 'b', 'b', 'c'],
  338. [1, 1, 2, 2]]))
  339. expected = df.groupby([('b', 1)]).groups
  340. result = df.groupby(('b', 1)).groups
  341. tm.assert_dict_equal(expected, result)
  342. df2 = pd.DataFrame(df.values,
  343. columns=pd.MultiIndex.from_arrays(
  344. [['a', 'b', 'b', 'c'],
  345. ['d', 'd', 'e', 'e']]))
  346. expected = df2.groupby([('b', 'd')]).groups
  347. result = df.groupby(('b', 1)).groups
  348. tm.assert_dict_equal(expected, result)
  349. df3 = pd.DataFrame(df.values,
  350. columns=[('a', 'd'), ('b', 'd'), ('b', 'e'), 'c'])
  351. expected = df3.groupby([('b', 'd')]).groups
  352. result = df.groupby(('b', 1)).groups
  353. tm.assert_dict_equal(expected, result)
  354. @pytest.mark.parametrize('sort', [True, False])
  355. def test_groupby_level(self, sort, mframe, df):
  356. # GH 17537
  357. frame = mframe
  358. deleveled = frame.reset_index()
  359. result0 = frame.groupby(level=0, sort=sort).sum()
  360. result1 = frame.groupby(level=1, sort=sort).sum()
  361. expected0 = frame.groupby(deleveled['first'].values, sort=sort).sum()
  362. expected1 = frame.groupby(deleveled['second'].values, sort=sort).sum()
  363. expected0.index.name = 'first'
  364. expected1.index.name = 'second'
  365. assert result0.index.name == 'first'
  366. assert result1.index.name == 'second'
  367. assert_frame_equal(result0, expected0)
  368. assert_frame_equal(result1, expected1)
  369. assert result0.index.name == frame.index.names[0]
  370. assert result1.index.name == frame.index.names[1]
  371. # groupby level name
  372. result0 = frame.groupby(level='first', sort=sort).sum()
  373. result1 = frame.groupby(level='second', sort=sort).sum()
  374. assert_frame_equal(result0, expected0)
  375. assert_frame_equal(result1, expected1)
  376. # axis=1
  377. result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum()
  378. result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum()
  379. assert_frame_equal(result0, expected0.T)
  380. assert_frame_equal(result1, expected1.T)
  381. # raise exception for non-MultiIndex
  382. msg = "level > 0 or level < -1 only valid with MultiIndex"
  383. with pytest.raises(ValueError, match=msg):
  384. df.groupby(level=1)
  385. def test_groupby_level_index_names(self):
  386. # GH4014 this used to raise ValueError since 'exp'>1 (in py2)
  387. df = DataFrame({'exp': ['A'] * 3 + ['B'] * 3,
  388. 'var1': lrange(6), }).set_index('exp')
  389. df.groupby(level='exp')
  390. msg = "level name foo is not the name of the index"
  391. with pytest.raises(ValueError, match=msg):
  392. df.groupby(level='foo')
  393. @pytest.mark.parametrize('sort', [True, False])
  394. def test_groupby_level_with_nas(self, sort):
  395. # GH 17537
  396. index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
  397. codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1,
  398. 2, 3]])
  399. # factorizing doesn't confuse things
  400. s = Series(np.arange(8.), index=index)
  401. result = s.groupby(level=0, sort=sort).sum()
  402. expected = Series([6., 22.], index=[0, 1])
  403. assert_series_equal(result, expected)
  404. index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
  405. codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0,
  406. 1, 2, 3]])
  407. # factorizing doesn't confuse things
  408. s = Series(np.arange(8.), index=index)
  409. result = s.groupby(level=0, sort=sort).sum()
  410. expected = Series([6., 18.], index=[0.0, 1.0])
  411. assert_series_equal(result, expected)
  412. def test_groupby_args(self, mframe):
  413. # PR8618 and issue 8015
  414. frame = mframe
  415. msg = "You have to supply one of 'by' and 'level'"
  416. with pytest.raises(TypeError, match=msg):
  417. frame.groupby()
  418. msg = "You have to supply one of 'by' and 'level'"
  419. with pytest.raises(TypeError, match=msg):
  420. frame.groupby(by=None, level=None)
  421. @pytest.mark.parametrize('sort,labels', [
  422. [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
  423. [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]]
  424. ])
  425. def test_level_preserve_order(self, sort, labels, mframe):
  426. # GH 17537
  427. grouped = mframe.groupby(level=0, sort=sort)
  428. exp_labels = np.array(labels, np.intp)
  429. assert_almost_equal(grouped.grouper.labels[0], exp_labels)
  430. def test_grouping_labels(self, mframe):
  431. grouped = mframe.groupby(mframe.index.get_level_values(0))
  432. exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
  433. assert_almost_equal(grouped.grouper.labels[0], exp_labels)
  434. def test_list_grouper_with_nat(self):
  435. # GH 14715
  436. df = pd.DataFrame({'date': pd.date_range('1/1/2011',
  437. periods=365, freq='D')})
  438. df.iloc[-1] = pd.NaT
  439. grouper = pd.Grouper(key='date', freq='AS')
  440. # Grouper in a list grouping
  441. result = df.groupby([grouper])
  442. expected = {pd.Timestamp('2011-01-01'): pd.Index(list(range(364)))}
  443. tm.assert_dict_equal(result.groups, expected)
  444. # Test case without a list
  445. result = df.groupby(grouper)
  446. expected = {pd.Timestamp('2011-01-01'): 365}
  447. tm.assert_dict_equal(result.groups, expected)
  448. # get_group
  449. # --------------------------------
  450. class TestGetGroup():
  451. @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
  452. def test_get_group(self):
  453. wp = tm.makePanel()
  454. grouped = wp.groupby(lambda x: x.month, axis='major')
  455. gp = grouped.get_group(1)
  456. expected = wp.reindex(
  457. major=[x for x in wp.major_axis if x.month == 1])
  458. assert_panel_equal(gp, expected)
  459. # GH 5267
  460. # be datelike friendly
  461. df = DataFrame({'DATE': pd.to_datetime(
  462. ['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', '11-Oct-2013',
  463. '11-Oct-2013', '11-Oct-2013']),
  464. 'label': ['foo', 'foo', 'bar', 'foo', 'foo', 'bar'],
  465. 'VAL': [1, 2, 3, 4, 5, 6]})
  466. g = df.groupby('DATE')
  467. key = list(g.groups)[0]
  468. result1 = g.get_group(key)
  469. result2 = g.get_group(Timestamp(key).to_pydatetime())
  470. result3 = g.get_group(str(Timestamp(key)))
  471. assert_frame_equal(result1, result2)
  472. assert_frame_equal(result1, result3)
  473. g = df.groupby(['DATE', 'label'])
  474. key = list(g.groups)[0]
  475. result1 = g.get_group(key)
  476. result2 = g.get_group((Timestamp(key[0]).to_pydatetime(), key[1]))
  477. result3 = g.get_group((str(Timestamp(key[0])), key[1]))
  478. assert_frame_equal(result1, result2)
  479. assert_frame_equal(result1, result3)
  480. # must pass a same-length tuple with multiple keys
  481. msg = "must supply a tuple to get_group with multiple grouping keys"
  482. with pytest.raises(ValueError, match=msg):
  483. g.get_group('foo')
  484. with pytest.raises(ValueError, match=msg):
  485. g.get_group(('foo'))
  486. msg = ("must supply a same-length tuple to get_group with multiple"
  487. " grouping keys")
  488. with pytest.raises(ValueError, match=msg):
  489. g.get_group(('foo', 'bar', 'baz'))
  490. def test_get_group_empty_bins(self, observed):
  491. d = pd.DataFrame([3, 1, 7, 6])
  492. bins = [0, 5, 10, 15]
  493. g = d.groupby(pd.cut(d[0], bins), observed=observed)
  494. # TODO: should prob allow a str of Interval work as well
  495. # IOW '(0, 5]'
  496. result = g.get_group(pd.Interval(0, 5))
  497. expected = DataFrame([3, 1], index=[0, 1])
  498. assert_frame_equal(result, expected)
  499. msg = r"Interval\(10, 15, closed='right'\)"
  500. with pytest.raises(KeyError, match=msg):
  501. g.get_group(pd.Interval(10, 15))
  502. def test_get_group_grouped_by_tuple(self):
  503. # GH 8121
  504. df = DataFrame([[(1, ), (1, 2), (1, ), (1, 2)]], index=['ids']).T
  505. gr = df.groupby('ids')
  506. expected = DataFrame({'ids': [(1, ), (1, )]}, index=[0, 2])
  507. result = gr.get_group((1, ))
  508. assert_frame_equal(result, expected)
  509. dt = pd.to_datetime(['2010-01-01', '2010-01-02', '2010-01-01',
  510. '2010-01-02'])
  511. df = DataFrame({'ids': [(x, ) for x in dt]})
  512. gr = df.groupby('ids')
  513. result = gr.get_group(('2010-01-01', ))
  514. expected = DataFrame({'ids': [(dt[0], ), (dt[0], )]}, index=[0, 2])
  515. assert_frame_equal(result, expected)
  516. def test_groupby_with_empty(self):
  517. index = pd.DatetimeIndex(())
  518. data = ()
  519. series = pd.Series(data, index)
  520. grouper = pd.Grouper(freq='D')
  521. grouped = series.groupby(grouper)
  522. assert next(iter(grouped), None) is None
  523. def test_groupby_with_single_column(self):
  524. df = pd.DataFrame({'a': list('abssbab')})
  525. tm.assert_frame_equal(df.groupby('a').get_group('a'), df.iloc[[0, 5]])
  526. # GH 13530
  527. exp = pd.DataFrame([], index=pd.Index(['a', 'b', 's'], name='a'))
  528. tm.assert_frame_equal(df.groupby('a').count(), exp)
  529. tm.assert_frame_equal(df.groupby('a').sum(), exp)
  530. tm.assert_frame_equal(df.groupby('a').nth(1), exp)
  531. def test_gb_key_len_equal_axis_len(self):
  532. # GH16843
  533. # test ensures that index and column keys are recognized correctly
  534. # when number of keys equals axis length of groupby
  535. df = pd.DataFrame([['foo', 'bar', 'B', 1],
  536. ['foo', 'bar', 'B', 2],
  537. ['foo', 'baz', 'C', 3]],
  538. columns=['first', 'second', 'third', 'one'])
  539. df = df.set_index(['first', 'second'])
  540. df = df.groupby(['first', 'second', 'third']).size()
  541. assert df.loc[('foo', 'bar', 'B')] == 2
  542. assert df.loc[('foo', 'baz', 'C')] == 1
  543. # groups & iteration
  544. # --------------------------------
  545. class TestIteration():
  546. def test_groups(self, df):
  547. grouped = df.groupby(['A'])
  548. groups = grouped.groups
  549. assert groups is grouped.groups # caching works
  550. for k, v in compat.iteritems(grouped.groups):
  551. assert (df.loc[v]['A'] == k).all()
  552. grouped = df.groupby(['A', 'B'])
  553. groups = grouped.groups
  554. assert groups is grouped.groups # caching works
  555. for k, v in compat.iteritems(grouped.groups):
  556. assert (df.loc[v]['A'] == k[0]).all()
  557. assert (df.loc[v]['B'] == k[1]).all()
  558. def test_grouping_is_iterable(self, tsframe):
  559. # this code path isn't used anywhere else
  560. # not sure it's useful
  561. grouped = tsframe.groupby([lambda x: x.weekday(), lambda x: x.year])
  562. # test it works
  563. for g in grouped.grouper.groupings[0]:
  564. pass
  565. def test_multi_iter(self):
  566. s = Series(np.arange(6))
  567. k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b'])
  568. k2 = np.array(['1', '2', '1', '2', '1', '2'])
  569. grouped = s.groupby([k1, k2])
  570. iterated = list(grouped)
  571. expected = [('a', '1', s[[0, 2]]), ('a', '2', s[[1]]),
  572. ('b', '1', s[[4]]), ('b', '2', s[[3, 5]])]
  573. for i, ((one, two), three) in enumerate(iterated):
  574. e1, e2, e3 = expected[i]
  575. assert e1 == one
  576. assert e2 == two
  577. assert_series_equal(three, e3)
  578. def test_multi_iter_frame(self, three_group):
  579. k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
  580. k2 = np.array(['1', '2', '1', '2', '1', '2'])
  581. df = DataFrame({'v1': np.random.randn(6),
  582. 'v2': np.random.randn(6),
  583. 'k1': k1, 'k2': k2},
  584. index=['one', 'two', 'three', 'four', 'five', 'six'])
  585. grouped = df.groupby(['k1', 'k2'])
  586. # things get sorted!
  587. iterated = list(grouped)
  588. idx = df.index
  589. expected = [('a', '1', df.loc[idx[[4]]]),
  590. ('a', '2', df.loc[idx[[3, 5]]]),
  591. ('b', '1', df.loc[idx[[0, 2]]]),
  592. ('b', '2', df.loc[idx[[1]]])]
  593. for i, ((one, two), three) in enumerate(iterated):
  594. e1, e2, e3 = expected[i]
  595. assert e1 == one
  596. assert e2 == two
  597. assert_frame_equal(three, e3)
  598. # don't iterate through groups with no data
  599. df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
  600. df['k2'] = np.array(['1', '1', '1', '2', '2', '2'])
  601. grouped = df.groupby(['k1', 'k2'])
  602. groups = {key: gp for key, gp in grouped}
  603. assert len(groups) == 2
  604. # axis = 1
  605. three_levels = three_group.groupby(['A', 'B', 'C']).mean()
  606. grouped = three_levels.T.groupby(axis=1, level=(1, 2))
  607. for key, group in grouped:
  608. pass
  609. @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
  610. def test_multi_iter_panel(self):
  611. wp = tm.makePanel()
  612. grouped = wp.groupby([lambda x: x.month, lambda x: x.weekday()],
  613. axis=1)
  614. for (month, wd), group in grouped:
  615. exp_axis = [x
  616. for x in wp.major_axis
  617. if x.month == month and x.weekday() == wd]
  618. expected = wp.reindex(major=exp_axis)
  619. assert_panel_equal(group, expected)
  620. def test_dictify(self, df):
  621. dict(iter(df.groupby('A')))
  622. dict(iter(df.groupby(['A', 'B'])))
  623. dict(iter(df['C'].groupby(df['A'])))
  624. dict(iter(df['C'].groupby([df['A'], df['B']])))
  625. dict(iter(df.groupby('A')['C']))
  626. dict(iter(df.groupby(['A', 'B'])['C']))
  627. def test_groupby_with_small_elem(self):
  628. # GH 8542
  629. # length=2
  630. df = pd.DataFrame({'event': ['start', 'start'],
  631. 'change': [1234, 5678]},
  632. index=pd.DatetimeIndex(['2014-09-10', '2013-10-10']))
  633. grouped = df.groupby([pd.Grouper(freq='M'), 'event'])
  634. assert len(grouped.groups) == 2
  635. assert grouped.ngroups == 2
  636. assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups
  637. assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups
  638. res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start'))
  639. tm.assert_frame_equal(res, df.iloc[[0], :])
  640. res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start'))
  641. tm.assert_frame_equal(res, df.iloc[[1], :])
  642. df = pd.DataFrame({'event': ['start', 'start', 'start'],
  643. 'change': [1234, 5678, 9123]},
  644. index=pd.DatetimeIndex(['2014-09-10', '2013-10-10',
  645. '2014-09-15']))
  646. grouped = df.groupby([pd.Grouper(freq='M'), 'event'])
  647. assert len(grouped.groups) == 2
  648. assert grouped.ngroups == 2
  649. assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups
  650. assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups
  651. res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start'))
  652. tm.assert_frame_equal(res, df.iloc[[0, 2], :])
  653. res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start'))
  654. tm.assert_frame_equal(res, df.iloc[[1], :])
  655. # length=3
  656. df = pd.DataFrame({'event': ['start', 'start', 'start'],
  657. 'change': [1234, 5678, 9123]},
  658. index=pd.DatetimeIndex(['2014-09-10', '2013-10-10',
  659. '2014-08-05']))
  660. grouped = df.groupby([pd.Grouper(freq='M'), 'event'])
  661. assert len(grouped.groups) == 3
  662. assert grouped.ngroups == 3
  663. assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups
  664. assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups
  665. assert (pd.Timestamp('2014-08-31'), 'start') in grouped.groups
  666. res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start'))
  667. tm.assert_frame_equal(res, df.iloc[[0], :])
  668. res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start'))
  669. tm.assert_frame_equal(res, df.iloc[[1], :])
  670. res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start'))
  671. tm.assert_frame_equal(res, df.iloc[[2], :])
  672. def test_grouping_string_repr(self):
  673. # GH 13394
  674. mi = MultiIndex.from_arrays([list("AAB"), list("aba")])
  675. df = DataFrame([[1, 2, 3]], columns=mi)
  676. gr = df.groupby(df[('A', 'a')])
  677. result = gr.grouper.groupings[0].__repr__()
  678. expected = "Grouping(('A', 'a'))"
  679. assert result == expected