test_categorical.py 34 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936
  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function
  3. from datetime import datetime
  4. import numpy as np
  5. import pytest
  6. from pandas.compat import PY37
  7. import pandas as pd
  8. from pandas import (
  9. Categorical, CategoricalIndex, DataFrame, Index, MultiIndex, Series, qcut)
  10. import pandas.util.testing as tm
  11. from pandas.util.testing import (
  12. assert_equal, assert_frame_equal, assert_series_equal)
  13. def cartesian_product_for_groupers(result, args, names):
  14. """ Reindex to a cartesian production for the groupers,
  15. preserving the nature (Categorical) of each grouper """
  16. def f(a):
  17. if isinstance(a, (CategoricalIndex, Categorical)):
  18. categories = a.categories
  19. a = Categorical.from_codes(np.arange(len(categories)),
  20. categories=categories,
  21. ordered=a.ordered)
  22. return a
  23. index = pd.MultiIndex.from_product(map(f, args), names=names)
  24. return result.reindex(index).sort_index()
  25. def test_apply_use_categorical_name(df):
  26. cats = qcut(df.C, 4)
  27. def get_stats(group):
  28. return {'min': group.min(),
  29. 'max': group.max(),
  30. 'count': group.count(),
  31. 'mean': group.mean()}
  32. result = df.groupby(cats, observed=False).D.apply(get_stats)
  33. assert result.index.names[0] == 'C'
  34. def test_basic():
  35. cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
  36. categories=["a", "b", "c", "d"], ordered=True)
  37. data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
  38. exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
  39. expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
  40. result = data.groupby("b", observed=False).mean()
  41. tm.assert_frame_equal(result, expected)
  42. cat1 = Categorical(["a", "a", "b", "b"],
  43. categories=["a", "b", "z"], ordered=True)
  44. cat2 = Categorical(["c", "d", "c", "d"],
  45. categories=["c", "d", "y"], ordered=True)
  46. df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
  47. # single grouper
  48. gb = df.groupby("A", observed=False)
  49. exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
  50. expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
  51. result = gb.sum()
  52. tm.assert_frame_equal(result, expected)
  53. # GH 8623
  54. x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
  55. [1, 'John P. Doe']],
  56. columns=['person_id', 'person_name'])
  57. x['person_name'] = Categorical(x.person_name)
  58. g = x.groupby(['person_id'], observed=False)
  59. result = g.transform(lambda x: x)
  60. tm.assert_frame_equal(result, x[['person_name']])
  61. result = x.drop_duplicates('person_name')
  62. expected = x.iloc[[0, 1]]
  63. tm.assert_frame_equal(result, expected)
  64. def f(x):
  65. return x.drop_duplicates('person_name').iloc[0]
  66. result = g.apply(f)
  67. expected = x.iloc[[0, 1]].copy()
  68. expected.index = Index([1, 2], name='person_id')
  69. expected['person_name'] = expected['person_name'].astype('object')
  70. tm.assert_frame_equal(result, expected)
  71. # GH 9921
  72. # Monotonic
  73. df = DataFrame({"a": [5, 15, 25]})
  74. c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])
  75. result = df.a.groupby(c, observed=False).transform(sum)
  76. tm.assert_series_equal(result, df['a'])
  77. tm.assert_series_equal(
  78. df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
  79. df['a'])
  80. tm.assert_frame_equal(
  81. df.groupby(c, observed=False).transform(sum),
  82. df[['a']])
  83. tm.assert_frame_equal(
  84. df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
  85. df[['a']])
  86. # Filter
  87. tm.assert_series_equal(
  88. df.a.groupby(c, observed=False).filter(np.all),
  89. df['a'])
  90. tm.assert_frame_equal(
  91. df.groupby(c, observed=False).filter(np.all),
  92. df)
  93. # Non-monotonic
  94. df = DataFrame({"a": [5, 15, 25, -5]})
  95. c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])
  96. result = df.a.groupby(c, observed=False).transform(sum)
  97. tm.assert_series_equal(result, df['a'])
  98. tm.assert_series_equal(
  99. df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
  100. df['a'])
  101. tm.assert_frame_equal(
  102. df.groupby(c, observed=False).transform(sum),
  103. df[['a']])
  104. tm.assert_frame_equal(
  105. df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
  106. df[['a']])
  107. # GH 9603
  108. df = DataFrame({'a': [1, 0, 0, 0]})
  109. c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
  110. result = df.groupby(c, observed=False).apply(len)
  111. exp_index = CategoricalIndex(
  112. c.values.categories, ordered=c.values.ordered)
  113. expected = Series([1, 0, 0, 0], index=exp_index)
  114. expected.index.name = 'a'
  115. tm.assert_series_equal(result, expected)
  116. # more basic
  117. levels = ['foo', 'bar', 'baz', 'qux']
  118. codes = np.random.randint(0, 4, size=100)
  119. cats = Categorical.from_codes(codes, levels, ordered=True)
  120. data = DataFrame(np.random.randn(100, 4))
  121. result = data.groupby(cats, observed=False).mean()
  122. expected = data.groupby(np.asarray(cats), observed=False).mean()
  123. exp_idx = CategoricalIndex(levels, categories=cats.categories,
  124. ordered=True)
  125. expected = expected.reindex(exp_idx)
  126. assert_frame_equal(result, expected)
  127. grouped = data.groupby(cats, observed=False)
  128. desc_result = grouped.describe()
  129. idx = cats.codes.argsort()
  130. ord_labels = np.asarray(cats).take(idx)
  131. ord_data = data.take(idx)
  132. exp_cats = Categorical(ord_labels, ordered=True,
  133. categories=['foo', 'bar', 'baz', 'qux'])
  134. expected = ord_data.groupby(
  135. exp_cats, sort=False, observed=False).describe()
  136. assert_frame_equal(desc_result, expected)
  137. # GH 10460
  138. expc = Categorical.from_codes(np.arange(4).repeat(8),
  139. levels, ordered=True)
  140. exp = CategoricalIndex(expc)
  141. tm.assert_index_equal((desc_result.stack().index
  142. .get_level_values(0)), exp)
  143. exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
  144. '75%', 'max'] * 4)
  145. tm.assert_index_equal((desc_result.stack().index
  146. .get_level_values(1)), exp)
  147. def test_level_get_group(observed):
  148. # GH15155
  149. df = DataFrame(data=np.arange(2, 22, 2),
  150. index=MultiIndex(
  151. levels=[pd.CategoricalIndex(["a", "b"]), range(10)],
  152. codes=[[0] * 5 + [1] * 5, range(10)],
  153. names=["Index1", "Index2"]))
  154. g = df.groupby(level=["Index1"], observed=observed)
  155. # expected should equal test.loc[["a"]]
  156. # GH15166
  157. expected = DataFrame(data=np.arange(2, 12, 2),
  158. index=pd.MultiIndex(levels=[pd.CategoricalIndex(
  159. ["a", "b"]), range(5)],
  160. codes=[[0] * 5, range(5)],
  161. names=["Index1", "Index2"]))
  162. result = g.get_group('a')
  163. assert_frame_equal(result, expected)
  164. @pytest.mark.xfail(PY37, reason="flaky on 3.7, xref gh-21636", strict=False)
  165. @pytest.mark.parametrize('ordered', [True, False])
  166. def test_apply(ordered):
  167. # GH 10138
  168. dense = Categorical(list('abc'), ordered=ordered)
  169. # 'b' is in the categories but not in the list
  170. missing = Categorical(
  171. list('aaa'), categories=['a', 'b'], ordered=ordered)
  172. values = np.arange(len(dense))
  173. df = DataFrame({'missing': missing,
  174. 'dense': dense,
  175. 'values': values})
  176. grouped = df.groupby(['missing', 'dense'], observed=True)
  177. # missing category 'b' should still exist in the output index
  178. idx = MultiIndex.from_arrays(
  179. [missing, dense], names=['missing', 'dense'])
  180. expected = DataFrame([0, 1, 2.],
  181. index=idx,
  182. columns=['values'])
  183. result = grouped.apply(lambda x: np.mean(x))
  184. assert_frame_equal(result, expected)
  185. # we coerce back to ints
  186. expected = expected.astype('int')
  187. result = grouped.mean()
  188. assert_frame_equal(result, expected)
  189. result = grouped.agg(np.mean)
  190. assert_frame_equal(result, expected)
  191. # but for transform we should still get back the original index
  192. idx = MultiIndex.from_arrays([missing, dense],
  193. names=['missing', 'dense'])
  194. expected = Series(1, index=idx)
  195. result = grouped.apply(lambda x: 1)
  196. assert_series_equal(result, expected)
  197. def test_observed(observed):
  198. # multiple groupers, don't re-expand the output space
  199. # of the grouper
  200. # gh-14942 (implement)
  201. # gh-10132 (back-compat)
  202. # gh-8138 (back-compat)
  203. # gh-8869
  204. cat1 = Categorical(["a", "a", "b", "b"],
  205. categories=["a", "b", "z"], ordered=True)
  206. cat2 = Categorical(["c", "d", "c", "d"],
  207. categories=["c", "d", "y"], ordered=True)
  208. df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
  209. df['C'] = ['foo', 'bar'] * 2
  210. # multiple groupers with a non-cat
  211. gb = df.groupby(['A', 'B', 'C'], observed=observed)
  212. exp_index = pd.MultiIndex.from_arrays(
  213. [cat1, cat2, ['foo', 'bar'] * 2],
  214. names=['A', 'B', 'C'])
  215. expected = DataFrame({'values': Series(
  216. [1, 2, 3, 4], index=exp_index)}).sort_index()
  217. result = gb.sum()
  218. if not observed:
  219. expected = cartesian_product_for_groupers(
  220. expected,
  221. [cat1, cat2, ['foo', 'bar']],
  222. list('ABC'))
  223. tm.assert_frame_equal(result, expected)
  224. gb = df.groupby(['A', 'B'], observed=observed)
  225. exp_index = pd.MultiIndex.from_arrays(
  226. [cat1, cat2],
  227. names=['A', 'B'])
  228. expected = DataFrame({'values': [1, 2, 3, 4]},
  229. index=exp_index)
  230. result = gb.sum()
  231. if not observed:
  232. expected = cartesian_product_for_groupers(
  233. expected,
  234. [cat1, cat2],
  235. list('AB'))
  236. tm.assert_frame_equal(result, expected)
  237. # https://github.com/pandas-dev/pandas/issues/8138
  238. d = {'cat':
  239. pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"],
  240. ordered=True),
  241. 'ints': [1, 1, 2, 2],
  242. 'val': [10, 20, 30, 40]}
  243. df = pd.DataFrame(d)
  244. # Grouping on a single column
  245. groups_single_key = df.groupby("cat", observed=observed)
  246. result = groups_single_key.mean()
  247. exp_index = pd.CategoricalIndex(list('ab'), name="cat",
  248. categories=list('abc'),
  249. ordered=True)
  250. expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]},
  251. index=exp_index)
  252. if not observed:
  253. index = pd.CategoricalIndex(list('abc'), name="cat",
  254. categories=list('abc'),
  255. ordered=True)
  256. expected = expected.reindex(index)
  257. tm.assert_frame_equal(result, expected)
  258. # Grouping on two columns
  259. groups_double_key = df.groupby(["cat", "ints"], observed=observed)
  260. result = groups_double_key.agg('mean')
  261. expected = DataFrame(
  262. {"val": [10, 30, 20, 40],
  263. "cat": pd.Categorical(['a', 'a', 'b', 'b'],
  264. categories=['a', 'b', 'c'],
  265. ordered=True),
  266. "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"])
  267. if not observed:
  268. expected = cartesian_product_for_groupers(
  269. expected,
  270. [df.cat.values, [1, 2]],
  271. ['cat', 'ints'])
  272. tm.assert_frame_equal(result, expected)
  273. # GH 10132
  274. for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
  275. c, i = key
  276. result = groups_double_key.get_group(key)
  277. expected = df[(df.cat == c) & (df.ints == i)]
  278. assert_frame_equal(result, expected)
  279. # gh-8869
  280. # with as_index
  281. d = {'foo': [10, 8, 4, 8, 4, 1, 1], 'bar': [10, 20, 30, 40, 50, 60, 70],
  282. 'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']}
  283. df = pd.DataFrame(d)
  284. cat = pd.cut(df['foo'], np.linspace(0, 10, 3))
  285. df['range'] = cat
  286. groups = df.groupby(['range', 'baz'], as_index=False, observed=observed)
  287. result = groups.agg('mean')
  288. groups2 = df.groupby(['range', 'baz'], as_index=True, observed=observed)
  289. expected = groups2.agg('mean').reset_index()
  290. tm.assert_frame_equal(result, expected)
  291. def test_observed_codes_remap(observed):
  292. d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
  293. df = pd.DataFrame(d)
  294. values = pd.cut(df['C1'], [1, 2, 3, 6])
  295. values.name = "cat"
  296. groups_double_key = df.groupby([values, 'C2'], observed=observed)
  297. idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]],
  298. names=["cat", "C2"])
  299. expected = DataFrame({"C1": [3, 3, 4, 5],
  300. "C3": [10, 100, 200, 34]}, index=idx)
  301. if not observed:
  302. expected = cartesian_product_for_groupers(
  303. expected,
  304. [values.values, [1, 2, 3, 4]],
  305. ['cat', 'C2'])
  306. result = groups_double_key.agg('mean')
  307. tm.assert_frame_equal(result, expected)
  308. def test_observed_perf():
  309. # we create a cartesian product, so this is
  310. # non-performant if we don't use observed values
  311. # gh-14942
  312. df = DataFrame({
  313. 'cat': np.random.randint(0, 255, size=30000),
  314. 'int_id': np.random.randint(0, 255, size=30000),
  315. 'other_id': np.random.randint(0, 10000, size=30000),
  316. 'foo': 0})
  317. df['cat'] = df.cat.astype(str).astype('category')
  318. grouped = df.groupby(['cat', 'int_id', 'other_id'], observed=True)
  319. result = grouped.count()
  320. assert result.index.levels[0].nunique() == df.cat.nunique()
  321. assert result.index.levels[1].nunique() == df.int_id.nunique()
  322. assert result.index.levels[2].nunique() == df.other_id.nunique()
  323. def test_observed_groups(observed):
  324. # gh-20583
  325. # test that we have the appropriate groups
  326. cat = pd.Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c'])
  327. df = pd.DataFrame({'cat': cat, 'vals': [1, 2, 3]})
  328. g = df.groupby('cat', observed=observed)
  329. result = g.groups
  330. if observed:
  331. expected = {'a': Index([0, 2], dtype='int64'),
  332. 'c': Index([1], dtype='int64')}
  333. else:
  334. expected = {'a': Index([0, 2], dtype='int64'),
  335. 'b': Index([], dtype='int64'),
  336. 'c': Index([1], dtype='int64')}
  337. tm.assert_dict_equal(result, expected)
  338. def test_observed_groups_with_nan(observed):
  339. # GH 24740
  340. df = pd.DataFrame({'cat': pd.Categorical(['a', np.nan, 'a'],
  341. categories=['a', 'b', 'd']),
  342. 'vals': [1, 2, 3]})
  343. g = df.groupby('cat', observed=observed)
  344. result = g.groups
  345. if observed:
  346. expected = {'a': Index([0, 2], dtype='int64')}
  347. else:
  348. expected = {'a': Index([0, 2], dtype='int64'),
  349. 'b': Index([], dtype='int64'),
  350. 'd': Index([], dtype='int64')}
  351. tm.assert_dict_equal(result, expected)
  352. def test_dataframe_categorical_with_nan(observed):
  353. # GH 21151
  354. s1 = pd.Categorical([np.nan, 'a', np.nan, 'a'],
  355. categories=['a', 'b', 'c'])
  356. s2 = pd.Series([1, 2, 3, 4])
  357. df = pd.DataFrame({'s1': s1, 's2': s2})
  358. result = df.groupby('s1', observed=observed).first().reset_index()
  359. if observed:
  360. expected = DataFrame({'s1': pd.Categorical(['a'],
  361. categories=['a', 'b', 'c']), 's2': [2]})
  362. else:
  363. expected = DataFrame({'s1': pd.Categorical(['a', 'b', 'c'],
  364. categories=['a', 'b', 'c']),
  365. 's2': [2, np.nan, np.nan]})
  366. tm.assert_frame_equal(result, expected)
  367. def test_datetime():
  368. # GH9049: ensure backward compatibility
  369. levels = pd.date_range('2014-01-01', periods=4)
  370. codes = np.random.randint(0, 4, size=100)
  371. cats = Categorical.from_codes(codes, levels, ordered=True)
  372. data = DataFrame(np.random.randn(100, 4))
  373. result = data.groupby(cats, observed=False).mean()
  374. expected = data.groupby(np.asarray(cats), observed=False).mean()
  375. expected = expected.reindex(levels)
  376. expected.index = CategoricalIndex(expected.index,
  377. categories=expected.index,
  378. ordered=True)
  379. assert_frame_equal(result, expected)
  380. grouped = data.groupby(cats, observed=False)
  381. desc_result = grouped.describe()
  382. idx = cats.codes.argsort()
  383. ord_labels = cats.take_nd(idx)
  384. ord_data = data.take(idx)
  385. expected = ord_data.groupby(ord_labels, observed=False).describe()
  386. assert_frame_equal(desc_result, expected)
  387. tm.assert_index_equal(desc_result.index, expected.index)
  388. tm.assert_index_equal(
  389. desc_result.index.get_level_values(0),
  390. expected.index.get_level_values(0))
  391. # GH 10460
  392. expc = Categorical.from_codes(
  393. np.arange(4).repeat(8), levels, ordered=True)
  394. exp = CategoricalIndex(expc)
  395. tm.assert_index_equal((desc_result.stack().index
  396. .get_level_values(0)), exp)
  397. exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
  398. '75%', 'max'] * 4)
  399. tm.assert_index_equal((desc_result.stack().index
  400. .get_level_values(1)), exp)
  401. def test_categorical_index():
  402. s = np.random.RandomState(12345)
  403. levels = ['foo', 'bar', 'baz', 'qux']
  404. codes = s.randint(0, 4, size=20)
  405. cats = Categorical.from_codes(codes, levels, ordered=True)
  406. df = DataFrame(
  407. np.repeat(
  408. np.arange(20), 4).reshape(-1, 4), columns=list('abcd'))
  409. df['cats'] = cats
  410. # with a cat index
  411. result = df.set_index('cats').groupby(level=0, observed=False).sum()
  412. expected = df[list('abcd')].groupby(cats.codes, observed=False).sum()
  413. expected.index = CategoricalIndex(
  414. Categorical.from_codes(
  415. [0, 1, 2, 3], levels, ordered=True), name='cats')
  416. assert_frame_equal(result, expected)
  417. # with a cat column, should produce a cat index
  418. result = df.groupby('cats', observed=False).sum()
  419. expected = df[list('abcd')].groupby(cats.codes, observed=False).sum()
  420. expected.index = CategoricalIndex(
  421. Categorical.from_codes(
  422. [0, 1, 2, 3], levels, ordered=True), name='cats')
  423. assert_frame_equal(result, expected)
  424. def test_describe_categorical_columns():
  425. # GH 11558
  426. cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'],
  427. categories=['foo', 'bar', 'baz', 'qux'],
  428. ordered=True)
  429. df = DataFrame(np.random.randn(20, 4), columns=cats)
  430. result = df.groupby([1, 2, 3, 4] * 5).describe()
  431. tm.assert_index_equal(result.stack().columns, cats)
  432. tm.assert_categorical_equal(result.stack().columns.values, cats.values)
  433. def test_unstack_categorical():
  434. # GH11558 (example is taken from the original issue)
  435. df = pd.DataFrame({'a': range(10),
  436. 'medium': ['A', 'B'] * 5,
  437. 'artist': list('XYXXY') * 2})
  438. df['medium'] = df['medium'].astype('category')
  439. gcat = df.groupby(
  440. ['artist', 'medium'], observed=False)['a'].count().unstack()
  441. result = gcat.describe()
  442. exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False,
  443. name='medium')
  444. tm.assert_index_equal(result.columns, exp_columns)
  445. tm.assert_categorical_equal(result.columns.values, exp_columns.values)
  446. result = gcat['A'] + gcat['B']
  447. expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist'))
  448. tm.assert_series_equal(result, expected)
  449. def test_bins_unequal_len():
  450. # GH3011
  451. series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
  452. bins = pd.cut(series.dropna().values, 4)
  453. # len(bins) != len(series) here
  454. with pytest.raises(ValueError):
  455. series.groupby(bins).mean()
  456. def test_as_index():
  457. # GH13204
  458. df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]),
  459. 'A': [10, 11, 11],
  460. 'B': [101, 102, 103]})
  461. result = df.groupby(['cat', 'A'], as_index=False, observed=True).sum()
  462. expected = DataFrame(
  463. {'cat': Categorical([1, 2], categories=df.cat.cat.categories),
  464. 'A': [10, 11],
  465. 'B': [101, 205]},
  466. columns=['cat', 'A', 'B'])
  467. tm.assert_frame_equal(result, expected)
  468. # function grouper
  469. f = lambda r: df.loc[r, 'A']
  470. result = df.groupby(['cat', f], as_index=False, observed=True).sum()
  471. expected = DataFrame(
  472. {'cat': Categorical([1, 2], categories=df.cat.cat.categories),
  473. 'A': [10, 22],
  474. 'B': [101, 205]},
  475. columns=['cat', 'A', 'B'])
  476. tm.assert_frame_equal(result, expected)
  477. # another not in-axis grouper (conflicting names in index)
  478. s = Series(['a', 'b', 'b'], name='cat')
  479. result = df.groupby(['cat', s], as_index=False, observed=True).sum()
  480. tm.assert_frame_equal(result, expected)
  481. # is original index dropped?
  482. group_columns = ['cat', 'A']
  483. expected = DataFrame(
  484. {'cat': Categorical([1, 2], categories=df.cat.cat.categories),
  485. 'A': [10, 11],
  486. 'B': [101, 205]},
  487. columns=['cat', 'A', 'B'])
  488. for name in [None, 'X', 'B']:
  489. df.index = Index(list("abc"), name=name)
  490. result = df.groupby(group_columns, as_index=False, observed=True).sum()
  491. tm.assert_frame_equal(result, expected)
  492. def test_preserve_categories():
  493. # GH-13179
  494. categories = list('abc')
  495. # ordered=True
  496. df = DataFrame({'A': pd.Categorical(list('ba'),
  497. categories=categories,
  498. ordered=True)})
  499. index = pd.CategoricalIndex(categories, categories, ordered=True)
  500. tm.assert_index_equal(
  501. df.groupby('A', sort=True, observed=False).first().index, index)
  502. tm.assert_index_equal(
  503. df.groupby('A', sort=False, observed=False).first().index, index)
  504. # ordered=False
  505. df = DataFrame({'A': pd.Categorical(list('ba'),
  506. categories=categories,
  507. ordered=False)})
  508. sort_index = pd.CategoricalIndex(categories, categories, ordered=False)
  509. nosort_index = pd.CategoricalIndex(list('bac'), list('bac'),
  510. ordered=False)
  511. tm.assert_index_equal(
  512. df.groupby('A', sort=True, observed=False).first().index,
  513. sort_index)
  514. tm.assert_index_equal(
  515. df.groupby('A', sort=False, observed=False).first().index,
  516. nosort_index)
  517. def test_preserve_categorical_dtype():
  518. # GH13743, GH13854
  519. df = DataFrame({'A': [1, 2, 1, 1, 2],
  520. 'B': [10, 16, 22, 28, 34],
  521. 'C1': Categorical(list("abaab"),
  522. categories=list("bac"),
  523. ordered=False),
  524. 'C2': Categorical(list("abaab"),
  525. categories=list("bac"),
  526. ordered=True)})
  527. # single grouper
  528. exp_full = DataFrame({'A': [2.0, 1.0, np.nan],
  529. 'B': [25.0, 20.0, np.nan],
  530. 'C1': Categorical(list("bac"),
  531. categories=list("bac"),
  532. ordered=False),
  533. 'C2': Categorical(list("bac"),
  534. categories=list("bac"),
  535. ordered=True)})
  536. for col in ['C1', 'C2']:
  537. result1 = df.groupby(by=col, as_index=False, observed=False).mean()
  538. result2 = df.groupby(
  539. by=col, as_index=True, observed=False).mean().reset_index()
  540. expected = exp_full.reindex(columns=result1.columns)
  541. tm.assert_frame_equal(result1, expected)
  542. tm.assert_frame_equal(result2, expected)
  543. def test_categorical_no_compress():
  544. data = Series(np.random.randn(9))
  545. codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
  546. cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)
  547. result = data.groupby(cats, observed=False).mean()
  548. exp = data.groupby(codes, observed=False).mean()
  549. exp.index = CategoricalIndex(exp.index, categories=cats.categories,
  550. ordered=cats.ordered)
  551. assert_series_equal(result, exp)
  552. codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
  553. cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)
  554. result = data.groupby(cats, observed=False).mean()
  555. exp = data.groupby(codes, observed=False).mean().reindex(cats.categories)
  556. exp.index = CategoricalIndex(exp.index, categories=cats.categories,
  557. ordered=cats.ordered)
  558. assert_series_equal(result, exp)
  559. cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
  560. categories=["a", "b", "c", "d"], ordered=True)
  561. data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
  562. result = data.groupby("b", observed=False).mean()
  563. result = result["a"].values
  564. exp = np.array([1, 2, 4, np.nan])
  565. tm.assert_numpy_array_equal(result, exp)
  566. def test_sort():
  567. # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby # noqa: flake8
  568. # This should result in a properly sorted Series so that the plot
  569. # has a sorted x axis
  570. # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')
  571. df = DataFrame({'value': np.random.randint(0, 10000, 100)})
  572. labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
  573. cat_labels = Categorical(labels, labels)
  574. df = df.sort_values(by=['value'], ascending=True)
  575. df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
  576. right=False, labels=cat_labels)
  577. res = df.groupby(['value_group'], observed=False)['value_group'].count()
  578. exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))]
  579. exp.index = CategoricalIndex(exp.index, name=exp.index.name)
  580. tm.assert_series_equal(res, exp)
  581. def test_sort2():
  582. # dataframe groupby sort was being ignored # GH 8868
  583. df = DataFrame([['(7.5, 10]', 10, 10],
  584. ['(7.5, 10]', 8, 20],
  585. ['(2.5, 5]', 5, 30],
  586. ['(5, 7.5]', 6, 40],
  587. ['(2.5, 5]', 4, 50],
  588. ['(0, 2.5]', 1, 60],
  589. ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar'])
  590. df['range'] = Categorical(df['range'], ordered=True)
  591. index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
  592. '(7.5, 10]'], name='range', ordered=True)
  593. expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
  594. columns=['foo', 'bar'], index=index)
  595. col = 'range'
  596. result_sort = df.groupby(col, sort=True, observed=False).first()
  597. assert_frame_equal(result_sort, expected_sort)
  598. # when categories is ordered, group is ordered by category's order
  599. expected_sort = result_sort
  600. result_sort = df.groupby(col, sort=False, observed=False).first()
  601. assert_frame_equal(result_sort, expected_sort)
  602. df['range'] = Categorical(df['range'], ordered=False)
  603. index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
  604. '(7.5, 10]'], name='range')
  605. expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
  606. columns=['foo', 'bar'], index=index)
  607. index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]',
  608. '(0, 2.5]'],
  609. categories=['(7.5, 10]', '(2.5, 5]',
  610. '(5, 7.5]', '(0, 2.5]'],
  611. name='range')
  612. expected_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
  613. index=index, columns=['foo', 'bar'])
  614. col = 'range'
  615. # this is an unordered categorical, but we allow this ####
  616. result_sort = df.groupby(col, sort=True, observed=False).first()
  617. assert_frame_equal(result_sort, expected_sort)
  618. result_nosort = df.groupby(col, sort=False, observed=False).first()
  619. assert_frame_equal(result_nosort, expected_nosort)
  620. def test_sort_datetimelike():
  621. # GH10505
  622. # use same data as test_groupby_sort_categorical, which category is
  623. # corresponding to datetime.month
  624. df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
  625. datetime(2011, 2, 1), datetime(2011, 5, 1),
  626. datetime(2011, 2, 1), datetime(2011, 1, 1),
  627. datetime(2011, 5, 1)],
  628. 'foo': [10, 8, 5, 6, 4, 1, 7],
  629. 'bar': [10, 20, 30, 40, 50, 60, 70]},
  630. columns=['dt', 'foo', 'bar'])
  631. # ordered=True
  632. df['dt'] = Categorical(df['dt'], ordered=True)
  633. index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
  634. datetime(2011, 5, 1), datetime(2011, 7, 1)]
  635. result_sort = DataFrame(
  636. [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
  637. result_sort.index = CategoricalIndex(index, name='dt', ordered=True)
  638. index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
  639. datetime(2011, 5, 1), datetime(2011, 1, 1)]
  640. result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
  641. columns=['foo', 'bar'])
  642. result_nosort.index = CategoricalIndex(index, categories=index,
  643. name='dt', ordered=True)
  644. col = 'dt'
  645. assert_frame_equal(
  646. result_sort, df.groupby(col, sort=True, observed=False).first())
  647. # when categories is ordered, group is ordered by category's order
  648. assert_frame_equal(
  649. result_sort, df.groupby(col, sort=False, observed=False).first())
  650. # ordered = False
  651. df['dt'] = Categorical(df['dt'], ordered=False)
  652. index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
  653. datetime(2011, 5, 1), datetime(2011, 7, 1)]
  654. result_sort = DataFrame(
  655. [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
  656. result_sort.index = CategoricalIndex(index, name='dt')
  657. index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
  658. datetime(2011, 5, 1), datetime(2011, 1, 1)]
  659. result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
  660. columns=['foo', 'bar'])
  661. result_nosort.index = CategoricalIndex(index, categories=index,
  662. name='dt')
  663. col = 'dt'
  664. assert_frame_equal(
  665. result_sort, df.groupby(col, sort=True, observed=False).first())
  666. assert_frame_equal(
  667. result_nosort, df.groupby(col, sort=False, observed=False).first())
  668. def test_empty_sum():
  669. # https://github.com/pandas-dev/pandas/issues/18678
  670. df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
  671. categories=['a', 'b', 'c']),
  672. 'B': [1, 2, 1]})
  673. expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
  674. # 0 by default
  675. result = df.groupby("A", observed=False).B.sum()
  676. expected = pd.Series([3, 1, 0], expected_idx, name='B')
  677. tm.assert_series_equal(result, expected)
  678. # min_count=0
  679. result = df.groupby("A", observed=False).B.sum(min_count=0)
  680. expected = pd.Series([3, 1, 0], expected_idx, name='B')
  681. tm.assert_series_equal(result, expected)
  682. # min_count=1
  683. result = df.groupby("A", observed=False).B.sum(min_count=1)
  684. expected = pd.Series([3, 1, np.nan], expected_idx, name='B')
  685. tm.assert_series_equal(result, expected)
  686. # min_count>1
  687. result = df.groupby("A", observed=False).B.sum(min_count=2)
  688. expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B')
  689. tm.assert_series_equal(result, expected)
  690. def test_empty_prod():
  691. # https://github.com/pandas-dev/pandas/issues/18678
  692. df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
  693. categories=['a', 'b', 'c']),
  694. 'B': [1, 2, 1]})
  695. expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
  696. # 1 by default
  697. result = df.groupby("A", observed=False).B.prod()
  698. expected = pd.Series([2, 1, 1], expected_idx, name='B')
  699. tm.assert_series_equal(result, expected)
  700. # min_count=0
  701. result = df.groupby("A", observed=False).B.prod(min_count=0)
  702. expected = pd.Series([2, 1, 1], expected_idx, name='B')
  703. tm.assert_series_equal(result, expected)
  704. # min_count=1
  705. result = df.groupby("A", observed=False).B.prod(min_count=1)
  706. expected = pd.Series([2, 1, np.nan], expected_idx, name='B')
  707. tm.assert_series_equal(result, expected)
  708. def test_groupby_multiindex_categorical_datetime():
  709. # https://github.com/pandas-dev/pandas/issues/21390
  710. df = pd.DataFrame({
  711. 'key1': pd.Categorical(list('abcbabcba')),
  712. 'key2': pd.Categorical(
  713. list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3),
  714. 'values': np.arange(9),
  715. })
  716. result = df.groupby(['key1', 'key2']).mean()
  717. idx = pd.MultiIndex.from_product(
  718. [pd.Categorical(['a', 'b', 'c']),
  719. pd.Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))],
  720. names=['key1', 'key2'])
  721. expected = pd.DataFrame(
  722. {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx)
  723. assert_frame_equal(result, expected)
  724. @pytest.mark.parametrize("as_index, expected", [
  725. (True, pd.Series(
  726. index=pd.MultiIndex.from_arrays(
  727. [pd.Series([1, 1, 2], dtype='category'),
  728. [1, 2, 2]], names=['a', 'b']
  729. ),
  730. data=[1, 2, 3], name='x'
  731. )),
  732. (False, pd.DataFrame({
  733. 'a': pd.Series([1, 1, 2], dtype='category'),
  734. 'b': [1, 2, 2],
  735. 'x': [1, 2, 3]
  736. }))
  737. ])
  738. def test_groupby_agg_observed_true_single_column(as_index, expected):
  739. # GH-23970
  740. df = pd.DataFrame({
  741. 'a': pd.Series([1, 1, 2], dtype='category'),
  742. 'b': [1, 2, 2],
  743. 'x': [1, 2, 3]
  744. })
  745. result = df.groupby(
  746. ['a', 'b'], as_index=as_index, observed=True)['x'].sum()
  747. assert_equal(result, expected)
  748. @pytest.mark.parametrize('fill_value', [None, np.nan, pd.NaT])
  749. def test_shift(fill_value):
  750. ct = pd.Categorical(['a', 'b', 'c', 'd'],
  751. categories=['a', 'b', 'c', 'd'], ordered=False)
  752. expected = pd.Categorical([None, 'a', 'b', 'c'],
  753. categories=['a', 'b', 'c', 'd'], ordered=False)
  754. res = ct.shift(1, fill_value=fill_value)
  755. assert_equal(res, expected)