# test_apply.py -- groupby .apply() behavior tests
  1. from datetime import datetime
  2. import numpy as np
  3. import pytest
  4. import pandas as pd
  5. from pandas import DataFrame, Index, MultiIndex, Series, bdate_range, compat
  6. from pandas.util import testing as tm
  7. def test_apply_issues():
  8. # GH 5788
  9. s = """2011.05.16,00:00,1.40893
  10. 2011.05.16,01:00,1.40760
  11. 2011.05.16,02:00,1.40750
  12. 2011.05.16,03:00,1.40649
  13. 2011.05.17,02:00,1.40893
  14. 2011.05.17,03:00,1.40760
  15. 2011.05.17,04:00,1.40750
  16. 2011.05.17,05:00,1.40649
  17. 2011.05.18,02:00,1.40893
  18. 2011.05.18,03:00,1.40760
  19. 2011.05.18,04:00,1.40750
  20. 2011.05.18,05:00,1.40649"""
  21. df = pd.read_csv(
  22. compat.StringIO(s), header=None, names=['date', 'time', 'value'],
  23. parse_dates=[['date', 'time']])
  24. df = df.set_index('date_time')
  25. expected = df.groupby(df.index.date).idxmax()
  26. result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
  27. tm.assert_frame_equal(result, expected)
  28. # GH 5789
  29. # don't auto coerce dates
  30. df = pd.read_csv(
  31. compat.StringIO(s), header=None, names=['date', 'time', 'value'])
  32. exp_idx = pd.Index(
  33. ['2011.05.16', '2011.05.17', '2011.05.18'
  34. ], dtype=object, name='date')
  35. expected = Series(['00:00', '02:00', '02:00'], index=exp_idx)
  36. result = df.groupby('date').apply(
  37. lambda x: x['time'][x['value'].idxmax()])
  38. tm.assert_series_equal(result, expected)
  39. def test_apply_trivial():
  40. # GH 20066
  41. # trivial apply: ignore input and return a constant dataframe.
  42. df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'],
  43. 'data': [1.0, 2.0, 3.0, 4.0, 5.0]},
  44. columns=['key', 'data'])
  45. expected = pd.concat([df.iloc[1:], df.iloc[1:]],
  46. axis=1, keys=['float64', 'object'])
  47. result = df.groupby([str(x) for x in df.dtypes],
  48. axis=1).apply(lambda x: df.iloc[1:])
  49. tm.assert_frame_equal(result, expected)
  50. @pytest.mark.xfail(reason="GH#20066; function passed into apply "
  51. "returns a DataFrame with the same index "
  52. "as the one to create GroupBy object.")
  53. def test_apply_trivial_fail():
  54. # GH 20066
  55. # trivial apply fails if the constant dataframe has the same index
  56. # with the one used to create GroupBy object.
  57. df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'],
  58. 'data': [1.0, 2.0, 3.0, 4.0, 5.0]},
  59. columns=['key', 'data'])
  60. expected = pd.concat([df, df],
  61. axis=1, keys=['float64', 'object'])
  62. result = df.groupby([str(x) for x in df.dtypes],
  63. axis=1).apply(lambda x: df)
  64. tm.assert_frame_equal(result, expected)
  65. def test_fast_apply():
  66. # make sure that fast apply is correctly called
  67. # rather than raising any kind of error
  68. # otherwise the python path will be callsed
  69. # which slows things down
  70. N = 1000
  71. labels = np.random.randint(0, 2000, size=N)
  72. labels2 = np.random.randint(0, 3, size=N)
  73. df = DataFrame({'key': labels,
  74. 'key2': labels2,
  75. 'value1': np.random.randn(N),
  76. 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})
  77. def f(g):
  78. return 1
  79. g = df.groupby(['key', 'key2'])
  80. grouper = g.grouper
  81. splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
  82. group_keys = grouper._get_group_keys()
  83. values, mutated = splitter.fast_apply(f, group_keys)
  84. assert not mutated
  85. def test_apply_with_mixed_dtype():
  86. # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
  87. df = DataFrame({'foo1': np.random.randn(6),
  88. 'foo2': ['one', 'two', 'two', 'three', 'one', 'two']})
  89. result = df.apply(lambda x: x, axis=1)
  90. tm.assert_series_equal(df.get_dtype_counts(), result.get_dtype_counts())
  91. # GH 3610 incorrect dtype conversion with as_index=False
  92. df = DataFrame({"c1": [1, 2, 6, 6, 8]})
  93. df["c2"] = df.c1 / 2.0
  94. result1 = df.groupby("c2").mean().reset_index().c2
  95. result2 = df.groupby("c2", as_index=False).mean().c2
  96. tm.assert_series_equal(result1, result2)
  97. def test_groupby_as_index_apply(df):
  98. # GH #4648 and #3417
  99. df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'],
  100. 'user_id': [1, 2, 1, 1, 3, 1],
  101. 'time': range(6)})
  102. g_as = df.groupby('user_id', as_index=True)
  103. g_not_as = df.groupby('user_id', as_index=False)
  104. res_as = g_as.head(2).index
  105. res_not_as = g_not_as.head(2).index
  106. exp = Index([0, 1, 2, 4])
  107. tm.assert_index_equal(res_as, exp)
  108. tm.assert_index_equal(res_not_as, exp)
  109. res_as_apply = g_as.apply(lambda x: x.head(2)).index
  110. res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
  111. # apply doesn't maintain the original ordering
  112. # changed in GH5610 as the as_index=False returns a MI here
  113. exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (
  114. 2, 4)])
  115. tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
  116. exp_as_apply = MultiIndex.from_tuples(tp, names=['user_id', None])
  117. tm.assert_index_equal(res_as_apply, exp_as_apply)
  118. tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)
  119. ind = Index(list('abcde'))
  120. df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
  121. res = df.groupby(0, as_index=False).apply(lambda x: x).index
  122. tm.assert_index_equal(res, ind)
  123. def test_apply_concat_preserve_names(three_group):
  124. grouped = three_group.groupby(['A', 'B'])
  125. def desc(group):
  126. result = group.describe()
  127. result.index.name = 'stat'
  128. return result
  129. def desc2(group):
  130. result = group.describe()
  131. result.index.name = 'stat'
  132. result = result[:len(group)]
  133. # weirdo
  134. return result
  135. def desc3(group):
  136. result = group.describe()
  137. # names are different
  138. result.index.name = 'stat_%d' % len(group)
  139. result = result[:len(group)]
  140. # weirdo
  141. return result
  142. result = grouped.apply(desc)
  143. assert result.index.names == ('A', 'B', 'stat')
  144. result2 = grouped.apply(desc2)
  145. assert result2.index.names == ('A', 'B', 'stat')
  146. result3 = grouped.apply(desc3)
  147. assert result3.index.names == ('A', 'B', None)
  148. def test_apply_series_to_frame():
  149. def f(piece):
  150. with np.errstate(invalid='ignore'):
  151. logged = np.log(piece)
  152. return DataFrame({'value': piece,
  153. 'demeaned': piece - piece.mean(),
  154. 'logged': logged})
  155. dr = bdate_range('1/1/2000', periods=100)
  156. ts = Series(np.random.randn(100), index=dr)
  157. grouped = ts.groupby(lambda x: x.month)
  158. result = grouped.apply(f)
  159. assert isinstance(result, DataFrame)
  160. tm.assert_index_equal(result.index, ts.index)
  161. def test_apply_series_yield_constant(df):
  162. result = df.groupby(['A', 'B'])['C'].apply(len)
  163. assert result.index.names[:2] == ('A', 'B')
  164. def test_apply_frame_yield_constant(df):
  165. # GH13568
  166. result = df.groupby(['A', 'B']).apply(len)
  167. assert isinstance(result, Series)
  168. assert result.name is None
  169. result = df.groupby(['A', 'B'])[['C', 'D']].apply(len)
  170. assert isinstance(result, Series)
  171. assert result.name is None
  172. def test_apply_frame_to_series(df):
  173. grouped = df.groupby(['A', 'B'])
  174. result = grouped.apply(len)
  175. expected = grouped.count()['C']
  176. tm.assert_index_equal(result.index, expected.index)
  177. tm.assert_numpy_array_equal(result.values, expected.values)
  178. def test_apply_frame_concat_series():
  179. def trans(group):
  180. return group.groupby('B')['C'].sum().sort_values()[:2]
  181. def trans2(group):
  182. grouped = group.groupby(df.reindex(group.index)['B'])
  183. return grouped.sum().sort_values()[:2]
  184. df = DataFrame({'A': np.random.randint(0, 5, 1000),
  185. 'B': np.random.randint(0, 5, 1000),
  186. 'C': np.random.randn(1000)})
  187. result = df.groupby('A').apply(trans)
  188. exp = df.groupby('A')['C'].apply(trans2)
  189. tm.assert_series_equal(result, exp, check_names=False)
  190. assert result.name == 'C'
  191. def test_apply_transform(ts):
  192. grouped = ts.groupby(lambda x: x.month)
  193. result = grouped.apply(lambda x: x * 2)
  194. expected = grouped.transform(lambda x: x * 2)
  195. tm.assert_series_equal(result, expected)
  196. def test_apply_multikey_corner(tsframe):
  197. grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
  198. def f(group):
  199. return group.sort_values('A')[-5:]
  200. result = grouped.apply(f)
  201. for key, group in grouped:
  202. tm.assert_frame_equal(result.loc[key], f(group))
  203. def test_apply_chunk_view():
  204. # Low level tinkering could be unsafe, make sure not
  205. df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3],
  206. 'value': compat.lrange(9)})
  207. result = df.groupby('key', group_keys=False).apply(lambda x: x[:2])
  208. expected = df.take([0, 1, 3, 4, 6, 7])
  209. tm.assert_frame_equal(result, expected)
  210. def test_apply_no_name_column_conflict():
  211. df = DataFrame({'name': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
  212. 'name2': [0, 0, 0, 1, 1, 1, 0, 0, 1, 1],
  213. 'value': compat.lrange(10)[::-1]})
  214. # it works! #2605
  215. grouped = df.groupby(['name', 'name2'])
  216. grouped.apply(lambda x: x.sort_values('value', inplace=True))
  217. def test_apply_typecast_fail():
  218. df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
  219. 'c': np.tile(
  220. ['a', 'b', 'c'], 2),
  221. 'v': np.arange(1., 7.)})
  222. def f(group):
  223. v = group['v']
  224. group['v2'] = (v - v.min()) / (v.max() - v.min())
  225. return group
  226. result = df.groupby('d').apply(f)
  227. expected = df.copy()
  228. expected['v2'] = np.tile([0., 0.5, 1], 2)
  229. tm.assert_frame_equal(result, expected)
  230. def test_apply_multiindex_fail():
  231. index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]
  232. ])
  233. df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
  234. 'c': np.tile(['a', 'b', 'c'], 2),
  235. 'v': np.arange(1., 7.)}, index=index)
  236. def f(group):
  237. v = group['v']
  238. group['v2'] = (v - v.min()) / (v.max() - v.min())
  239. return group
  240. result = df.groupby('d').apply(f)
  241. expected = df.copy()
  242. expected['v2'] = np.tile([0., 0.5, 1], 2)
  243. tm.assert_frame_equal(result, expected)
  244. def test_apply_corner(tsframe):
  245. result = tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2)
  246. expected = tsframe * 2
  247. tm.assert_frame_equal(result, expected)
  248. def test_apply_without_copy():
  249. # GH 5545
  250. # returning a non-copy in an applied function fails
  251. data = DataFrame({'id_field': [100, 100, 200, 300],
  252. 'category': ['a', 'b', 'c', 'c'],
  253. 'value': [1, 2, 3, 4]})
  254. def filt1(x):
  255. if x.shape[0] == 1:
  256. return x.copy()
  257. else:
  258. return x[x.category == 'c']
  259. def filt2(x):
  260. if x.shape[0] == 1:
  261. return x
  262. else:
  263. return x[x.category == 'c']
  264. expected = data.groupby('id_field').apply(filt1)
  265. result = data.groupby('id_field').apply(filt2)
  266. tm.assert_frame_equal(result, expected)
  267. def test_apply_corner_cases():
  268. # #535, can't use sliding iterator
  269. N = 1000
  270. labels = np.random.randint(0, 100, size=N)
  271. df = DataFrame({'key': labels,
  272. 'value1': np.random.randn(N),
  273. 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})
  274. grouped = df.groupby('key')
  275. def f(g):
  276. g['value3'] = g['value1'] * 2
  277. return g
  278. result = grouped.apply(f)
  279. assert 'value3' in result
  280. def test_apply_numeric_coercion_when_datetime():
  281. # In the past, group-by/apply operations have been over-eager
  282. # in converting dtypes to numeric, in the presence of datetime
  283. # columns. Various GH issues were filed, the reproductions
  284. # for which are here.
  285. # GH 15670
  286. df = pd.DataFrame({'Number': [1, 2],
  287. 'Date': ["2017-03-02"] * 2,
  288. 'Str': ["foo", "inf"]})
  289. expected = df.groupby(['Number']).apply(lambda x: x.iloc[0])
  290. df.Date = pd.to_datetime(df.Date)
  291. result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
  292. tm.assert_series_equal(result['Str'], expected['Str'])
  293. # GH 15421
  294. df = pd.DataFrame({'A': [10, 20, 30],
  295. 'B': ['foo', '3', '4'],
  296. 'T': [pd.Timestamp("12:31:22")] * 3})
  297. def get_B(g):
  298. return g.iloc[0][['B']]
  299. result = df.groupby('A').apply(get_B)['B']
  300. expected = df.B
  301. expected.index = df.A
  302. tm.assert_series_equal(result, expected)
  303. # GH 14423
  304. def predictions(tool):
  305. out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object)
  306. if 'step1' in list(tool.State):
  307. out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0])
  308. if 'step2' in list(tool.State):
  309. out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0])
  310. out['useTime'] = str(
  311. tool[tool.State == 'step2'].oTime.values[0])
  312. return out
  313. df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'],
  314. 'State': ['step1', 'step2', 'step1', 'step2'],
  315. 'oTime': ['', '2016-09-19 05:24:33',
  316. '', '2016-09-19 23:59:04'],
  317. 'Machine': ['23', '36L', '36R', '36R']})
  318. df2 = df1.copy()
  319. df2.oTime = pd.to_datetime(df2.oTime)
  320. expected = df1.groupby('Key').apply(predictions).p1
  321. result = df2.groupby('Key').apply(predictions).p1
  322. tm.assert_series_equal(expected, result)
  323. def test_time_field_bug():
  324. # Test a fix for the following error related to GH issue 11324 When
  325. # non-key fields in a group-by dataframe contained time-based fields
  326. # that were not returned by the apply function, an exception would be
  327. # raised.
  328. df = pd.DataFrame({'a': 1, 'b': [datetime.now() for nn in range(10)]})
  329. def func_with_no_date(batch):
  330. return pd.Series({'c': 2})
  331. def func_with_date(batch):
  332. return pd.Series({'b': datetime(2015, 1, 1), 'c': 2})
  333. dfg_no_conversion = df.groupby(by=['a']).apply(func_with_no_date)
  334. dfg_no_conversion_expected = pd.DataFrame({'c': 2}, index=[1])
  335. dfg_no_conversion_expected.index.name = 'a'
  336. dfg_conversion = df.groupby(by=['a']).apply(func_with_date)
  337. dfg_conversion_expected = pd.DataFrame(
  338. {'b': datetime(2015, 1, 1),
  339. 'c': 2}, index=[1])
  340. dfg_conversion_expected.index.name = 'a'
  341. tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)
  342. tm.assert_frame_equal(dfg_conversion, dfg_conversion_expected)
  343. def test_gb_apply_list_of_unequal_len_arrays():
  344. # GH1738
  345. df = DataFrame({'group1': ['a', 'a', 'a', 'b', 'b', 'b', 'a', 'a', 'a',
  346. 'b', 'b', 'b'],
  347. 'group2': ['c', 'c', 'd', 'd', 'd', 'e', 'c', 'c', 'd',
  348. 'd', 'd', 'e'],
  349. 'weight': [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2],
  350. 'value': [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3]})
  351. df = df.set_index(['group1', 'group2'])
  352. df_grouped = df.groupby(level=['group1', 'group2'], sort=True)
  353. def noddy(value, weight):
  354. out = np.array(value * weight).repeat(3)
  355. return out
  356. # the kernel function returns arrays of unequal length
  357. # pandas sniffs the first one, sees it's an array and not
  358. # a list, and assumed the rest are of equal length
  359. # and so tries a vstack
  360. # don't die
  361. df_grouped.apply(lambda x: noddy(x.value, x.weight))
  362. def test_groupby_apply_all_none():
  363. # Tests to make sure no errors if apply function returns all None
  364. # values. Issue 9684.
  365. test_df = DataFrame({'groups': [0, 0, 1, 1],
  366. 'random_vars': [8, 7, 4, 5]})
  367. def test_func(x):
  368. pass
  369. result = test_df.groupby('groups').apply(test_func)
  370. expected = DataFrame()
  371. tm.assert_frame_equal(result, expected)
  372. def test_groupby_apply_none_first():
  373. # GH 12824. Tests if apply returns None first.
  374. test_df1 = DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]})
  375. test_df2 = DataFrame({'groups': [1, 2, 2, 2], 'vars': [0, 1, 2, 3]})
  376. def test_func(x):
  377. if x.shape[0] < 2:
  378. return None
  379. return x.iloc[[0, -1]]
  380. result1 = test_df1.groupby('groups').apply(test_func)
  381. result2 = test_df2.groupby('groups').apply(test_func)
  382. index1 = MultiIndex.from_arrays([[1, 1], [0, 2]],
  383. names=['groups', None])
  384. index2 = MultiIndex.from_arrays([[2, 2], [1, 3]],
  385. names=['groups', None])
  386. expected1 = DataFrame({'groups': [1, 1], 'vars': [0, 2]},
  387. index=index1)
  388. expected2 = DataFrame({'groups': [2, 2], 'vars': [1, 3]},
  389. index=index2)
  390. tm.assert_frame_equal(result1, expected1)
  391. tm.assert_frame_equal(result2, expected2)
  392. def test_groupby_apply_return_empty_chunk():
  393. # GH 22221: apply filter which returns some empty groups
  394. df = pd.DataFrame(dict(value=[0, 1], group=['filled', 'empty']))
  395. groups = df.groupby('group')
  396. result = groups.apply(lambda group: group[group.value != 1]['value'])
  397. expected = pd.Series([0], name='value',
  398. index=MultiIndex.from_product([['empty', 'filled'],
  399. [0]],
  400. names=['group', None]
  401. ).drop('empty'))
  402. tm.assert_series_equal(result, expected)
  403. def test_apply_with_mixed_types():
  404. # gh-20949
  405. df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1, 2, 3], 'C': [4, 6, 5]})
  406. g = df.groupby('A')
  407. result = g.transform(lambda x: x / x.sum())
  408. expected = pd.DataFrame({'B': [1 / 3., 2 / 3., 1], 'C': [0.4, 0.6, 1.0]})
  409. tm.assert_frame_equal(result, expected)
  410. result = g.apply(lambda x: x / x.sum())
  411. tm.assert_frame_equal(result, expected)