# test_transform.py
  1. """ test with the .transform """
  2. import numpy as np
  3. import pytest
  4. from pandas._libs import groupby
  5. from pandas.compat import StringIO
  6. from pandas.core.dtypes.common import ensure_platform_int, is_timedelta64_dtype
  7. import pandas as pd
  8. from pandas import DataFrame, MultiIndex, Series, Timestamp, concat, date_range
  9. from pandas.core.config import option_context
  10. from pandas.core.groupby.groupby import DataError
  11. from pandas.util import testing as tm
  12. from pandas.util.testing import assert_frame_equal, assert_series_equal
  13. def assert_fp_equal(a, b):
  14. assert (np.abs(a - b) < 1e-12).all()
  15. def test_transform():
  16. data = Series(np.arange(9) // 3, index=np.arange(9))
  17. index = np.arange(9)
  18. np.random.shuffle(index)
  19. data = data.reindex(index)
  20. grouped = data.groupby(lambda x: x // 3)
  21. transformed = grouped.transform(lambda x: x * x.sum())
  22. assert transformed[7] == 12
  23. # GH 8046
  24. # make sure that we preserve the input order
  25. df = DataFrame(
  26. np.arange(6, dtype='int64').reshape(
  27. 3, 2), columns=["a", "b"], index=[0, 2, 1])
  28. key = [0, 0, 1]
  29. expected = df.sort_index().groupby(key).transform(
  30. lambda x: x - x.mean()).groupby(key).mean()
  31. result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(
  32. key).mean()
  33. assert_frame_equal(result, expected)
  34. def demean(arr):
  35. return arr - arr.mean()
  36. people = DataFrame(np.random.randn(5, 5),
  37. columns=['a', 'b', 'c', 'd', 'e'],
  38. index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
  39. key = ['one', 'two', 'one', 'two', 'one']
  40. result = people.groupby(key).transform(demean).groupby(key).mean()
  41. expected = people.groupby(key).apply(demean).groupby(key).mean()
  42. assert_frame_equal(result, expected)
  43. # GH 8430
  44. df = tm.makeTimeDataFrame()
  45. g = df.groupby(pd.Grouper(freq='M'))
  46. g.transform(lambda x: x - 1)
  47. # GH 9700
  48. df = DataFrame({'a': range(5, 10), 'b': range(5)})
  49. result = df.groupby('a').transform(max)
  50. expected = DataFrame({'b': range(5)})
  51. tm.assert_frame_equal(result, expected)
  52. def test_transform_fast():
  53. df = DataFrame({'id': np.arange(100000) / 3,
  54. 'val': np.random.randn(100000)})
  55. grp = df.groupby('id')['val']
  56. values = np.repeat(grp.mean().values,
  57. ensure_platform_int(grp.count().values))
  58. expected = pd.Series(values, index=df.index, name='val')
  59. result = grp.transform(np.mean)
  60. assert_series_equal(result, expected)
  61. result = grp.transform('mean')
  62. assert_series_equal(result, expected)
  63. # GH 12737
  64. df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5],
  65. 'd': pd.date_range('2014-1-1', '2014-1-4'),
  66. 'i': [1, 2, 3, 4]},
  67. columns=['grouping', 'f', 'i', 'd'])
  68. result = df.groupby('grouping').transform('first')
  69. dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'),
  70. pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')]
  71. expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5],
  72. 'd': dates,
  73. 'i': [1, 2, 2, 4]},
  74. columns=['f', 'i', 'd'])
  75. assert_frame_equal(result, expected)
  76. # selection
  77. result = df.groupby('grouping')[['f', 'i']].transform('first')
  78. expected = expected[['f', 'i']]
  79. assert_frame_equal(result, expected)
  80. # dup columns
  81. df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a'])
  82. result = df.groupby('g').transform('first')
  83. expected = df.drop('g', axis=1)
  84. assert_frame_equal(result, expected)
  85. def test_transform_broadcast(tsframe, ts):
  86. grouped = ts.groupby(lambda x: x.month)
  87. result = grouped.transform(np.mean)
  88. tm.assert_index_equal(result.index, ts.index)
  89. for _, gp in grouped:
  90. assert_fp_equal(result.reindex(gp.index), gp.mean())
  91. grouped = tsframe.groupby(lambda x: x.month)
  92. result = grouped.transform(np.mean)
  93. tm.assert_index_equal(result.index, tsframe.index)
  94. for _, gp in grouped:
  95. agged = gp.mean()
  96. res = result.reindex(gp.index)
  97. for col in tsframe:
  98. assert_fp_equal(res[col], agged[col])
  99. # group columns
  100. grouped = tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
  101. axis=1)
  102. result = grouped.transform(np.mean)
  103. tm.assert_index_equal(result.index, tsframe.index)
  104. tm.assert_index_equal(result.columns, tsframe.columns)
  105. for _, gp in grouped:
  106. agged = gp.mean(1)
  107. res = result.reindex(columns=gp.columns)
  108. for idx in gp.index:
  109. assert_fp_equal(res.xs(idx), agged[idx])
  110. def test_transform_axis(tsframe):
  111. # make sure that we are setting the axes
  112. # correctly when on axis=0 or 1
  113. # in the presence of a non-monotonic indexer
  114. # GH12713
  115. base = tsframe.iloc[0:5]
  116. r = len(base.index)
  117. c = len(base.columns)
  118. tso = DataFrame(np.random.randn(r, c),
  119. index=base.index,
  120. columns=base.columns,
  121. dtype='float64')
  122. # monotonic
  123. ts = tso
  124. grouped = ts.groupby(lambda x: x.weekday())
  125. result = ts - grouped.transform('mean')
  126. expected = grouped.apply(lambda x: x - x.mean())
  127. assert_frame_equal(result, expected)
  128. ts = ts.T
  129. grouped = ts.groupby(lambda x: x.weekday(), axis=1)
  130. result = ts - grouped.transform('mean')
  131. expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
  132. assert_frame_equal(result, expected)
  133. # non-monotonic
  134. ts = tso.iloc[[1, 0] + list(range(2, len(base)))]
  135. grouped = ts.groupby(lambda x: x.weekday())
  136. result = ts - grouped.transform('mean')
  137. expected = grouped.apply(lambda x: x - x.mean())
  138. assert_frame_equal(result, expected)
  139. ts = ts.T
  140. grouped = ts.groupby(lambda x: x.weekday(), axis=1)
  141. result = ts - grouped.transform('mean')
  142. expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
  143. assert_frame_equal(result, expected)
  144. def test_transform_dtype():
  145. # GH 9807
  146. # Check transform dtype output is preserved
  147. df = DataFrame([[1, 3], [2, 3]])
  148. result = df.groupby(1).transform('mean')
  149. expected = DataFrame([[1.5], [1.5]])
  150. assert_frame_equal(result, expected)
  151. def test_transform_bug():
  152. # GH 5712
  153. # transforming on a datetime column
  154. df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
  155. result = df.groupby('A')['B'].transform(
  156. lambda x: x.rank(ascending=False))
  157. expected = Series(np.arange(5, 0, step=-1), name='B')
  158. assert_series_equal(result, expected)
  159. def test_transform_numeric_to_boolean():
  160. # GH 16875
  161. # inconsistency in transforming boolean values
  162. expected = pd.Series([True, True], name='A')
  163. df = pd.DataFrame({'A': [1.1, 2.2], 'B': [1, 2]})
  164. result = df.groupby('B').A.transform(lambda x: True)
  165. assert_series_equal(result, expected)
  166. df = pd.DataFrame({'A': [1, 2], 'B': [1, 2]})
  167. result = df.groupby('B').A.transform(lambda x: True)
  168. assert_series_equal(result, expected)
  169. def test_transform_datetime_to_timedelta():
  170. # GH 15429
  171. # transforming a datetime to timedelta
  172. df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
  173. expected = pd.Series([
  174. Timestamp('20130101') - Timestamp('20130101')] * 5, name='A')
  175. # this does date math without changing result type in transform
  176. base_time = df['A'][0]
  177. result = df.groupby('A')['A'].transform(
  178. lambda x: x.max() - x.min() + base_time) - base_time
  179. assert_series_equal(result, expected)
  180. # this does date math and causes the transform to return timedelta
  181. result = df.groupby('A')['A'].transform(lambda x: x.max() - x.min())
  182. assert_series_equal(result, expected)
  183. def test_transform_datetime_to_numeric():
  184. # GH 10972
  185. # convert dt to float
  186. df = DataFrame({
  187. 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')})
  188. result = df.groupby('a').b.transform(
  189. lambda x: x.dt.dayofweek - x.dt.dayofweek.mean())
  190. expected = Series([-0.5, 0.5], name='b')
  191. assert_series_equal(result, expected)
  192. # convert dt to int
  193. df = DataFrame({
  194. 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')})
  195. result = df.groupby('a').b.transform(
  196. lambda x: x.dt.dayofweek - x.dt.dayofweek.min())
  197. expected = Series([0, 1], name='b')
  198. assert_series_equal(result, expected)
  199. def test_transform_casting():
  200. # 13046
  201. data = """
  202. idx A ID3 DATETIME
  203. 0 B-028 b76cd912ff "2014-10-08 13:43:27"
  204. 1 B-054 4a57ed0b02 "2014-10-08 14:26:19"
  205. 2 B-076 1a682034f8 "2014-10-08 14:29:01"
  206. 3 B-023 b76cd912ff "2014-10-08 18:39:34"
  207. 4 B-023 f88g8d7sds "2014-10-08 18:40:18"
  208. 5 B-033 b76cd912ff "2014-10-08 18:44:30"
  209. 6 B-032 b76cd912ff "2014-10-08 18:46:00"
  210. 7 B-037 b76cd912ff "2014-10-08 18:52:15"
  211. 8 B-046 db959faf02 "2014-10-08 18:59:59"
  212. 9 B-053 b76cd912ff "2014-10-08 19:17:48"
  213. 10 B-065 b76cd912ff "2014-10-08 19:21:38"
  214. """
  215. df = pd.read_csv(StringIO(data), sep=r'\s+',
  216. index_col=[0], parse_dates=['DATETIME'])
  217. result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff())
  218. assert is_timedelta64_dtype(result.dtype)
  219. result = df[['ID3', 'DATETIME']].groupby('ID3').transform(
  220. lambda x: x.diff())
  221. assert is_timedelta64_dtype(result.DATETIME.dtype)
  222. def test_transform_multiple(ts):
  223. grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
  224. grouped.transform(lambda x: x * 2)
  225. grouped.transform(np.mean)
  226. def test_dispatch_transform(tsframe):
  227. df = tsframe[::5].reindex(tsframe.index)
  228. grouped = df.groupby(lambda x: x.month)
  229. filled = grouped.fillna(method='pad')
  230. fillit = lambda x: x.fillna(method='pad')
  231. expected = df.groupby(lambda x: x.month).transform(fillit)
  232. assert_frame_equal(filled, expected)
  233. def test_transform_select_columns(df):
  234. f = lambda x: x.mean()
  235. result = df.groupby('A')['C', 'D'].transform(f)
  236. selection = df[['C', 'D']]
  237. expected = selection.groupby(df['A']).transform(f)
  238. assert_frame_equal(result, expected)
  239. def test_transform_exclude_nuisance(df):
  240. # this also tests orderings in transform between
  241. # series/frame to make sure it's consistent
  242. expected = {}
  243. grouped = df.groupby('A')
  244. expected['C'] = grouped['C'].transform(np.mean)
  245. expected['D'] = grouped['D'].transform(np.mean)
  246. expected = DataFrame(expected)
  247. result = df.groupby('A').transform(np.mean)
  248. assert_frame_equal(result, expected)
  249. def test_transform_function_aliases(df):
  250. result = df.groupby('A').transform('mean')
  251. expected = df.groupby('A').transform(np.mean)
  252. assert_frame_equal(result, expected)
  253. result = df.groupby('A')['C'].transform('mean')
  254. expected = df.groupby('A')['C'].transform(np.mean)
  255. assert_series_equal(result, expected)
  256. def test_series_fast_transform_date():
  257. # GH 13191
  258. df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3],
  259. 'd': pd.date_range('2014-1-1', '2014-1-4')})
  260. result = df.groupby('grouping')['d'].transform('first')
  261. dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'),
  262. pd.Timestamp('2014-1-4')]
  263. expected = pd.Series(dates, name='d')
  264. assert_series_equal(result, expected)
  265. def test_transform_length():
  266. # GH 9697
  267. df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]})
  268. expected = pd.Series([3.0] * 4)
  269. def nsum(x):
  270. return np.nansum(x)
  271. results = [df.groupby('col1').transform(sum)['col2'],
  272. df.groupby('col1')['col2'].transform(sum),
  273. df.groupby('col1').transform(nsum)['col2'],
  274. df.groupby('col1')['col2'].transform(nsum)]
  275. for result in results:
  276. assert_series_equal(result, expected, check_names=False)
  277. def test_transform_coercion():
  278. # 14457
  279. # when we are transforming be sure to not coerce
  280. # via assignment
  281. df = pd.DataFrame(dict(A=['a', 'a'], B=[0, 1]))
  282. g = df.groupby('A')
  283. expected = g.transform(np.mean)
  284. result = g.transform(lambda x: np.mean(x))
  285. assert_frame_equal(result, expected)
  286. def test_groupby_transform_with_int():
  287. # GH 3740, make sure that we might upcast on item-by-item transform
  288. # floats
  289. df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=Series(1, dtype='float64'),
  290. C=Series(
  291. [1, 2, 3, 1, 2, 3], dtype='float64'), D='foo'))
  292. with np.errstate(all='ignore'):
  293. result = df.groupby('A').transform(
  294. lambda x: (x - x.mean()) / x.std())
  295. expected = DataFrame(dict(B=np.nan, C=Series(
  296. [-1, 0, 1, -1, 0, 1], dtype='float64')))
  297. assert_frame_equal(result, expected)
  298. # int case
  299. df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1,
  300. C=[1, 2, 3, 1, 2, 3], D='foo'))
  301. with np.errstate(all='ignore'):
  302. result = df.groupby('A').transform(
  303. lambda x: (x - x.mean()) / x.std())
  304. expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1]))
  305. assert_frame_equal(result, expected)
  306. # int that needs float conversion
  307. s = Series([2, 3, 4, 10, 5, -1])
  308. df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D='foo'))
  309. with np.errstate(all='ignore'):
  310. result = df.groupby('A').transform(
  311. lambda x: (x - x.mean()) / x.std())
  312. s1 = s.iloc[0:3]
  313. s1 = (s1 - s1.mean()) / s1.std()
  314. s2 = s.iloc[3:6]
  315. s2 = (s2 - s2.mean()) / s2.std()
  316. expected = DataFrame(dict(B=np.nan, C=concat([s1, s2])))
  317. assert_frame_equal(result, expected)
  318. # int downcasting
  319. result = df.groupby('A').transform(lambda x: x * 2 / 2)
  320. expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1]))
  321. assert_frame_equal(result, expected)
  322. def test_groupby_transform_with_nan_group():
  323. # GH 9941
  324. df = pd.DataFrame({'a': range(10),
  325. 'b': [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]})
  326. result = df.groupby(df.b)['a'].transform(max)
  327. expected = pd.Series([1., 1., 2., 3., np.nan, 6., 6., 9., 9., 9.],
  328. name='a')
  329. assert_series_equal(result, expected)
  330. def test_transform_mixed_type():
  331. index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]
  332. ])
  333. df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
  334. 'c': np.tile(['a', 'b', 'c'], 2),
  335. 'v': np.arange(1., 7.)}, index=index)
  336. def f(group):
  337. group['g'] = group['d'] * 2
  338. return group[:1]
  339. grouped = df.groupby('c')
  340. result = grouped.apply(f)
  341. assert result['d'].dtype == np.float64
  342. # this is by definition a mutating operation!
  343. with option_context('mode.chained_assignment', None):
  344. for key, group in grouped:
  345. res = f(group)
  346. assert_frame_equal(res, result.loc[key])
  347. def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
  348. """
  349. Check a group transform that executes a cumulative function.
  350. Parameters
  351. ----------
  352. pd_op : callable
  353. The pandas cumulative function.
  354. np_op : callable
  355. The analogous one in NumPy.
  356. dtype : type
  357. The specified dtype of the data.
  358. """
  359. is_datetimelike = False
  360. data = np.array([[1], [2], [3], [4]], dtype=dtype)
  361. ans = np.zeros_like(data)
  362. labels = np.array([0, 0, 0, 0], dtype=np.int64)
  363. pd_op(ans, data, labels, is_datetimelike)
  364. tm.assert_numpy_array_equal(np_op(data), ans[:, 0],
  365. check_dtype=False)
  366. def test_cython_group_transform_cumsum(any_real_dtype):
  367. # see gh-4095
  368. dtype = np.dtype(any_real_dtype).type
  369. pd_op, np_op = groupby.group_cumsum, np.cumsum
  370. _check_cython_group_transform_cumulative(pd_op, np_op, dtype)
  371. def test_cython_group_transform_cumprod():
  372. # see gh-4095
  373. dtype = np.float64
  374. pd_op, np_op = groupby.group_cumprod_float64, np.cumproduct
  375. _check_cython_group_transform_cumulative(pd_op, np_op, dtype)
  376. def test_cython_group_transform_algos():
  377. # see gh-4095
  378. is_datetimelike = False
  379. # with nans
  380. labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
  381. data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
  382. actual = np.zeros_like(data)
  383. actual.fill(np.nan)
  384. groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
  385. expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
  386. tm.assert_numpy_array_equal(actual[:, 0], expected)
  387. actual = np.zeros_like(data)
  388. actual.fill(np.nan)
  389. groupby.group_cumsum(actual, data, labels, is_datetimelike)
  390. expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
  391. tm.assert_numpy_array_equal(actual[:, 0], expected)
  392. # timedelta
  393. is_datetimelike = True
  394. data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
  395. actual = np.zeros_like(data, dtype='int64')
  396. groupby.group_cumsum(actual, data.view('int64'), labels,
  397. is_datetimelike)
  398. expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
  399. 2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
  400. np.timedelta64(5, 'ns')])
  401. tm.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected)
  402. @pytest.mark.parametrize(
  403. "op, args, targop",
  404. [('cumprod', (), lambda x: x.cumprod()),
  405. ('cumsum', (), lambda x: x.cumsum()),
  406. ('shift', (-1, ), lambda x: x.shift(-1)),
  407. ('shift', (1, ), lambda x: x.shift())])
  408. def test_cython_transform_series(op, args, targop):
  409. # GH 4095
  410. s = Series(np.random.randn(1000))
  411. s_missing = s.copy()
  412. s_missing.iloc[2:10] = np.nan
  413. labels = np.random.randint(0, 50, size=1000).astype(float)
  414. # series
  415. for data in [s, s_missing]:
  416. # print(data.head())
  417. expected = data.groupby(labels).transform(targop)
  418. tm.assert_series_equal(
  419. expected,
  420. data.groupby(labels).transform(op, *args))
  421. tm.assert_series_equal(expected, getattr(
  422. data.groupby(labels), op)(*args))
  423. @pytest.mark.parametrize("op", ['cumprod', 'cumsum'])
  424. @pytest.mark.parametrize("skipna", [False, True])
  425. @pytest.mark.parametrize('input, exp', [
  426. # When everything is NaN
  427. ({'key': ['b'] * 10, 'value': np.nan},
  428. pd.Series([np.nan] * 10, name='value')),
  429. # When there is a single NaN
  430. ({'key': ['b'] * 10 + ['a'] * 2,
  431. 'value': [3] * 3 + [np.nan] + [3] * 8},
  432. {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0],
  433. ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729.,
  434. 2187., 6561., 19683., 3.0, 9.0],
  435. ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0],
  436. ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18.,
  437. 21., 24., 27., 3.0, 6.0]})])
  438. def test_groupby_cum_skipna(op, skipna, input, exp):
  439. df = pd.DataFrame(input)
  440. result = df.groupby('key')['value'].transform(op, skipna=skipna)
  441. if isinstance(exp, dict):
  442. expected = exp[(op, skipna)]
  443. else:
  444. expected = exp
  445. expected = pd.Series(expected, name='value')
  446. tm.assert_series_equal(expected, result)
@pytest.mark.parametrize(
    "op, args, targop",
    [('cumprod', (), lambda x: x.cumprod()),
     ('cumsum', (), lambda x: x.cumsum()),
     ('shift', (-1, ), lambda x: x.shift(-1)),
     ('shift', (1, ), lambda x: x.shift())])
def test_cython_transform_frame(op, args, targop):
    """
    Compare named DataFrame group transforms with ``apply(targop)``.

    For a frame with a default index and a copy with a two-level
    MultiIndex, and for several grouping targets, the result of
    ``gb.transform(op, *args)`` and of the groupby method named ``op`` is
    compared with applying ``targop``, first on the frame and then on
    each column separately.

    Parameters
    ----------
    op : str
        Name of the groupby operation.
    args : tuple
        Positional arguments passed along with ``op``.
    targop : callable
        Callable used with ``apply`` to build the expected result.
    """
    s = Series(np.random.randn(1000))
    s_missing = s.copy()
    s_missing.iloc[2:10] = np.nan
    labels = np.random.randint(0, 50, size=1000).astype(float)
    strings = list('qwertyuiopasdfghjklz')
    strings_missing = strings[:]
    strings_missing[5] = np.nan
    df = DataFrame({'float': s,
                    'float_missing': s_missing,
                    'int': [1, 1, 1, 1, 2] * 200,
                    'datetime': pd.date_range('1990-1-1', periods=1000),
                    'timedelta': pd.timedelta_range(1, freq='s',
                                                    periods=1000),
                    'string': strings * 50,
                    'string_missing': strings_missing * 50},
                   columns=['float', 'float_missing', 'int', 'datetime',
                            'timedelta', 'string', 'string_missing'])
    df['cat'] = df['string'].astype('category')

    # same data under a two-level MultiIndex (100 x 10 = 1000 rows)
    df2 = df.copy()
    df2.index = pd.MultiIndex.from_product([range(100), range(10)])

    # DataFrame - Single and MultiIndex,
    # group by values, index level, columns
    # NOTE: the loop variable rebinds ``df``; the list literal is built
    # before the loop starts, so both frames are still visited.
    for df in [df, df2]:
        for gb_target in [dict(by=labels), dict(level=0), dict(by='string')
                          ]:  # dict(by='string_missing')]:
            # dict(by=['int','string'])]:

            gb = df.groupby(**gb_target)
            # whitelisted methods set the selection before applying
            # a bit of a hack to make sure the cythonized shift
            # is equivalent to pre 0.17.1 behavior
            if op == 'shift':
                gb._set_group_selection()

            # NOTE(review): gb_target's keys are 'by' or 'level', so
            # ``'int' not in gb_target`` holds for every target listed
            # above; confirm whether a different check was intended.
            if op != 'shift' and 'int' not in gb_target:
                # numeric apply fastpath promotes dtype so have
                # to apply separately and concat
                i = gb[['int']].apply(targop)
                f = gb[['float', 'float_missing']].apply(targop)
                expected = pd.concat([f, i], axis=1)
            else:
                expected = gb.apply(targop)

            # compare with columns in sorted order on both sides
            expected = expected.sort_index(axis=1)
            tm.assert_frame_equal(expected,
                                  gb.transform(op, *args).sort_index(
                                      axis=1))
            tm.assert_frame_equal(
                expected,
                getattr(gb, op)(*args).sort_index(axis=1))
            # individual columns
            for c in df:
                if c not in ['float', 'int', 'float_missing'
                             ] and op != 'shift':
                    # for the cumulative ops, every other column is
                    # expected to raise DataError
                    msg = "No numeric types to aggregate"
                    with pytest.raises(DataError, match=msg):
                        gb[c].transform(op)
                    with pytest.raises(DataError, match=msg):
                        getattr(gb[c], op)()
                else:
                    expected = gb[c].apply(targop)
                    expected.name = c
                    tm.assert_series_equal(expected,
                                           gb[c].transform(op, *args))
                    tm.assert_series_equal(expected,
                                           getattr(gb[c], op)(*args))
  517. def test_transform_with_non_scalar_group():
  518. # GH 10165
  519. cols = pd.MultiIndex.from_tuples([
  520. ('syn', 'A'), ('mis', 'A'), ('non', 'A'),
  521. ('syn', 'C'), ('mis', 'C'), ('non', 'C'),
  522. ('syn', 'T'), ('mis', 'T'), ('non', 'T'),
  523. ('syn', 'G'), ('mis', 'G'), ('non', 'G')])
  524. df = pd.DataFrame(np.random.randint(1, 10, (4, 12)),
  525. columns=cols,
  526. index=['A', 'C', 'G', 'T'])
  527. msg = 'transform must return a scalar value for each group.*'
  528. with pytest.raises(ValueError, match=msg):
  529. df.groupby(axis=1, level=1).transform(
  530. lambda z: z.div(z.sum(axis=1), axis=0))
  531. @pytest.mark.parametrize('cols,exp,comp_func', [
  532. ('a', pd.Series([1, 1, 1], name='a'), tm.assert_series_equal),
  533. (['a', 'c'], pd.DataFrame({'a': [1, 1, 1], 'c': [1, 1, 1]}),
  534. tm.assert_frame_equal)
  535. ])
  536. @pytest.mark.parametrize('agg_func', [
  537. 'count', 'rank', 'size'])
  538. def test_transform_numeric_ret(cols, exp, comp_func, agg_func):
  539. if agg_func == 'size' and isinstance(cols, list):
  540. pytest.xfail("'size' transformation not supported with "
  541. "NDFrameGroupy")
  542. # GH 19200
  543. df = pd.DataFrame(
  544. {'a': pd.date_range('2018-01-01', periods=3),
  545. 'b': range(3),
  546. 'c': range(7, 10)})
  547. result = df.groupby('b')[cols].transform(agg_func)
  548. if agg_func == 'rank':
  549. exp = exp.astype('float')
  550. comp_func(result, exp)
  551. @pytest.mark.parametrize("mix_groupings", [True, False])
  552. @pytest.mark.parametrize("as_series", [True, False])
  553. @pytest.mark.parametrize("val1,val2", [
  554. ('foo', 'bar'), (1, 2), (1., 2.)])
  555. @pytest.mark.parametrize("fill_method,limit,exp_vals", [
  556. ("ffill", None,
  557. [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']),
  558. ("ffill", 1,
  559. [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]),
  560. ("bfill", None,
  561. ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]),
  562. ("bfill", 1,
  563. [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan])
  564. ])
  565. def test_group_fill_methods(mix_groupings, as_series, val1, val2,
  566. fill_method, limit, exp_vals):
  567. vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan]
  568. _exp_vals = list(exp_vals)
  569. # Overwrite placeholder values
  570. for index, exp_val in enumerate(_exp_vals):
  571. if exp_val == 'val1':
  572. _exp_vals[index] = val1
  573. elif exp_val == 'val2':
  574. _exp_vals[index] = val2
  575. # Need to modify values and expectations depending on the
  576. # Series / DataFrame that we ultimately want to generate
  577. if mix_groupings: # ['a', 'b', 'a, 'b', ...]
  578. keys = ['a', 'b'] * len(vals)
  579. def interweave(list_obj):
  580. temp = list()
  581. for x in list_obj:
  582. temp.extend([x, x])
  583. return temp
  584. _exp_vals = interweave(_exp_vals)
  585. vals = interweave(vals)
  586. else: # ['a', 'a', 'a', ... 'b', 'b', 'b']
  587. keys = ['a'] * len(vals) + ['b'] * len(vals)
  588. _exp_vals = _exp_vals * 2
  589. vals = vals * 2
  590. df = DataFrame({'key': keys, 'val': vals})
  591. if as_series:
  592. result = getattr(
  593. df.groupby('key')['val'], fill_method)(limit=limit)
  594. exp = Series(_exp_vals, name='val')
  595. assert_series_equal(result, exp)
  596. else:
  597. result = getattr(df.groupby('key'), fill_method)(limit=limit)
  598. exp = DataFrame({'key': keys, 'val': _exp_vals})
  599. assert_frame_equal(result, exp)
  600. @pytest.mark.parametrize("fill_method", ['ffill', 'bfill'])
  601. def test_pad_stable_sorting(fill_method):
  602. # GH 21207
  603. x = [0] * 20
  604. y = [np.nan] * 10 + [1] * 10
  605. if fill_method == 'bfill':
  606. y = y[::-1]
  607. df = pd.DataFrame({'x': x, 'y': y})
  608. expected = df.copy()
  609. result = getattr(df.groupby('x'), fill_method)()
  610. tm.assert_frame_equal(result, expected)
  611. @pytest.mark.parametrize("test_series", [True, False])
  612. @pytest.mark.parametrize("freq", [
  613. None,
  614. pytest.param('D', marks=pytest.mark.xfail(
  615. reason='GH#23918 before method uses freq in vectorized approach'))])
  616. @pytest.mark.parametrize("periods,fill_method,limit", [
  617. (1, 'ffill', None), (1, 'ffill', 1),
  618. (1, 'bfill', None), (1, 'bfill', 1),
  619. (-1, 'ffill', None), (-1, 'ffill', 1),
  620. (-1, 'bfill', None), (-1, 'bfill', 1),
  621. ])
  622. def test_pct_change(test_series, freq, periods, fill_method, limit):
  623. # GH 21200, 21621
  624. vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4]
  625. keys = ['a', 'b']
  626. key_v = np.repeat(keys, len(vals))
  627. df = DataFrame({'key': key_v, 'vals': vals * 2})
  628. df_g = getattr(df.groupby('key'), fill_method)(limit=limit)
  629. grp = df_g.groupby('key')
  630. expected = grp['vals'].obj / grp['vals'].shift(periods) - 1
  631. if test_series:
  632. result = df.groupby('key')['vals'].pct_change(
  633. periods=periods, fill_method=fill_method, limit=limit, freq=freq)
  634. tm.assert_series_equal(result, expected)
  635. else:
  636. result = df.groupby('key').pct_change(
  637. periods=periods, fill_method=fill_method, limit=limit, freq=freq)
  638. tm.assert_frame_equal(result, expected.to_frame('vals'))
  639. @pytest.mark.parametrize("func", [np.any, np.all])
  640. def test_any_all_np_func(func):
  641. # GH 20653
  642. df = pd.DataFrame([['foo', True],
  643. [np.nan, True],
  644. ['foo', True]], columns=['key', 'val'])
  645. exp = pd.Series([True, np.nan, True], name='val')
  646. res = df.groupby('key')['val'].transform(func)
  647. tm.assert_series_equal(res, exp)
  648. def test_groupby_transform_rename():
  649. # https://github.com/pandas-dev/pandas/issues/23461
  650. def demean_rename(x):
  651. result = x - x.mean()
  652. if isinstance(x, pd.Series):
  653. return result
  654. result = result.rename(
  655. columns={c: '{}_demeaned'.format(c) for c in result.columns})
  656. return result
  657. df = pd.DataFrame({'group': list('ababa'),
  658. 'value': [1, 1, 1, 2, 2]})
  659. expected = pd.DataFrame({'value': [-1. / 3, -0.5, -1. / 3, 0.5, 2. / 3]})
  660. result = df.groupby('group').transform(demean_rename)
  661. tm.assert_frame_equal(result, expected)
  662. result_single = df.groupby('group').value.transform(demean_rename)
  663. tm.assert_series_equal(result_single, expected['value'])
  664. @pytest.mark.parametrize('func', [min, max, np.min, np.max, 'first', 'last'])
  665. def test_groupby_transform_timezone_column(func):
  666. # GH 24198
  667. ts = pd.to_datetime('now', utc=True).tz_convert('Asia/Singapore')
  668. result = pd.DataFrame({'end_time': [ts], 'id': [1]})
  669. result['max_end_time'] = result.groupby('id').end_time.transform(func)
  670. expected = pd.DataFrame([[ts, 1, ts]], columns=['end_time', 'id',
  671. 'max_end_time'])
  672. tm.assert_frame_equal(result, expected)