test_cython.py 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. # -*- coding: utf-8 -*-
  2. """
  3. test cython .agg behavior
  4. """
  5. from __future__ import print_function
  6. import numpy as np
  7. import pytest
  8. import pandas as pd
  9. from pandas import (
  10. DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range)
  11. from pandas.core.groupby.groupby import DataError
  12. import pandas.util.testing as tm
  13. @pytest.mark.parametrize('op_name', [
  14. 'count',
  15. 'sum',
  16. 'std',
  17. 'var',
  18. 'sem',
  19. 'mean',
  20. pytest.param('median',
  21. # ignore mean of empty slice
  22. # and all-NaN
  23. marks=[pytest.mark.filterwarnings(
  24. "ignore::RuntimeWarning"
  25. )]),
  26. 'prod',
  27. 'min',
  28. 'max',
  29. ])
  30. def test_cythonized_aggers(op_name):
  31. data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., np.nan, np.nan],
  32. 'B': ['A', 'B'] * 6,
  33. 'C': np.random.randn(12)}
  34. df = DataFrame(data)
  35. df.loc[2:10:2, 'C'] = np.nan
  36. op = lambda x: getattr(x, op_name)()
  37. # single column
  38. grouped = df.drop(['B'], axis=1).groupby('A')
  39. exp = {cat: op(group['C']) for cat, group in grouped}
  40. exp = DataFrame({'C': exp})
  41. exp.index.name = 'A'
  42. result = op(grouped)
  43. tm.assert_frame_equal(result, exp)
  44. # multiple columns
  45. grouped = df.groupby(['A', 'B'])
  46. expd = {}
  47. for (cat1, cat2), group in grouped:
  48. expd.setdefault(cat1, {})[cat2] = op(group['C'])
  49. exp = DataFrame(expd).T.stack(dropna=False)
  50. exp.index.names = ['A', 'B']
  51. exp.name = 'C'
  52. result = op(grouped)['C']
  53. if op_name in ['sum', 'prod']:
  54. tm.assert_series_equal(result, exp)
  55. def test_cython_agg_boolean():
  56. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  57. 'b': np.random.randint(0, 2, 50).astype('bool')})
  58. result = frame.groupby('a')['b'].mean()
  59. expected = frame.groupby('a')['b'].agg(np.mean)
  60. tm.assert_series_equal(result, expected)
  61. def test_cython_agg_nothing_to_agg():
  62. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  63. 'b': ['foo', 'bar'] * 25})
  64. msg = "No numeric types to aggregate"
  65. with pytest.raises(DataError, match=msg):
  66. frame.groupby('a')['b'].mean()
  67. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  68. 'b': ['foo', 'bar'] * 25})
  69. with pytest.raises(DataError, match=msg):
  70. frame[['b']].groupby(frame['a']).mean()
  71. def test_cython_agg_nothing_to_agg_with_dates():
  72. frame = DataFrame({'a': np.random.randint(0, 5, 50),
  73. 'b': ['foo', 'bar'] * 25,
  74. 'dates': pd.date_range('now', periods=50, freq='T')})
  75. msg = "No numeric types to aggregate"
  76. with pytest.raises(DataError, match=msg):
  77. frame.groupby('b').dates.mean()
  78. def test_cython_agg_frame_columns():
  79. # #2113
  80. df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})
  81. df.groupby(level=0, axis='columns').mean()
  82. df.groupby(level=0, axis='columns').mean()
  83. df.groupby(level=0, axis='columns').mean()
  84. df.groupby(level=0, axis='columns').mean()
  85. def test_cython_agg_return_dict():
  86. # GH 16741
  87. df = DataFrame(
  88. {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
  89. 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
  90. 'C': np.random.randn(8),
  91. 'D': np.random.randn(8)})
  92. ts = df.groupby('A')['B'].agg(lambda x: x.value_counts().to_dict())
  93. expected = Series([{'two': 1, 'one': 1, 'three': 1},
  94. {'two': 2, 'one': 2, 'three': 1}],
  95. index=Index(['bar', 'foo'], name='A'),
  96. name='B')
  97. tm.assert_series_equal(ts, expected)
  98. def test_cython_fail_agg():
  99. dr = bdate_range('1/1/2000', periods=50)
  100. ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr)
  101. grouped = ts.groupby(lambda x: x.month)
  102. summed = grouped.sum()
  103. expected = grouped.agg(np.sum)
  104. tm.assert_series_equal(summed, expected)
  105. @pytest.mark.parametrize('op, targop', [
  106. ('mean', np.mean),
  107. ('median', np.median),
  108. ('var', np.var),
  109. ('add', np.sum),
  110. ('prod', np.prod),
  111. ('min', np.min),
  112. ('max', np.max),
  113. ('first', lambda x: x.iloc[0]),
  114. ('last', lambda x: x.iloc[-1]),
  115. ])
  116. def test__cython_agg_general(op, targop):
  117. df = DataFrame(np.random.randn(1000))
  118. labels = np.random.randint(0, 50, size=1000).astype(float)
  119. result = df.groupby(labels)._cython_agg_general(op)
  120. expected = df.groupby(labels).agg(targop)
  121. tm.assert_frame_equal(result, expected)
  122. @pytest.mark.parametrize('op, targop', [
  123. ('mean', np.mean),
  124. ('median', lambda x: np.median(x) if len(x) > 0 else np.nan),
  125. ('var', lambda x: np.var(x, ddof=1)),
  126. ('min', np.min),
  127. ('max', np.max), ]
  128. )
  129. def test_cython_agg_empty_buckets(op, targop, observed):
  130. df = pd.DataFrame([11, 12, 13])
  131. grps = range(0, 55, 5)
  132. # calling _cython_agg_general directly, instead of via the user API
  133. # which sets different values for min_count, so do that here.
  134. g = df.groupby(pd.cut(df[0], grps), observed=observed)
  135. result = g._cython_agg_general(op)
  136. g = df.groupby(pd.cut(df[0], grps), observed=observed)
  137. expected = g.agg(lambda x: targop(x))
  138. tm.assert_frame_equal(result, expected)
  139. def test_cython_agg_empty_buckets_nanops(observed):
  140. # GH-18869 can't call nanops on empty groups, so hardcode expected
  141. # for these
  142. df = pd.DataFrame([11, 12, 13], columns=['a'])
  143. grps = range(0, 25, 5)
  144. # add / sum
  145. result = df.groupby(pd.cut(df['a'], grps),
  146. observed=observed)._cython_agg_general('add')
  147. intervals = pd.interval_range(0, 20, freq=5)
  148. expected = pd.DataFrame(
  149. {"a": [0, 0, 36, 0]},
  150. index=pd.CategoricalIndex(intervals, name='a', ordered=True))
  151. if observed:
  152. expected = expected[expected.a != 0]
  153. tm.assert_frame_equal(result, expected)
  154. # prod
  155. result = df.groupby(pd.cut(df['a'], grps),
  156. observed=observed)._cython_agg_general('prod')
  157. expected = pd.DataFrame(
  158. {"a": [1, 1, 1716, 1]},
  159. index=pd.CategoricalIndex(intervals, name='a', ordered=True))
  160. if observed:
  161. expected = expected[expected.a != 1]
  162. tm.assert_frame_equal(result, expected)
  163. @pytest.mark.parametrize('op', ['first', 'last', 'max', 'min'])
  164. @pytest.mark.parametrize('data', [
  165. Timestamp('2016-10-14 21:00:44.557'),
  166. Timedelta('17088 days 21:00:44.557'), ])
  167. def test_cython_with_timestamp_and_nat(op, data):
  168. # https://github.com/pandas-dev/pandas/issues/19526
  169. df = DataFrame({'a': [0, 1], 'b': [data, NaT]})
  170. index = Index([0, 1], name='a')
  171. # We will group by a and test the cython aggregations
  172. expected = DataFrame({'b': [data, NaT]}, index=index)
  173. result = df.groupby('a').aggregate(op)
  174. tm.assert_frame_equal(expected, result)