test_counting.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function
  3. import numpy as np
  4. import pytest
  5. from pandas.compat import product as cart_product, range
  6. from pandas import DataFrame, MultiIndex, Period, Series, Timedelta, Timestamp
  7. from pandas.util.testing import assert_frame_equal, assert_series_equal
  8. class TestCounting(object):
  9. def test_cumcount(self):
  10. df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
  11. g = df.groupby('A')
  12. sg = g.A
  13. expected = Series([0, 1, 2, 0, 3])
  14. assert_series_equal(expected, g.cumcount())
  15. assert_series_equal(expected, sg.cumcount())
  16. def test_cumcount_empty(self):
  17. ge = DataFrame().groupby(level=0)
  18. se = Series().groupby(level=0)
  19. # edge case, as this is usually considered float
  20. e = Series(dtype='int64')
  21. assert_series_equal(e, ge.cumcount())
  22. assert_series_equal(e, se.cumcount())
  23. def test_cumcount_dupe_index(self):
  24. df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
  25. index=[0] * 5)
  26. g = df.groupby('A')
  27. sg = g.A
  28. expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
  29. assert_series_equal(expected, g.cumcount())
  30. assert_series_equal(expected, sg.cumcount())
  31. def test_cumcount_mi(self):
  32. mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
  33. df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
  34. index=mi)
  35. g = df.groupby('A')
  36. sg = g.A
  37. expected = Series([0, 1, 2, 0, 3], index=mi)
  38. assert_series_equal(expected, g.cumcount())
  39. assert_series_equal(expected, sg.cumcount())
  40. def test_cumcount_groupby_not_col(self):
  41. df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
  42. index=[0] * 5)
  43. g = df.groupby([0, 0, 0, 1, 0])
  44. sg = g.A
  45. expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
  46. assert_series_equal(expected, g.cumcount())
  47. assert_series_equal(expected, sg.cumcount())
  48. def test_ngroup(self):
  49. df = DataFrame({'A': list('aaaba')})
  50. g = df.groupby('A')
  51. sg = g.A
  52. expected = Series([0, 0, 0, 1, 0])
  53. assert_series_equal(expected, g.ngroup())
  54. assert_series_equal(expected, sg.ngroup())
  55. def test_ngroup_distinct(self):
  56. df = DataFrame({'A': list('abcde')})
  57. g = df.groupby('A')
  58. sg = g.A
  59. expected = Series(range(5), dtype='int64')
  60. assert_series_equal(expected, g.ngroup())
  61. assert_series_equal(expected, sg.ngroup())
  62. def test_ngroup_one_group(self):
  63. df = DataFrame({'A': [0] * 5})
  64. g = df.groupby('A')
  65. sg = g.A
  66. expected = Series([0] * 5)
  67. assert_series_equal(expected, g.ngroup())
  68. assert_series_equal(expected, sg.ngroup())
  69. def test_ngroup_empty(self):
  70. ge = DataFrame().groupby(level=0)
  71. se = Series().groupby(level=0)
  72. # edge case, as this is usually considered float
  73. e = Series(dtype='int64')
  74. assert_series_equal(e, ge.ngroup())
  75. assert_series_equal(e, se.ngroup())
  76. def test_ngroup_series_matches_frame(self):
  77. df = DataFrame({'A': list('aaaba')})
  78. s = Series(list('aaaba'))
  79. assert_series_equal(df.groupby(s).ngroup(),
  80. s.groupby(s).ngroup())
  81. def test_ngroup_dupe_index(self):
  82. df = DataFrame({'A': list('aaaba')}, index=[0] * 5)
  83. g = df.groupby('A')
  84. sg = g.A
  85. expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
  86. assert_series_equal(expected, g.ngroup())
  87. assert_series_equal(expected, sg.ngroup())
  88. def test_ngroup_mi(self):
  89. mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
  90. df = DataFrame({'A': list('aaaba')}, index=mi)
  91. g = df.groupby('A')
  92. sg = g.A
  93. expected = Series([0, 0, 0, 1, 0], index=mi)
  94. assert_series_equal(expected, g.ngroup())
  95. assert_series_equal(expected, sg.ngroup())
  96. def test_ngroup_groupby_not_col(self):
  97. df = DataFrame({'A': list('aaaba')}, index=[0] * 5)
  98. g = df.groupby([0, 0, 0, 1, 0])
  99. sg = g.A
  100. expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
  101. assert_series_equal(expected, g.ngroup())
  102. assert_series_equal(expected, sg.ngroup())
  103. def test_ngroup_descending(self):
  104. df = DataFrame(['a', 'a', 'b', 'a', 'b'], columns=['A'])
  105. g = df.groupby(['A'])
  106. ascending = Series([0, 0, 1, 0, 1])
  107. descending = Series([1, 1, 0, 1, 0])
  108. assert_series_equal(descending, (g.ngroups - 1) - ascending)
  109. assert_series_equal(ascending, g.ngroup(ascending=True))
  110. assert_series_equal(descending, g.ngroup(ascending=False))
  111. def test_ngroup_matches_cumcount(self):
  112. # verify one manually-worked out case works
  113. df = DataFrame([['a', 'x'], ['a', 'y'], ['b', 'x'],
  114. ['a', 'x'], ['b', 'y']], columns=['A', 'X'])
  115. g = df.groupby(['A', 'X'])
  116. g_ngroup = g.ngroup()
  117. g_cumcount = g.cumcount()
  118. expected_ngroup = Series([0, 1, 2, 0, 3])
  119. expected_cumcount = Series([0, 0, 0, 1, 0])
  120. assert_series_equal(g_ngroup, expected_ngroup)
  121. assert_series_equal(g_cumcount, expected_cumcount)
  122. def test_ngroup_cumcount_pair(self):
  123. # brute force comparison for all small series
  124. for p in cart_product(range(3), repeat=4):
  125. df = DataFrame({'a': p})
  126. g = df.groupby(['a'])
  127. order = sorted(set(p))
  128. ngroupd = [order.index(val) for val in p]
  129. cumcounted = [p[:i].count(val) for i, val in enumerate(p)]
  130. assert_series_equal(g.ngroup(), Series(ngroupd))
  131. assert_series_equal(g.cumcount(), Series(cumcounted))
  132. def test_ngroup_respects_groupby_order(self):
  133. np.random.seed(0)
  134. df = DataFrame({'a': np.random.choice(list('abcdef'), 100)})
  135. for sort_flag in (False, True):
  136. g = df.groupby(['a'], sort=sort_flag)
  137. df['group_id'] = -1
  138. df['group_index'] = -1
  139. for i, (_, group) in enumerate(g):
  140. df.loc[group.index, 'group_id'] = i
  141. for j, ind in enumerate(group.index):
  142. df.loc[ind, 'group_index'] = j
  143. assert_series_equal(Series(df['group_id'].values),
  144. g.ngroup())
  145. assert_series_equal(Series(df['group_index'].values),
  146. g.cumcount())
  147. @pytest.mark.parametrize('datetimelike', [
  148. [Timestamp('2016-05-%02d 20:09:25+00:00' % i) for i in range(1, 4)],
  149. [Timestamp('2016-05-%02d 20:09:25' % i) for i in range(1, 4)],
  150. [Timedelta(x, unit="h") for x in range(1, 4)],
  151. [Period(freq="2W", year=2017, month=x) for x in range(1, 4)]])
  152. def test_count_with_datetimelike(self, datetimelike):
  153. # test for #13393, where DataframeGroupBy.count() fails
  154. # when counting a datetimelike column.
  155. df = DataFrame({'x': ['a', 'a', 'b'], 'y': datetimelike})
  156. res = df.groupby('x').count()
  157. expected = DataFrame({'y': [2, 1]}, index=['a', 'b'])
  158. expected.index.name = "x"
  159. assert_frame_equal(expected, res)
  160. def test_count_with_only_nans_in_first_group(self):
  161. # GH21956
  162. df = DataFrame({'A': [np.nan, np.nan], 'B': ['a', 'b'], 'C': [1, 2]})
  163. result = df.groupby(['A', 'B']).C.count()
  164. mi = MultiIndex(levels=[[], ['a', 'b']],
  165. codes=[[], []],
  166. names=['A', 'B'])
  167. expected = Series([], index=mi, dtype=np.int64, name='C')
  168. assert_series_equal(result, expected, check_index_type=False)