test_rank.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import DataFrame, Series, concat
  5. from pandas.util import testing as tm
  6. def test_rank_apply():
  7. lev1 = tm.rands_array(10, 100)
  8. lev2 = tm.rands_array(10, 130)
  9. lab1 = np.random.randint(0, 100, size=500)
  10. lab2 = np.random.randint(0, 130, size=500)
  11. df = DataFrame({'value': np.random.randn(500),
  12. 'key1': lev1.take(lab1),
  13. 'key2': lev2.take(lab2)})
  14. result = df.groupby(['key1', 'key2']).value.rank()
  15. expected = [piece.value.rank()
  16. for key, piece in df.groupby(['key1', 'key2'])]
  17. expected = concat(expected, axis=0)
  18. expected = expected.reindex(result.index)
  19. tm.assert_series_equal(result, expected)
  20. result = df.groupby(['key1', 'key2']).value.rank(pct=True)
  21. expected = [piece.value.rank(pct=True)
  22. for key, piece in df.groupby(['key1', 'key2'])]
  23. expected = concat(expected, axis=0)
  24. expected = expected.reindex(result.index)
  25. tm.assert_series_equal(result, expected)
  26. @pytest.mark.parametrize("grps", [
  27. ['qux'], ['qux', 'quux']])
  28. @pytest.mark.parametrize("vals", [
  29. [2, 2, 8, 2, 6],
  30. [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'),
  31. pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
  32. pd.Timestamp('2018-01-06')]])
  33. @pytest.mark.parametrize("ties_method,ascending,pct,exp", [
  34. ('average', True, False, [2., 2., 5., 2., 4.]),
  35. ('average', True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
  36. ('average', False, False, [4., 4., 1., 4., 2.]),
  37. ('average', False, True, [.8, .8, .2, .8, .4]),
  38. ('min', True, False, [1., 1., 5., 1., 4.]),
  39. ('min', True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
  40. ('min', False, False, [3., 3., 1., 3., 2.]),
  41. ('min', False, True, [.6, .6, .2, .6, .4]),
  42. ('max', True, False, [3., 3., 5., 3., 4.]),
  43. ('max', True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
  44. ('max', False, False, [5., 5., 1., 5., 2.]),
  45. ('max', False, True, [1., 1., .2, 1., .4]),
  46. ('first', True, False, [1., 2., 5., 3., 4.]),
  47. ('first', True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
  48. ('first', False, False, [3., 4., 1., 5., 2.]),
  49. ('first', False, True, [.6, .8, .2, 1., .4]),
  50. ('dense', True, False, [1., 1., 3., 1., 2.]),
  51. ('dense', True, True, [1. / 3., 1. / 3., 3. / 3., 1. / 3., 2. / 3.]),
  52. ('dense', False, False, [3., 3., 1., 3., 2.]),
  53. ('dense', False, True, [3. / 3., 3. / 3., 1. / 3., 3. / 3., 2. / 3.]),
  54. ])
  55. def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
  56. key = np.repeat(grps, len(vals))
  57. vals = vals * len(grps)
  58. df = DataFrame({'key': key, 'val': vals})
  59. result = df.groupby('key').rank(method=ties_method,
  60. ascending=ascending, pct=pct)
  61. exp_df = DataFrame(exp * len(grps), columns=['val'])
  62. tm.assert_frame_equal(result, exp_df)
  63. @pytest.mark.parametrize("grps", [
  64. ['qux'], ['qux', 'quux']])
  65. @pytest.mark.parametrize("vals", [
  66. [-np.inf, -np.inf, np.nan, 1., np.nan, np.inf, np.inf],
  67. ])
  68. @pytest.mark.parametrize("ties_method,ascending,na_option,exp", [
  69. ('average', True, 'keep', [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
  70. ('average', True, 'top', [3.5, 3.5, 1.5, 5., 1.5, 6.5, 6.5]),
  71. ('average', True, 'bottom', [1.5, 1.5, 6.5, 3., 6.5, 4.5, 4.5]),
  72. ('average', False, 'keep', [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
  73. ('average', False, 'top', [6.5, 6.5, 1.5, 5., 1.5, 3.5, 3.5]),
  74. ('average', False, 'bottom', [4.5, 4.5, 6.5, 3., 6.5, 1.5, 1.5]),
  75. ('min', True, 'keep', [1., 1., np.nan, 3., np.nan, 4., 4.]),
  76. ('min', True, 'top', [3., 3., 1., 5., 1., 6., 6.]),
  77. ('min', True, 'bottom', [1., 1., 6., 3., 6., 4., 4.]),
  78. ('min', False, 'keep', [4., 4., np.nan, 3., np.nan, 1., 1.]),
  79. ('min', False, 'top', [6., 6., 1., 5., 1., 3., 3.]),
  80. ('min', False, 'bottom', [4., 4., 6., 3., 6., 1., 1.]),
  81. ('max', True, 'keep', [2., 2., np.nan, 3., np.nan, 5., 5.]),
  82. ('max', True, 'top', [4., 4., 2., 5., 2., 7., 7.]),
  83. ('max', True, 'bottom', [2., 2., 7., 3., 7., 5., 5.]),
  84. ('max', False, 'keep', [5., 5., np.nan, 3., np.nan, 2., 2.]),
  85. ('max', False, 'top', [7., 7., 2., 5., 2., 4., 4.]),
  86. ('max', False, 'bottom', [5., 5., 7., 3., 7., 2., 2.]),
  87. ('first', True, 'keep', [1., 2., np.nan, 3., np.nan, 4., 5.]),
  88. ('first', True, 'top', [3., 4., 1., 5., 2., 6., 7.]),
  89. ('first', True, 'bottom', [1., 2., 6., 3., 7., 4., 5.]),
  90. ('first', False, 'keep', [4., 5., np.nan, 3., np.nan, 1., 2.]),
  91. ('first', False, 'top', [6., 7., 1., 5., 2., 3., 4.]),
  92. ('first', False, 'bottom', [4., 5., 6., 3., 7., 1., 2.]),
  93. ('dense', True, 'keep', [1., 1., np.nan, 2., np.nan, 3., 3.]),
  94. ('dense', True, 'top', [2., 2., 1., 3., 1., 4., 4.]),
  95. ('dense', True, 'bottom', [1., 1., 4., 2., 4., 3., 3.]),
  96. ('dense', False, 'keep', [3., 3., np.nan, 2., np.nan, 1., 1.]),
  97. ('dense', False, 'top', [4., 4., 1., 3., 1., 2., 2.]),
  98. ('dense', False, 'bottom', [3., 3., 4., 2., 4., 1., 1.])
  99. ])
  100. def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
  101. # GH 20561
  102. key = np.repeat(grps, len(vals))
  103. vals = vals * len(grps)
  104. df = DataFrame({'key': key, 'val': vals})
  105. result = df.groupby('key').rank(method=ties_method,
  106. ascending=ascending,
  107. na_option=na_option)
  108. exp_df = DataFrame(exp * len(grps), columns=['val'])
  109. tm.assert_frame_equal(result, exp_df)
  110. @pytest.mark.parametrize("grps", [
  111. ['qux'], ['qux', 'quux']])
  112. @pytest.mark.parametrize("vals", [
  113. [2, 2, np.nan, 8, 2, 6, np.nan, np.nan],
  114. [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan,
  115. pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
  116. pd.Timestamp('2018-01-06'), np.nan, np.nan]
  117. ])
  118. @pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [
  119. ('average', True, 'keep', False,
  120. [2., 2., np.nan, 5., 2., 4., np.nan, np.nan]),
  121. ('average', True, 'keep', True,
  122. [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]),
  123. ('average', False, 'keep', False,
  124. [4., 4., np.nan, 1., 4., 2., np.nan, np.nan]),
  125. ('average', False, 'keep', True,
  126. [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]),
  127. ('min', True, 'keep', False,
  128. [1., 1., np.nan, 5., 1., 4., np.nan, np.nan]),
  129. ('min', True, 'keep', True,
  130. [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
  131. ('min', False, 'keep', False,
  132. [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
  133. ('min', False, 'keep', True,
  134. [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
  135. ('max', True, 'keep', False,
  136. [3., 3., np.nan, 5., 3., 4., np.nan, np.nan]),
  137. ('max', True, 'keep', True,
  138. [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
  139. ('max', False, 'keep', False,
  140. [5., 5., np.nan, 1., 5., 2., np.nan, np.nan]),
  141. ('max', False, 'keep', True,
  142. [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan]),
  143. ('first', True, 'keep', False,
  144. [1., 2., np.nan, 5., 3., 4., np.nan, np.nan]),
  145. ('first', True, 'keep', True,
  146. [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
  147. ('first', False, 'keep', False,
  148. [3., 4., np.nan, 1., 5., 2., np.nan, np.nan]),
  149. ('first', False, 'keep', True,
  150. [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan]),
  151. ('dense', True, 'keep', False,
  152. [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]),
  153. ('dense', True, 'keep', True,
  154. [1. / 3., 1. / 3., np.nan, 3. / 3., 1. / 3., 2. / 3., np.nan, np.nan]),
  155. ('dense', False, 'keep', False,
  156. [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
  157. ('dense', False, 'keep', True,
  158. [3. / 3., 3. / 3., np.nan, 1. / 3., 3. / 3., 2. / 3., np.nan, np.nan]),
  159. ('average', True, 'bottom', False, [2., 2., 7., 5., 2., 4., 7., 7.]),
  160. ('average', True, 'bottom', True,
  161. [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]),
  162. ('average', False, 'bottom', False, [4., 4., 7., 1., 4., 2., 7., 7.]),
  163. ('average', False, 'bottom', True,
  164. [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]),
  165. ('min', True, 'bottom', False, [1., 1., 6., 5., 1., 4., 6., 6.]),
  166. ('min', True, 'bottom', True,
  167. [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75]),
  168. ('min', False, 'bottom', False, [3., 3., 6., 1., 3., 2., 6., 6.]),
  169. ('min', False, 'bottom', True,
  170. [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]),
  171. ('max', True, 'bottom', False, [3., 3., 8., 5., 3., 4., 8., 8.]),
  172. ('max', True, 'bottom', True,
  173. [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.]),
  174. ('max', False, 'bottom', False, [5., 5., 8., 1., 5., 2., 8., 8.]),
  175. ('max', False, 'bottom', True,
  176. [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.]),
  177. ('first', True, 'bottom', False, [1., 2., 6., 5., 3., 4., 7., 8.]),
  178. ('first', True, 'bottom', True,
  179. [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.]),
  180. ('first', False, 'bottom', False, [3., 4., 6., 1., 5., 2., 7., 8.]),
  181. ('first', False, 'bottom', True,
  182. [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]),
  183. ('dense', True, 'bottom', False, [1., 1., 4., 3., 1., 2., 4., 4.]),
  184. ('dense', True, 'bottom', True,
  185. [0.25, 0.25, 1., 0.75, 0.25, 0.5, 1., 1.]),
  186. ('dense', False, 'bottom', False, [3., 3., 4., 1., 3., 2., 4., 4.]),
  187. ('dense', False, 'bottom', True,
  188. [0.75, 0.75, 1., 0.25, 0.75, 0.5, 1., 1.])
  189. ])
  190. def test_rank_args_missing(grps, vals, ties_method, ascending,
  191. na_option, pct, exp):
  192. key = np.repeat(grps, len(vals))
  193. vals = vals * len(grps)
  194. df = DataFrame({'key': key, 'val': vals})
  195. result = df.groupby('key').rank(method=ties_method,
  196. ascending=ascending,
  197. na_option=na_option, pct=pct)
  198. exp_df = DataFrame(exp * len(grps), columns=['val'])
  199. tm.assert_frame_equal(result, exp_df)
  200. @pytest.mark.parametrize("pct,exp", [
  201. (False, [3., 3., 3., 3., 3.]),
  202. (True, [.6, .6, .6, .6, .6])])
  203. def test_rank_resets_each_group(pct, exp):
  204. df = DataFrame(
  205. {'key': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'],
  206. 'val': [1] * 10}
  207. )
  208. result = df.groupby('key').rank(pct=pct)
  209. exp_df = DataFrame(exp * 2, columns=['val'])
  210. tm.assert_frame_equal(result, exp_df)
  211. def test_rank_avg_even_vals():
  212. df = DataFrame({'key': ['a'] * 4, 'val': [1] * 4})
  213. result = df.groupby('key').rank()
  214. exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=['val'])
  215. tm.assert_frame_equal(result, exp_df)
  216. @pytest.mark.parametrize("ties_method", [
  217. 'average', 'min', 'max', 'first', 'dense'])
  218. @pytest.mark.parametrize("ascending", [True, False])
  219. @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
  220. @pytest.mark.parametrize("pct", [True, False])
  221. @pytest.mark.parametrize("vals", [
  222. ['bar', 'bar', 'foo', 'bar', 'baz'],
  223. ['bar', np.nan, 'foo', np.nan, 'baz']
  224. ])
  225. def test_rank_object_raises(ties_method, ascending, na_option,
  226. pct, vals):
  227. df = DataFrame({'key': ['foo'] * 5, 'val': vals})
  228. with pytest.raises(TypeError, match="not callable"):
  229. df.groupby('key').rank(method=ties_method,
  230. ascending=ascending,
  231. na_option=na_option, pct=pct)
  232. @pytest.mark.parametrize("na_option", [True, "bad", 1])
  233. @pytest.mark.parametrize("ties_method", [
  234. 'average', 'min', 'max', 'first', 'dense'])
  235. @pytest.mark.parametrize("ascending", [True, False])
  236. @pytest.mark.parametrize("pct", [True, False])
  237. @pytest.mark.parametrize("vals", [
  238. ['bar', 'bar', 'foo', 'bar', 'baz'],
  239. ['bar', np.nan, 'foo', np.nan, 'baz'],
  240. [1, np.nan, 2, np.nan, 3]
  241. ])
  242. def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
  243. df = DataFrame({'key': ['foo'] * 5, 'val': vals})
  244. msg = "na_option must be one of 'keep', 'top', or 'bottom'"
  245. with pytest.raises(ValueError, match=msg):
  246. df.groupby('key').rank(method=ties_method,
  247. ascending=ascending,
  248. na_option=na_option, pct=pct)
  249. def test_rank_empty_group():
  250. # see gh-22519
  251. column = "A"
  252. df = DataFrame({
  253. "A": [0, 1, 0],
  254. "B": [1., np.nan, 2.]
  255. })
  256. result = df.groupby(column).B.rank(pct=True)
  257. expected = Series([0.5, np.nan, 1.0], name="B")
  258. tm.assert_series_equal(result, expected)
  259. result = df.groupby(column).rank(pct=True)
  260. expected = DataFrame({"B": [0.5, np.nan, 1.0]})
  261. tm.assert_frame_equal(result, expected)
  262. @pytest.mark.parametrize("input_key,input_value,output_value", [
  263. ([1, 2], [1, 1], [1.0, 1.0]),
  264. ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
  265. ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
  266. ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan])
  267. ])
  268. def test_rank_zero_div(input_key, input_value, output_value):
  269. # GH 23666
  270. df = DataFrame({"A": input_key, "B": input_value})
  271. result = df.groupby("A").rank(method="dense", pct=True)
  272. expected = DataFrame({"B": output_value})
  273. tm.assert_frame_equal(result, expected)