test_rank.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318
  1. # -*- coding: utf-8 -*-
  2. from datetime import datetime, timedelta
  3. from distutils.version import LooseVersion
  4. import numpy as np
  5. import pytest
  6. from pandas import DataFrame, Series
  7. from pandas.tests.frame.common import TestData
  8. import pandas.util.testing as tm
  9. from pandas.util.testing import assert_frame_equal
  class TestRank(TestData):
      """
      Tests for ``DataFrame.rank``.

      The class attributes are shared test data: ``s`` is a short series with
      tied values and two NaNs, ``df`` stores that series in two identical
      columns, and ``results`` maps each rank ``method`` name to the ranks
      expected for ``s`` (NaN inputs keep a NaN rank).
      """

      # Ten values containing ties and two missing entries.
      s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])

      # Columns 'A' and 'B' hold the same data, hence the same ranks.
      df = DataFrame({'A': s, 'B': s})

      # Expected ranks of ``s``, keyed by the ``method`` passed to ``rank``.
      results = {
          'average': np.array(
              [1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]),
          'min': np.array(
              [1.0, 5.0, 7.0, 3.0, np.nan, 3.0, 1.0, 8.0, np.nan, 5.0]),
          'max': np.array(
              [2.0, 6.0, 7.0, 4.0, np.nan, 4.0, 2.0, 8.0, np.nan, 6.0]),
          'first': np.array(
              [1.0, 5.0, 7.0, 3.0, np.nan, 4.0, 2.0, 8.0, np.nan, 6.0]),
          'dense': np.array(
              [1.0, 3.0, 4.0, 2.0, np.nan, 2.0, 1.0, 5.0, np.nan, 3.0]),
      }
  @pytest.fixture(params=['average', 'min', 'max', 'first', 'dense'])
  def method(self, request):
      """
      Parametrized fixture supplying every rank ``method`` name in turn.

      Any test that requests ``method`` runs once per entry in ``params``.
      """
      rank_method = request.param
      return rank_method
  def test_rank(self):
      """
      Compare ``DataFrame.rank`` with ``scipy.stats.rankdata`` on both axes.

      NaNs are planted in columns 'A'-'D' of ``self.frame``.  The reference
      ranks are computed on a copy in which NaN is replaced by ``+inf``, and
      the NaN positions are then masked back to NaN.  A random integer frame
      must also rank identically to its float counterpart.

      Skipped when SciPy cannot be imported.
      """
      # ``importorskip`` imports *modules*.  ``scipy.stats.rankdata`` is a
      # function, so requesting it by dotted name always fails and silently
      # skips the whole test; import the module and take the attribute.
      sp_stats = pytest.importorskip('scipy.stats')
      rankdata = sp_stats.rankdata

      # Single ``.loc`` assignments instead of chained ``frame[col][::n]``,
      # which depends on the intermediate Series being a view.
      self.frame.loc[::2, 'A'] = np.nan
      self.frame.loc[::3, 'B'] = np.nan
      self.frame.loc[::4, 'C'] = np.nan
      self.frame.loc[::5, 'D'] = np.nan

      ranks0 = self.frame.rank()
      ranks1 = self.frame.rank(axis=1)

      # Remember where the NaNs are, then let +inf stand in for them so
      # that rankdata sees only comparable values.
      mask = np.isnan(self.frame.values)
      fvals = self.frame.fillna(np.inf).values

      exp0 = np.apply_along_axis(rankdata, 0, fvals)
      exp0[mask] = np.nan

      exp1 = np.apply_along_axis(rankdata, 1, fvals)
      exp1[mask] = np.nan

      tm.assert_almost_equal(ranks0.values, exp0)
      tm.assert_almost_equal(ranks1.values, exp1)

      # integers: ranking must not depend on int vs. float storage
      df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))

      result = df.rank()
      exp = df.astype(float).rank()
      tm.assert_frame_equal(result, exp)

      result = df.rank(axis=1)
      exp = df.astype(float).rank(axis=1)
      tm.assert_frame_equal(result, exp)
  def test_rank2(self):
      """
      Exercise ``DataFrame.rank`` on small hand-written frames.

      Covers ``pct=True`` on integers, object (string) columns with and
      without NaN, datetime columns in both sort directions, the mixed-dtype
      frame ``self.mixed_frame``, and floats spanning many magnitudes.
      """
      # percentage ranks of integers: across columns, then down rows
      ints = DataFrame([[1, 3, 2], [1, 2, 3]])
      want = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
      tm.assert_frame_equal(ints.rank(axis=1, pct=True), want)

      want = ints.rank(axis=0) / 2.0
      tm.assert_frame_equal(ints.rank(axis=0, pct=True), want)

      # strings only rank when numeric_only is switched off
      words = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
      want = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
      tm.assert_frame_equal(words.rank(axis=1, numeric_only=False), want)

      want = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
      tm.assert_frame_equal(words.rank(axis=0, numeric_only=False), want)

      # strings with a missing value: the NaN cell keeps a NaN rank
      gappy = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']])
      want = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]])
      tm.assert_frame_equal(gappy.rank(axis=1, numeric_only=False), want)

      want = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]])
      tm.assert_frame_equal(gappy.rank(axis=0, numeric_only=False), want)

      # datetimes (the original author noted these needed an extensive
      # workaround to rank at all)
      stamps = DataFrame([
          [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
          [datetime(2000, 1, 2), datetime(2000, 1, 3),
           datetime(2000, 1, 1)],
      ])

      # row-wise ranks, oldest first
      want = DataFrame([[2., np.nan, 1.],
                        [2., 3., 1.]])
      got = stamps.rank(axis=1, numeric_only=False, ascending=True)
      tm.assert_frame_equal(got, want)

      # row-wise ranks, newest first
      want = DataFrame([[1., np.nan, 2.],
                        [2., 1., 3.]])
      got = stamps.rank(axis=1, numeric_only=False, ascending=False)
      tm.assert_frame_equal(got, want)

      # mixed-type frames: the default must match numeric_only=True
      self.mixed_frame['datetime'] = datetime.now()
      self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)

      default_ranks = self.mixed_frame.rank(axis=1)
      numeric_ranks = self.mixed_frame.rank(axis=1, numeric_only=True)
      tm.assert_frame_equal(default_ranks, numeric_ranks)

      # 1e-20 and 1e-20 + 1e-40 are the same float64, so they tie at 3.5
      wide_range = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10,
                                    1e60, 1e80, 1e-30]})
      want = DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]})
      tm.assert_frame_equal(wide_range.rank(), want)
  def test_rank_na_option(self):
      """
      Check the ``na_option`` argument of ``DataFrame.rank``.

      For 'bottom' and 'top', in ascending and descending order and along
      both axes, the result is compared with ``scipy.stats.rankdata`` run on
      a copy of ``self.frame`` whose NaNs were replaced by a value that sorts
      last or first.  Invalid ``na_option`` values must raise ``ValueError``.

      Skipped when SciPy cannot be imported.
      """
      # ``importorskip`` imports *modules*.  ``scipy.stats.rankdata`` is a
      # function, so requesting it by dotted name always fails and silently
      # skips the whole test; import the module and take the attribute.
      sp_stats = pytest.importorskip('scipy.stats')
      rankdata = sp_stats.rankdata

      # Single ``.loc`` assignments instead of chained ``frame[col][::n]``,
      # which depends on the intermediate Series being a view.
      self.frame.loc[::2, 'A'] = np.nan
      self.frame.loc[::3, 'B'] = np.nan
      self.frame.loc[::4, 'C'] = np.nan
      self.frame.loc[::5, 'D'] = np.nan

      # ascending, na_option='bottom': +inf stands in for NaN (ranks last)
      ranks0 = self.frame.rank(na_option='bottom')
      ranks1 = self.frame.rank(axis=1, na_option='bottom')

      fvals = self.frame.fillna(np.inf).values

      exp0 = np.apply_along_axis(rankdata, 0, fvals)
      exp1 = np.apply_along_axis(rankdata, 1, fvals)

      tm.assert_almost_equal(ranks0.values, exp0)
      tm.assert_almost_equal(ranks1.values, exp1)

      # ascending, na_option='top': (min - 1) stands in for NaN (ranks
      # first), taken per column for axis 0 and per row for axis 1
      ranks0 = self.frame.rank(na_option='top')
      ranks1 = self.frame.rank(axis=1, na_option='top')

      fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
      fval1 = self.frame.T
      fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
      # anything still NaN after the per-row fill is replaced by +inf
      fval1 = fval1.fillna(np.inf).values

      exp0 = np.apply_along_axis(rankdata, 0, fval0)
      exp1 = np.apply_along_axis(rankdata, 1, fval1)

      tm.assert_almost_equal(ranks0.values, exp0)
      tm.assert_almost_equal(ranks1.values, exp1)

      # descending, na_option='top': negating the +inf-filled values makes
      # the former NaNs the smallest entries, i.e. rank 1
      ranks0 = self.frame.rank(na_option='top', ascending=False)
      ranks1 = self.frame.rank(axis=1, na_option='top', ascending=False)

      fvals = self.frame.fillna(np.inf).values

      exp0 = np.apply_along_axis(rankdata, 0, -fvals)
      exp1 = np.apply_along_axis(rankdata, 1, -fvals)

      tm.assert_almost_equal(ranks0.values, exp0)
      tm.assert_almost_equal(ranks1.values, exp1)

      # descending, na_option='bottom': negating the (min - 1)-filled
      # values makes the former NaNs the largest entries, i.e. ranked last
      ranks0 = self.frame.rank(na_option='bottom', ascending=False)
      ranks1 = self.frame.rank(axis=1, na_option='bottom', ascending=False)

      fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
      fval1 = self.frame.T
      fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
      fval1 = fval1.fillna(np.inf).values

      exp0 = np.apply_along_axis(rankdata, 0, -fval0)
      exp1 = np.apply_along_axis(rankdata, 1, -fval1)

      tm.assert_numpy_array_equal(ranks0.values, exp0)
      tm.assert_numpy_array_equal(ranks1.values, exp1)

      # bad values throw error
      msg = "na_option must be one of 'keep', 'top', or 'bottom'"

      with pytest.raises(ValueError, match=msg):
          self.frame.rank(na_option='bad', ascending=False)

      # invalid type
      with pytest.raises(ValueError, match=msg):
          self.frame.rank(na_option=True, ascending=False)
  def test_rank_axis(self):
      """Axis names ('index', 'columns') must rank like axis numbers 0, 1."""
      frame = DataFrame([[2, 1], [4, 3]])
      for number, name in [(0, 'index'), (1, 'columns')]:
          by_number = frame.rank(axis=number)
          by_name = frame.rank(axis=name)
          tm.assert_frame_equal(by_number, by_name)
  def test_rank_methods_frame(self):
      """
      Compare every rank ``method`` with ``scipy.stats.rankdata``.

      A random 100 x 26 frame with many ties is ranked along both axes at
      three scales (as is, shifted by 1e6, scaled by 1e-6).  pandas' 'first'
      corresponds to SciPy's 'ordinal'; the other method names are shared.

      Skipped when SciPy cannot be imported.
      """
      # ``importorskip`` imports *modules*.  Neither 'scipy.stats.special'
      # nor 'scipy.stats.rankdata' is one, so the original requests always
      # failed and silently skipped the whole test.
      sp_stats = pytest.importorskip('scipy.stats')
      rankdata = sp_stats.rankdata

      # integers 0..20 mapped onto -1.0..1.0 in steps of 0.1
      xs = np.random.randint(0, 21, (100, 26))
      xs = (xs - 10.0) / 10.0
      cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

      for vals in [xs, xs + 1e6, xs * 1e-6]:
          df = DataFrame(vals, columns=cols)

          for ax in [0, 1]:
              for m in ['average', 'min', 'max', 'first', 'dense']:
                  result = df.rank(axis=ax, method=m)
                  sprank = np.apply_along_axis(
                      rankdata, ax, vals,
                      m if m != 'first' else 'ordinal')
                  # rankdata may return integer ranks; ``rank`` returns
                  # floats.  This cast makes a further version-gated
                  # ``astype('float64')`` on the frame unnecessary.
                  sprank = sprank.astype(np.float64)
                  expected = DataFrame(sprank, columns=cols)
                  tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
  def test_rank_descending(self, method, dtype):
      """
      Descending ranks must equal ascending ranks of ``max - value``.

      Parameters
      ----------
      method : str
          Rank method supplied by the ``method`` fixture.
      dtype : str
          dtype code the shared frame ``self.df`` is cast to before ranking.
      """
      if 'i' in dtype:
          # NaN cannot be stored as an integer, so drop it first -- and
          # then actually cast.  Without the cast the frame stays float64
          # and the integer case is never exercised.
          df = self.df.dropna().astype(dtype)
      else:
          df = self.df.astype(dtype)

      # default method
      res = df.rank(ascending=False)
      expected = (df.max() - df).rank()
      assert_frame_equal(res, expected)

      # the (object, 'first') combination is deliberately not checked
      if method == 'first' and dtype == 'O':
          return

      expected = (df.max() - df).rank(method=method)

      if dtype != 'O':
          res2 = df.rank(method=method, ascending=False,
                         numeric_only=True)
          assert_frame_equal(res2, expected)

      res3 = df.rank(method=method, ascending=False,
                     numeric_only=False)
      assert_frame_equal(res3, expected)
  @pytest.mark.parametrize('axis', [0, 1])
  @pytest.mark.parametrize('dtype', [None, object])
  def test_rank_2d_tie_methods(self, method, axis, dtype):
      """
      Rank the shared frame ``self.df`` and compare with ``self.results``.

      Parameters
      ----------
      method : str
          Rank method supplied by the ``method`` fixture.
      axis : int
          Axis to rank along; for 1, data and expectation are transposed.
      dtype : None or type
          When not None, ``self.df`` is cast to this dtype first.
      """
      # the (object, 'first') combination is deliberately not checked
      if (dtype, method) == (object, 'first'):
          return

      frame = self.df if dtype is None else self.df.astype(dtype)

      # both columns hold the same data, so both expect the same ranks
      ranks = self.results[method]
      wanted = DataFrame({'A': ranks, 'B': ranks})

      if axis == 1:
          frame, wanted = frame.T, wanted.T

      got = frame.rank(method=method, axis=axis)
      assert_frame_equal(got, wanted)
  @pytest.mark.parametrize(
      "method,exp",
      [
          ("dense", [[1., 1., 1.],
                     [1., 0.5, 2. / 3],
                     [1., 0.5, 1. / 3]]),
          ("min", [[1. / 3, 1., 1.],
                   [1. / 3, 1. / 3, 2. / 3],
                   [1. / 3, 1. / 3, 1. / 3]]),
          ("max", [[1., 1., 1.],
                   [1., 2. / 3, 2. / 3],
                   [1., 2. / 3, 1. / 3]]),
          ("average", [[2. / 3, 1., 1.],
                       [2. / 3, 0.5, 2. / 3],
                       [2. / 3, 0.5, 1. / 3]]),
          ("first", [[1. / 3, 1., 1.],
                     [2. / 3, 1. / 3, 2. / 3],
                     [3. / 3, 2. / 3, 1. / 3]]),
      ])
  def test_rank_pct_true(self, method, exp):
      """
      Check ``rank(pct=True)`` for each method on a frame with ties.

      Parameters
      ----------
      method : str
          Rank method; parametrized directly here.
      exp : list of list of float
          Expected percentage ranks, row by row.
      """
      # see gh-15630.
      frame = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])

      pct_ranks = frame.rank(method=method, pct=True)
      wanted = DataFrame(exp)

      tm.assert_frame_equal(pct_ranks, wanted)
  @pytest.mark.single
  def test_pct_max_many_rows(self):
      """The largest percentage rank must be exactly 1 on a very long frame."""
      # GH 18271
      # NOTE(review): 2**24 + 1 rows is presumably chosen to exceed the
      # range of exactly representable float32 integers -- confirm against
      # the linked issue.
      n_rows = 2**24 + 1
      frame = DataFrame({'A': np.arange(n_rows),
                         'B': np.arange(n_rows, 0, -1)})

      column_max = frame.rank(pct=True).max()
      assert (column_max == 1).all()