test_sorting.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
  1. from collections import defaultdict
  2. from datetime import datetime
  3. from itertools import product
  4. import warnings
  5. import numpy as np
  6. from numpy import nan
  7. import pytest
  8. from pandas import (
  9. DataFrame, MultiIndex, Series, compat, concat, merge, to_datetime)
  10. from pandas.core import common as com
  11. from pandas.core.sorting import (
  12. decons_group_index, get_group_index, is_int64_overflow_possible,
  13. lexsort_indexer, nargsort, safe_sort)
  14. from pandas.util import testing as tm
  15. from pandas.util.testing import assert_frame_equal, assert_series_equal
  16. class TestSorting(object):
  17. @pytest.mark.slow
  18. def test_int64_overflow(self):
  19. B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500)))
  20. A = np.arange(2500)
  21. df = DataFrame({'A': A,
  22. 'B': B,
  23. 'C': A,
  24. 'D': B,
  25. 'E': A,
  26. 'F': B,
  27. 'G': A,
  28. 'H': B,
  29. 'values': np.random.randn(2500)})
  30. lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])
  31. rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A'])
  32. left = lg.sum()['values']
  33. right = rg.sum()['values']
  34. exp_index, _ = left.index.sortlevel()
  35. tm.assert_index_equal(left.index, exp_index)
  36. exp_index, _ = right.index.sortlevel(0)
  37. tm.assert_index_equal(right.index, exp_index)
  38. tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'
  39. ]].values))
  40. tups = com.asarray_tuplesafe(tups)
  41. expected = df.groupby(tups).sum()['values']
  42. for k, v in compat.iteritems(expected):
  43. assert left[k] == right[k[::-1]]
  44. assert left[k] == v
  45. assert len(left) == len(right)
  46. def test_int64_overflow_moar(self):
  47. # GH9096
  48. values = range(55109)
  49. data = DataFrame.from_dict(
  50. {'a': values, 'b': values, 'c': values, 'd': values})
  51. grouped = data.groupby(['a', 'b', 'c', 'd'])
  52. assert len(grouped) == len(values)
  53. arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5))
  54. i = np.random.choice(len(arr), len(arr) * 4)
  55. arr = np.vstack((arr, arr[i])) # add sume duplicate rows
  56. i = np.random.permutation(len(arr))
  57. arr = arr[i] # shuffle rows
  58. df = DataFrame(arr, columns=list('abcde'))
  59. df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10
  60. gr = df.groupby(list('abcde'))
  61. # verify this is testing what it is supposed to test!
  62. assert is_int64_overflow_possible(gr.grouper.shape)
  63. # manually compute groupings
  64. jim, joe = defaultdict(list), defaultdict(list)
  65. for key, a, b in zip(map(tuple, arr), df['jim'], df['joe']):
  66. jim[key].append(a)
  67. joe[key].append(b)
  68. assert len(gr) == len(jim)
  69. mi = MultiIndex.from_tuples(jim.keys(), names=list('abcde'))
  70. def aggr(func):
  71. f = lambda a: np.fromiter(map(func, a), dtype='f8')
  72. arr = np.vstack((f(jim.values()), f(joe.values()))).T
  73. res = DataFrame(arr, columns=['jim', 'joe'], index=mi)
  74. return res.sort_index()
  75. assert_frame_equal(gr.mean(), aggr(np.mean))
  76. assert_frame_equal(gr.median(), aggr(np.median))
  77. def test_lexsort_indexer(self):
  78. keys = [[nan] * 5 + list(range(100)) + [nan] * 5]
  79. # orders=True, na_position='last'
  80. result = lexsort_indexer(keys, orders=True, na_position='last')
  81. exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
  82. tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
  83. # orders=True, na_position='first'
  84. result = lexsort_indexer(keys, orders=True, na_position='first')
  85. exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
  86. tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
  87. # orders=False, na_position='last'
  88. result = lexsort_indexer(keys, orders=False, na_position='last')
  89. exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
  90. tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
  91. # orders=False, na_position='first'
  92. result = lexsort_indexer(keys, orders=False, na_position='first')
  93. exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
  94. tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
  95. def test_nargsort(self):
  96. # np.argsort(items) places NaNs last
  97. items = [nan] * 5 + list(range(100)) + [nan] * 5
  98. # np.argsort(items2) may not place NaNs first
  99. items2 = np.array(items, dtype='O')
  100. # mergesort is the most difficult to get right because we want it to be
  101. # stable.
  102. # According to numpy/core/tests/test_multiarray, """The number of
  103. # sorted items must be greater than ~50 to check the actual algorithm
  104. # because quick and merge sort fall over to insertion sort for small
  105. # arrays."""
  106. # mergesort, ascending=True, na_position='last'
  107. result = nargsort(items, kind='mergesort', ascending=True,
  108. na_position='last')
  109. exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
  110. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  111. # mergesort, ascending=True, na_position='first'
  112. result = nargsort(items, kind='mergesort', ascending=True,
  113. na_position='first')
  114. exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
  115. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  116. # mergesort, ascending=False, na_position='last'
  117. result = nargsort(items, kind='mergesort', ascending=False,
  118. na_position='last')
  119. exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
  120. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  121. # mergesort, ascending=False, na_position='first'
  122. result = nargsort(items, kind='mergesort', ascending=False,
  123. na_position='first')
  124. exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
  125. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  126. # mergesort, ascending=True, na_position='last'
  127. result = nargsort(items2, kind='mergesort', ascending=True,
  128. na_position='last')
  129. exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
  130. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  131. # mergesort, ascending=True, na_position='first'
  132. result = nargsort(items2, kind='mergesort', ascending=True,
  133. na_position='first')
  134. exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
  135. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  136. # mergesort, ascending=False, na_position='last'
  137. result = nargsort(items2, kind='mergesort', ascending=False,
  138. na_position='last')
  139. exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
  140. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  141. # mergesort, ascending=False, na_position='first'
  142. result = nargsort(items2, kind='mergesort', ascending=False,
  143. na_position='first')
  144. exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
  145. tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
  146. def test_nargsort_datetimearray_warning(self):
  147. # https://github.com/pandas-dev/pandas/issues/25439
  148. # can be removed once the FutureWarning for np.array(DTA) is removed
  149. data = to_datetime([0, 2, 0, 1]).tz_localize('Europe/Brussels')
  150. with tm.assert_produces_warning(None):
  151. nargsort(data)
  152. class TestMerge(object):
  153. @pytest.mark.slow
  154. def test_int64_overflow_issues(self):
  155. # #2690, combinatorial explosion
  156. df1 = DataFrame(np.random.randn(1000, 7),
  157. columns=list('ABCDEF') + ['G1'])
  158. df2 = DataFrame(np.random.randn(1000, 7),
  159. columns=list('ABCDEF') + ['G2'])
  160. # it works!
  161. result = merge(df1, df2, how='outer')
  162. assert len(result) == 2000
  163. low, high, n = -1 << 10, 1 << 10, 1 << 20
  164. left = DataFrame(np.random.randint(low, high, (n, 7)),
  165. columns=list('ABCDEFG'))
  166. left['left'] = left.sum(axis=1)
  167. # one-2-one match
  168. i = np.random.permutation(len(left))
  169. right = left.iloc[i].copy()
  170. right.columns = right.columns[:-1].tolist() + ['right']
  171. right.index = np.arange(len(right))
  172. right['right'] *= -1
  173. out = merge(left, right, how='outer')
  174. assert len(out) == len(left)
  175. assert_series_equal(out['left'], - out['right'], check_names=False)
  176. result = out.iloc[:, :-2].sum(axis=1)
  177. assert_series_equal(out['left'], result, check_names=False)
  178. assert result.name is None
  179. out.sort_values(out.columns.tolist(), inplace=True)
  180. out.index = np.arange(len(out))
  181. for how in ['left', 'right', 'outer', 'inner']:
  182. assert_frame_equal(out, merge(left, right, how=how, sort=True))
  183. # check that left merge w/ sort=False maintains left frame order
  184. out = merge(left, right, how='left', sort=False)
  185. assert_frame_equal(left, out[left.columns.tolist()])
  186. out = merge(right, left, how='left', sort=False)
  187. assert_frame_equal(right, out[right.columns.tolist()])
  188. # one-2-many/none match
  189. n = 1 << 11
  190. left = DataFrame(np.random.randint(low, high, (n, 7)).astype('int64'),
  191. columns=list('ABCDEFG'))
  192. # confirm that this is checking what it is supposed to check
  193. shape = left.apply(Series.nunique).values
  194. assert is_int64_overflow_possible(shape)
  195. # add duplicates to left frame
  196. left = concat([left, left], ignore_index=True)
  197. right = DataFrame(np.random.randint(low, high, (n // 2, 7))
  198. .astype('int64'),
  199. columns=list('ABCDEFG'))
  200. # add duplicates & overlap with left to the right frame
  201. i = np.random.choice(len(left), n)
  202. right = concat([right, right, left.iloc[i]], ignore_index=True)
  203. left['left'] = np.random.randn(len(left))
  204. right['right'] = np.random.randn(len(right))
  205. # shuffle left & right frames
  206. i = np.random.permutation(len(left))
  207. left = left.iloc[i].copy()
  208. left.index = np.arange(len(left))
  209. i = np.random.permutation(len(right))
  210. right = right.iloc[i].copy()
  211. right.index = np.arange(len(right))
  212. # manually compute outer merge
  213. ldict, rdict = defaultdict(list), defaultdict(list)
  214. for idx, row in left.set_index(list('ABCDEFG')).iterrows():
  215. ldict[idx].append(row['left'])
  216. for idx, row in right.set_index(list('ABCDEFG')).iterrows():
  217. rdict[idx].append(row['right'])
  218. vals = []
  219. for k, lval in ldict.items():
  220. rval = rdict.get(k, [np.nan])
  221. for lv, rv in product(lval, rval):
  222. vals.append(k + tuple([lv, rv]))
  223. for k, rval in rdict.items():
  224. if k not in ldict:
  225. for rv in rval:
  226. vals.append(k + tuple([np.nan, rv]))
  227. def align(df):
  228. df = df.sort_values(df.columns.tolist())
  229. df.index = np.arange(len(df))
  230. return df
  231. def verify_order(df):
  232. kcols = list('ABCDEFG')
  233. assert_frame_equal(df[kcols].copy(),
  234. df[kcols].sort_values(kcols, kind='mergesort'))
  235. out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right'])
  236. out = align(out)
  237. jmask = {'left': out['left'].notna(),
  238. 'right': out['right'].notna(),
  239. 'inner': out['left'].notna() & out['right'].notna(),
  240. 'outer': np.ones(len(out), dtype='bool')}
  241. for how in 'left', 'right', 'outer', 'inner':
  242. mask = jmask[how]
  243. frame = align(out[mask].copy())
  244. assert mask.all() ^ mask.any() or how == 'outer'
  245. for sort in [False, True]:
  246. res = merge(left, right, how=how, sort=sort)
  247. if sort:
  248. verify_order(res)
  249. # as in GH9092 dtypes break with outer/right join
  250. assert_frame_equal(frame, align(res),
  251. check_dtype=how not in ('right', 'outer'))
  252. def test_decons():
  253. def testit(label_list, shape):
  254. group_index = get_group_index(label_list, shape, sort=True, xnull=True)
  255. label_list2 = decons_group_index(group_index, shape)
  256. for a, b in zip(label_list, label_list2):
  257. tm.assert_numpy_array_equal(a, b)
  258. shape = (4, 5, 6)
  259. label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64),
  260. np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64),
  261. np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64)]
  262. testit(label_list, shape)
  263. shape = (10000, 10000)
  264. label_list = [np.tile(np.arange(10000, dtype=np.int64), 5),
  265. np.tile(np.arange(10000, dtype=np.int64), 5)]
  266. testit(label_list, shape)
  267. class TestSafeSort(object):
  268. def test_basic_sort(self):
  269. values = [3, 1, 2, 0, 4]
  270. result = safe_sort(values)
  271. expected = np.array([0, 1, 2, 3, 4])
  272. tm.assert_numpy_array_equal(result, expected)
  273. values = list("baaacb")
  274. result = safe_sort(values)
  275. expected = np.array(list("aaabbc"), dtype='object')
  276. tm.assert_numpy_array_equal(result, expected)
  277. values = []
  278. result = safe_sort(values)
  279. expected = np.array([])
  280. tm.assert_numpy_array_equal(result, expected)
  281. def test_labels(self):
  282. values = [3, 1, 2, 0, 4]
  283. expected = np.array([0, 1, 2, 3, 4])
  284. labels = [0, 1, 1, 2, 3, 0, -1, 4]
  285. result, result_labels = safe_sort(values, labels)
  286. expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp)
  287. tm.assert_numpy_array_equal(result, expected)
  288. tm.assert_numpy_array_equal(result_labels, expected_labels)
  289. # na_sentinel
  290. labels = [0, 1, 1, 2, 3, 0, 99, 4]
  291. result, result_labels = safe_sort(values, labels,
  292. na_sentinel=99)
  293. expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp)
  294. tm.assert_numpy_array_equal(result, expected)
  295. tm.assert_numpy_array_equal(result_labels, expected_labels)
  296. # out of bound indices
  297. labels = [0, 101, 102, 2, 3, 0, 99, 4]
  298. result, result_labels = safe_sort(values, labels)
  299. expected_labels = np.array([3, -1, -1, 2, 0, 3, -1, 4], dtype=np.intp)
  300. tm.assert_numpy_array_equal(result, expected)
  301. tm.assert_numpy_array_equal(result_labels, expected_labels)
  302. labels = []
  303. result, result_labels = safe_sort(values, labels)
  304. expected_labels = np.array([], dtype=np.intp)
  305. tm.assert_numpy_array_equal(result, expected)
  306. tm.assert_numpy_array_equal(result_labels, expected_labels)
  307. def test_mixed_integer(self):
  308. values = np.array(['b', 1, 0, 'a', 0, 'b'], dtype=object)
  309. result = safe_sort(values)
  310. expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object)
  311. tm.assert_numpy_array_equal(result, expected)
  312. values = np.array(['b', 1, 0, 'a'], dtype=object)
  313. labels = [0, 1, 2, 3, 0, -1, 1]
  314. result, result_labels = safe_sort(values, labels)
  315. expected = np.array([0, 1, 'a', 'b'], dtype=object)
  316. expected_labels = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp)
  317. tm.assert_numpy_array_equal(result, expected)
  318. tm.assert_numpy_array_equal(result_labels, expected_labels)
  319. def test_mixed_integer_from_list(self):
  320. values = ['b', 1, 0, 'a', 0, 'b']
  321. result = safe_sort(values)
  322. expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object)
  323. tm.assert_numpy_array_equal(result, expected)
  324. def test_unsortable(self):
  325. # GH 13714
  326. arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object)
  327. if compat.PY2:
  328. # RuntimeWarning: tp_compare didn't return -1 or -2 for exception
  329. with warnings.catch_warnings():
  330. pytest.raises(TypeError, safe_sort, arr)
  331. else:
  332. pytest.raises(TypeError, safe_sort, arr)
  333. def test_exceptions(self):
  334. with pytest.raises(TypeError,
  335. match="Only list-like objects are allowed"):
  336. safe_sort(values=1)
  337. with pytest.raises(TypeError,
  338. match="Only list-like objects or None"):
  339. safe_sort(values=[0, 1, 2], labels=1)
  340. with pytest.raises(ValueError,
  341. match="values should be unique"):
  342. safe_sort(values=[0, 1, 2, 1], labels=[0, 1])