test_duplicates.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278
  1. # -*- coding: utf-8 -*-
  2. from itertools import product
  3. import numpy as np
  4. import pytest
  5. from pandas._libs import hashtable
  6. from pandas.compat import range, u
  7. from pandas import DatetimeIndex, MultiIndex
  8. import pandas.util.testing as tm
  9. @pytest.mark.parametrize('names', [None, ['first', 'second']])
  10. def test_unique(names):
  11. mi = MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]], names=names)
  12. res = mi.unique()
  13. exp = MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]], names=mi.names)
  14. tm.assert_index_equal(res, exp)
  15. mi = MultiIndex.from_arrays([list('aaaa'), list('abab')],
  16. names=names)
  17. res = mi.unique()
  18. exp = MultiIndex.from_arrays([list('aa'), list('ab')], names=mi.names)
  19. tm.assert_index_equal(res, exp)
  20. mi = MultiIndex.from_arrays([list('aaaa'), list('aaaa')], names=names)
  21. res = mi.unique()
  22. exp = MultiIndex.from_arrays([['a'], ['a']], names=mi.names)
  23. tm.assert_index_equal(res, exp)
  24. # GH #20568 - empty MI
  25. mi = MultiIndex.from_arrays([[], []], names=names)
  26. res = mi.unique()
  27. tm.assert_index_equal(mi, res)
  28. def test_unique_datetimelike():
  29. idx1 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01',
  30. '2015-01-01', 'NaT', 'NaT'])
  31. idx2 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-02',
  32. '2015-01-02', 'NaT', '2015-01-01'],
  33. tz='Asia/Tokyo')
  34. result = MultiIndex.from_arrays([idx1, idx2]).unique()
  35. eidx1 = DatetimeIndex(['2015-01-01', '2015-01-01', 'NaT', 'NaT'])
  36. eidx2 = DatetimeIndex(['2015-01-01', '2015-01-02',
  37. 'NaT', '2015-01-01'],
  38. tz='Asia/Tokyo')
  39. exp = MultiIndex.from_arrays([eidx1, eidx2])
  40. tm.assert_index_equal(result, exp)
  41. @pytest.mark.parametrize('level', [0, 'first', 1, 'second'])
  42. def test_unique_level(idx, level):
  43. # GH #17896 - with level= argument
  44. result = idx.unique(level=level)
  45. expected = idx.get_level_values(level).unique()
  46. tm.assert_index_equal(result, expected)
  47. # With already unique level
  48. mi = MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]],
  49. names=['first', 'second'])
  50. result = mi.unique(level=level)
  51. expected = mi.get_level_values(level)
  52. tm.assert_index_equal(result, expected)
  53. # With empty MI
  54. mi = MultiIndex.from_arrays([[], []], names=['first', 'second'])
  55. result = mi.unique(level=level)
  56. expected = mi.get_level_values(level)
  57. @pytest.mark.parametrize('dropna', [True, False])
  58. def test_get_unique_index(idx, dropna):
  59. mi = idx[[0, 1, 0, 1, 1, 0, 0]]
  60. expected = mi._shallow_copy(mi[[0, 1]])
  61. result = mi._get_unique_index(dropna=dropna)
  62. assert result.unique
  63. tm.assert_index_equal(result, expected)
  64. def test_duplicate_multiindex_codes():
  65. # GH 17464
  66. # Make sure that a MultiIndex with duplicate levels throws a ValueError
  67. with pytest.raises(ValueError):
  68. mi = MultiIndex([['A'] * 10, range(10)], [[0] * 10, range(10)])
  69. # And that using set_levels with duplicate levels fails
  70. mi = MultiIndex.from_arrays([['A', 'A', 'B', 'B', 'B'],
  71. [1, 2, 1, 2, 3]])
  72. with pytest.raises(ValueError):
  73. mi.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]],
  74. inplace=True)
  75. @pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2],
  76. [1, 'a', 1]])
  77. def test_duplicate_level_names(names):
  78. # GH18872, GH19029
  79. mi = MultiIndex.from_product([[0, 1]] * 3, names=names)
  80. assert mi.names == names
  81. # With .rename()
  82. mi = MultiIndex.from_product([[0, 1]] * 3)
  83. mi = mi.rename(names)
  84. assert mi.names == names
  85. # With .rename(., level=)
  86. mi.rename(names[1], level=1, inplace=True)
  87. mi = mi.rename([names[0], names[2]], level=[0, 2])
  88. assert mi.names == names
  89. def test_duplicate_meta_data():
  90. # GH 10115
  91. mi = MultiIndex(
  92. levels=[[0, 1], [0, 1, 2]],
  93. codes=[[0, 0, 0, 0, 1, 1, 1],
  94. [0, 1, 2, 0, 0, 1, 2]])
  95. for idx in [mi,
  96. mi.set_names([None, None]),
  97. mi.set_names([None, 'Num']),
  98. mi.set_names(['Upper', 'Num']), ]:
  99. assert idx.has_duplicates
  100. assert idx.drop_duplicates().names == idx.names
  101. def test_has_duplicates(idx, idx_dup):
  102. # see fixtures
  103. assert idx.is_unique is True
  104. assert idx.has_duplicates is False
  105. assert idx_dup.is_unique is False
  106. assert idx_dup.has_duplicates is True
  107. mi = MultiIndex(levels=[[0, 1], [0, 1, 2]],
  108. codes=[[0, 0, 0, 0, 1, 1, 1],
  109. [0, 1, 2, 0, 0, 1, 2]])
  110. assert mi.is_unique is False
  111. assert mi.has_duplicates is True
  112. # single instance of NaN
  113. mi_nan = MultiIndex(levels=[['a', 'b'], [0, 1]],
  114. codes=[[-1, 0, 0, 1, 1], [-1, 0, 1, 0, 1]])
  115. assert mi_nan.is_unique is True
  116. assert mi_nan.has_duplicates is False
  117. # multiple instances of NaN
  118. mi_nan_dup = MultiIndex(levels=[['a', 'b'], [0, 1]],
  119. codes=[[-1, -1, 0, 0, 1, 1], [-1, -1, 0, 1, 0, 1]])
  120. assert mi_nan_dup.is_unique is False
  121. assert mi_nan_dup.has_duplicates is True
  122. def test_has_duplicates_from_tuples():
  123. # GH 9075
  124. t = [(u('x'), u('out'), u('z'), 5, u('y'), u('in'), u('z'), 169),
  125. (u('x'), u('out'), u('z'), 7, u('y'), u('in'), u('z'), 119),
  126. (u('x'), u('out'), u('z'), 9, u('y'), u('in'), u('z'), 135),
  127. (u('x'), u('out'), u('z'), 13, u('y'), u('in'), u('z'), 145),
  128. (u('x'), u('out'), u('z'), 14, u('y'), u('in'), u('z'), 158),
  129. (u('x'), u('out'), u('z'), 16, u('y'), u('in'), u('z'), 122),
  130. (u('x'), u('out'), u('z'), 17, u('y'), u('in'), u('z'), 160),
  131. (u('x'), u('out'), u('z'), 18, u('y'), u('in'), u('z'), 180),
  132. (u('x'), u('out'), u('z'), 20, u('y'), u('in'), u('z'), 143),
  133. (u('x'), u('out'), u('z'), 21, u('y'), u('in'), u('z'), 128),
  134. (u('x'), u('out'), u('z'), 22, u('y'), u('in'), u('z'), 129),
  135. (u('x'), u('out'), u('z'), 25, u('y'), u('in'), u('z'), 111),
  136. (u('x'), u('out'), u('z'), 28, u('y'), u('in'), u('z'), 114),
  137. (u('x'), u('out'), u('z'), 29, u('y'), u('in'), u('z'), 121),
  138. (u('x'), u('out'), u('z'), 31, u('y'), u('in'), u('z'), 126),
  139. (u('x'), u('out'), u('z'), 32, u('y'), u('in'), u('z'), 155),
  140. (u('x'), u('out'), u('z'), 33, u('y'), u('in'), u('z'), 123),
  141. (u('x'), u('out'), u('z'), 12, u('y'), u('in'), u('z'), 144)]
  142. mi = MultiIndex.from_tuples(t)
  143. assert not mi.has_duplicates
def test_has_duplicates_overflow():
    # handle int64 overflow if possible
    # With 8 levels of 500 values each the combined hash space exceeds
    # int64, exercising the fallback duplicate-detection path; with 4
    # levels it does not — both must give the same answers.
    def check(nlevels, with_nulls):
        # Each level's codes are 0..499 repeated twice; the extra final
        # level (two blocks of 500) disambiguates the two halves, so the
        # constructed index initially has no duplicates.
        codes = np.tile(np.arange(500), 2)
        level = np.arange(500)

        if with_nulls:  # inject some null values
            codes[500] = -1  # common nan value
            codes = [codes.copy() for i in range(nlevels)]
            # Give each level one extra NaN at a distinct position so
            # no two rows become identical.
            for i in range(nlevels):
                codes[i][500 + i - nlevels // 2] = -1
            codes += [np.array([-1, 1]).repeat(500)]
        else:
            codes = [codes] * nlevels + [np.arange(2).repeat(500)]

        levels = [level] * nlevels + [[0, 1]]

        # no dups
        mi = MultiIndex(levels=levels, codes=codes)
        assert not mi.has_duplicates

        # with a dup
        if with_nulls:
            # Duplicate row 0 (which contains NaNs) by inserting its
            # code into every level at position 1000.
            def f(a):
                return np.insert(a, 1000, a[0])
            codes = list(map(f, codes))
            mi = MultiIndex(levels=levels, codes=codes)
        else:
            # Rebuild from tuples with the first row appended again.
            values = mi.values.tolist()
            mi = MultiIndex.from_tuples(values + [values[0]])

        assert mi.has_duplicates

    # no overflow
    check(4, False)
    check(4, True)

    # overflow possible
    check(8, False)
    check(8, True)
  177. @pytest.mark.parametrize('keep, expected', [
  178. ('first', np.array([False, False, False, True, True, False])),
  179. ('last', np.array([False, True, True, False, False, False])),
  180. (False, np.array([False, True, True, True, True, False]))
  181. ])
  182. def test_duplicated(idx_dup, keep, expected):
  183. result = idx_dup.duplicated(keep=keep)
  184. tm.assert_numpy_array_equal(result, expected)
  185. @pytest.mark.parametrize('keep', ['first', 'last', False])
  186. def test_duplicated_large(keep):
  187. # GH 9125
  188. n, k = 200, 5000
  189. levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
  190. codes = [np.random.choice(n, k * n) for lev in levels]
  191. mi = MultiIndex(levels=levels, codes=codes)
  192. result = mi.duplicated(keep=keep)
  193. expected = hashtable.duplicated_object(mi.values, keep=keep)
  194. tm.assert_numpy_array_equal(result, expected)
  195. def test_get_duplicates():
  196. # GH5873
  197. for a in [101, 102]:
  198. mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
  199. assert not mi.has_duplicates
  200. with tm.assert_produces_warning(FutureWarning):
  201. # Deprecated - see GH20239
  202. assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []]))
  203. tm.assert_numpy_array_equal(mi.duplicated(),
  204. np.zeros(2, dtype='bool'))
  205. for n in range(1, 6): # 1st level shape
  206. for m in range(1, 5): # 2nd level shape
  207. # all possible unique combinations, including nan
  208. codes = product(range(-1, n), range(-1, m))
  209. mi = MultiIndex(levels=[list('abcde')[:n], list('WXYZ')[:m]],
  210. codes=np.random.permutation(list(codes)).T)
  211. assert len(mi) == (n + 1) * (m + 1)
  212. assert not mi.has_duplicates
  213. with tm.assert_produces_warning(FutureWarning):
  214. # Deprecated - see GH20239
  215. assert mi.get_duplicates().equals(MultiIndex.from_arrays(
  216. [[], []]))
  217. tm.assert_numpy_array_equal(mi.duplicated(),
  218. np.zeros(len(mi), dtype='bool'))