test_indexing.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
  1. # -*- coding: utf-8 -*-
  2. from datetime import timedelta
  3. import numpy as np
  4. import pytest
  5. from pandas.compat import lrange
  6. import pandas as pd
  7. from pandas import (
  8. Categorical, CategoricalIndex, Index, IntervalIndex, MultiIndex,
  9. date_range)
  10. from pandas.core.indexes.base import InvalidIndexError
  11. import pandas.util.testing as tm
  12. from pandas.util.testing import assert_almost_equal
  13. def test_slice_locs_partial(idx):
  14. sorted_idx, _ = idx.sortlevel(0)
  15. result = sorted_idx.slice_locs(('foo', 'two'), ('qux', 'one'))
  16. assert result == (1, 5)
  17. result = sorted_idx.slice_locs(None, ('qux', 'one'))
  18. assert result == (0, 5)
  19. result = sorted_idx.slice_locs(('foo', 'two'), None)
  20. assert result == (1, len(sorted_idx))
  21. result = sorted_idx.slice_locs('bar', 'baz')
  22. assert result == (2, 4)
  23. def test_slice_locs():
  24. df = tm.makeTimeDataFrame()
  25. stacked = df.stack()
  26. idx = stacked.index
  27. slob = slice(*idx.slice_locs(df.index[5], df.index[15]))
  28. sliced = stacked[slob]
  29. expected = df[5:16].stack()
  30. tm.assert_almost_equal(sliced.values, expected.values)
  31. slob = slice(*idx.slice_locs(df.index[5] + timedelta(seconds=30),
  32. df.index[15] - timedelta(seconds=30)))
  33. sliced = stacked[slob]
  34. expected = df[6:15].stack()
  35. tm.assert_almost_equal(sliced.values, expected.values)
  36. def test_slice_locs_with_type_mismatch():
  37. df = tm.makeTimeDataFrame()
  38. stacked = df.stack()
  39. idx = stacked.index
  40. with pytest.raises(TypeError, match='^Level type mismatch'):
  41. idx.slice_locs((1, 3))
  42. with pytest.raises(TypeError, match='^Level type mismatch'):
  43. idx.slice_locs(df.index[5] + timedelta(seconds=30), (5, 2))
  44. df = tm.makeCustomDataframe(5, 5)
  45. stacked = df.stack()
  46. idx = stacked.index
  47. with pytest.raises(TypeError, match='^Level type mismatch'):
  48. idx.slice_locs(timedelta(seconds=30))
  49. # TODO: Try creating a UnicodeDecodeError in exception message
  50. with pytest.raises(TypeError, match='^Level type mismatch'):
  51. idx.slice_locs(df.index[1], (16, "a"))
  52. def test_slice_locs_not_sorted():
  53. index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), Index(
  54. lrange(4))], codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array(
  55. [0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0])])
  56. msg = "[Kk]ey length.*greater than MultiIndex lexsort depth"
  57. with pytest.raises(KeyError, match=msg):
  58. index.slice_locs((1, 0, 1), (2, 1, 0))
  59. # works
  60. sorted_index, _ = index.sortlevel(0)
  61. # should there be a test case here???
  62. sorted_index.slice_locs((1, 0, 1), (2, 1, 0))
  63. def test_slice_locs_not_contained():
  64. # some searchsorted action
  65. index = MultiIndex(levels=[[0, 2, 4, 6], [0, 2, 4]],
  66. codes=[[0, 0, 0, 1, 1, 2, 3, 3, 3],
  67. [0, 1, 2, 1, 2, 2, 0, 1, 2]], sortorder=0)
  68. result = index.slice_locs((1, 0), (5, 2))
  69. assert result == (3, 6)
  70. result = index.slice_locs(1, 5)
  71. assert result == (3, 6)
  72. result = index.slice_locs((2, 2), (5, 2))
  73. assert result == (3, 6)
  74. result = index.slice_locs(2, 5)
  75. assert result == (3, 6)
  76. result = index.slice_locs((1, 0), (6, 3))
  77. assert result == (3, 8)
  78. result = index.slice_locs(-1, 10)
  79. assert result == (0, len(index))
  80. def test_putmask_with_wrong_mask(idx):
  81. # GH18368
  82. with pytest.raises(ValueError):
  83. idx.putmask(np.ones(len(idx) + 1, np.bool), 1)
  84. with pytest.raises(ValueError):
  85. idx.putmask(np.ones(len(idx) - 1, np.bool), 1)
  86. with pytest.raises(ValueError):
  87. idx.putmask('foo', 1)
  88. def test_get_indexer():
  89. major_axis = Index(lrange(4))
  90. minor_axis = Index(lrange(2))
  91. major_codes = np.array([0, 0, 1, 2, 2, 3, 3], dtype=np.intp)
  92. minor_codes = np.array([0, 1, 0, 0, 1, 0, 1], dtype=np.intp)
  93. index = MultiIndex(levels=[major_axis, minor_axis],
  94. codes=[major_codes, minor_codes])
  95. idx1 = index[:5]
  96. idx2 = index[[1, 3, 5]]
  97. r1 = idx1.get_indexer(idx2)
  98. assert_almost_equal(r1, np.array([1, 3, -1], dtype=np.intp))
  99. r1 = idx2.get_indexer(idx1, method='pad')
  100. e1 = np.array([-1, 0, 0, 1, 1], dtype=np.intp)
  101. assert_almost_equal(r1, e1)
  102. r2 = idx2.get_indexer(idx1[::-1], method='pad')
  103. assert_almost_equal(r2, e1[::-1])
  104. rffill1 = idx2.get_indexer(idx1, method='ffill')
  105. assert_almost_equal(r1, rffill1)
  106. r1 = idx2.get_indexer(idx1, method='backfill')
  107. e1 = np.array([0, 0, 1, 1, 2], dtype=np.intp)
  108. assert_almost_equal(r1, e1)
  109. r2 = idx2.get_indexer(idx1[::-1], method='backfill')
  110. assert_almost_equal(r2, e1[::-1])
  111. rbfill1 = idx2.get_indexer(idx1, method='bfill')
  112. assert_almost_equal(r1, rbfill1)
  113. # pass non-MultiIndex
  114. r1 = idx1.get_indexer(idx2.values)
  115. rexp1 = idx1.get_indexer(idx2)
  116. assert_almost_equal(r1, rexp1)
  117. r1 = idx1.get_indexer([1, 2, 3])
  118. assert (r1 == [-1, -1, -1]).all()
  119. # create index with duplicates
  120. idx1 = Index(lrange(10) + lrange(10))
  121. idx2 = Index(lrange(20))
  122. msg = "Reindexing only valid with uniquely valued Index objects"
  123. with pytest.raises(InvalidIndexError, match=msg):
  124. idx1.get_indexer(idx2)
  125. def test_get_indexer_nearest():
  126. midx = MultiIndex.from_tuples([('a', 1), ('b', 2)])
  127. with pytest.raises(NotImplementedError):
  128. midx.get_indexer(['a'], method='nearest')
  129. with pytest.raises(NotImplementedError):
  130. midx.get_indexer(['a'], method='pad', tolerance=2)
  131. def test_getitem(idx):
  132. # scalar
  133. assert idx[2] == ('bar', 'one')
  134. # slice
  135. result = idx[2:5]
  136. expected = idx[[2, 3, 4]]
  137. assert result.equals(expected)
  138. # boolean
  139. result = idx[[True, False, True, False, True, True]]
  140. result2 = idx[np.array([True, False, True, False, True, True])]
  141. expected = idx[[0, 2, 4, 5]]
  142. assert result.equals(expected)
  143. assert result2.equals(expected)
  144. def test_getitem_group_select(idx):
  145. sorted_idx, _ = idx.sortlevel(0)
  146. assert sorted_idx.get_loc('baz') == slice(3, 4)
  147. assert sorted_idx.get_loc('foo') == slice(0, 2)
  148. def test_get_indexer_consistency(idx):
  149. # See GH 16819
  150. if isinstance(idx, IntervalIndex):
  151. pass
  152. if idx.is_unique or isinstance(idx, CategoricalIndex):
  153. indexer = idx.get_indexer(idx[0:2])
  154. assert isinstance(indexer, np.ndarray)
  155. assert indexer.dtype == np.intp
  156. else:
  157. e = "Reindexing only valid with uniquely valued Index objects"
  158. with pytest.raises(InvalidIndexError, match=e):
  159. idx.get_indexer(idx[0:2])
  160. indexer, _ = idx.get_indexer_non_unique(idx[0:2])
  161. assert isinstance(indexer, np.ndarray)
  162. assert indexer.dtype == np.intp
  163. @pytest.mark.parametrize('ind1', [[True] * 5, pd.Index([True] * 5)])
  164. @pytest.mark.parametrize('ind2', [[True, False, True, False, False],
  165. pd.Index([True, False, True, False,
  166. False])])
  167. def test_getitem_bool_index_all(ind1, ind2):
  168. # GH#22533
  169. idx = MultiIndex.from_tuples([(10, 1), (20, 2), (30, 3),
  170. (40, 4), (50, 5)])
  171. tm.assert_index_equal(idx[ind1], idx)
  172. expected = MultiIndex.from_tuples([(10, 1), (30, 3)])
  173. tm.assert_index_equal(idx[ind2], expected)
  174. @pytest.mark.parametrize('ind1', [[True], pd.Index([True])])
  175. @pytest.mark.parametrize('ind2', [[False], pd.Index([False])])
  176. def test_getitem_bool_index_single(ind1, ind2):
  177. # GH#22533
  178. idx = MultiIndex.from_tuples([(10, 1)])
  179. tm.assert_index_equal(idx[ind1], idx)
  180. expected = pd.MultiIndex(levels=[np.array([], dtype=np.int64),
  181. np.array([], dtype=np.int64)],
  182. codes=[[], []])
  183. tm.assert_index_equal(idx[ind2], expected)
  184. def test_get_loc(idx):
  185. assert idx.get_loc(('foo', 'two')) == 1
  186. assert idx.get_loc(('baz', 'two')) == 3
  187. pytest.raises(KeyError, idx.get_loc, ('bar', 'two'))
  188. pytest.raises(KeyError, idx.get_loc, 'quux')
  189. pytest.raises(NotImplementedError, idx.get_loc, 'foo',
  190. method='nearest')
  191. # 3 levels
  192. index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), Index(
  193. lrange(4))], codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array(
  194. [0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0])])
  195. pytest.raises(KeyError, index.get_loc, (1, 1))
  196. assert index.get_loc((2, 0)) == slice(3, 5)
  197. def test_get_loc_duplicates():
  198. index = Index([2, 2, 2, 2])
  199. result = index.get_loc(2)
  200. expected = slice(0, 4)
  201. assert result == expected
  202. # pytest.raises(Exception, index.get_loc, 2)
  203. index = Index(['c', 'a', 'a', 'b', 'b'])
  204. rs = index.get_loc('c')
  205. xp = 0
  206. assert rs == xp
  207. def test_get_loc_level():
  208. index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), Index(
  209. lrange(4))], codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array(
  210. [0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0])])
  211. loc, new_index = index.get_loc_level((0, 1))
  212. expected = slice(1, 2)
  213. exp_index = index[expected].droplevel(0).droplevel(0)
  214. assert loc == expected
  215. assert new_index.equals(exp_index)
  216. loc, new_index = index.get_loc_level((0, 1, 0))
  217. expected = 1
  218. assert loc == expected
  219. assert new_index is None
  220. pytest.raises(KeyError, index.get_loc_level, (2, 2))
  221. # GH 22221: unused label
  222. pytest.raises(KeyError, index.drop(2).get_loc_level, 2)
  223. # Unused label on unsorted level:
  224. pytest.raises(KeyError, index.drop(1, level=2).get_loc_level, 2, 2)
  225. index = MultiIndex(levels=[[2000], lrange(4)], codes=[np.array(
  226. [0, 0, 0, 0]), np.array([0, 1, 2, 3])])
  227. result, new_index = index.get_loc_level((2000, slice(None, None)))
  228. expected = slice(None, None)
  229. assert result == expected
  230. assert new_index.equals(index.droplevel(0))
  231. @pytest.mark.parametrize('dtype1', [int, float, bool, str])
  232. @pytest.mark.parametrize('dtype2', [int, float, bool, str])
  233. def test_get_loc_multiple_dtypes(dtype1, dtype2):
  234. # GH 18520
  235. levels = [np.array([0, 1]).astype(dtype1),
  236. np.array([0, 1]).astype(dtype2)]
  237. idx = pd.MultiIndex.from_product(levels)
  238. assert idx.get_loc(idx[2]) == 2
  239. @pytest.mark.parametrize('level', [0, 1])
  240. @pytest.mark.parametrize('dtypes', [[int, float], [float, int]])
  241. def test_get_loc_implicit_cast(level, dtypes):
  242. # GH 18818, GH 15994 : as flat index, cast int to float and vice-versa
  243. levels = [['a', 'b'], ['c', 'd']]
  244. key = ['b', 'd']
  245. lev_dtype, key_dtype = dtypes
  246. levels[level] = np.array([0, 1], dtype=lev_dtype)
  247. key[level] = key_dtype(1)
  248. idx = MultiIndex.from_product(levels)
  249. assert idx.get_loc(tuple(key)) == 3
  250. def test_get_loc_cast_bool():
  251. # GH 19086 : int is casted to bool, but not vice-versa
  252. levels = [[False, True], np.arange(2, dtype='int64')]
  253. idx = MultiIndex.from_product(levels)
  254. assert idx.get_loc((0, 1)) == 1
  255. assert idx.get_loc((1, 0)) == 2
  256. pytest.raises(KeyError, idx.get_loc, (False, True))
  257. pytest.raises(KeyError, idx.get_loc, (True, False))
  258. @pytest.mark.parametrize('level', [0, 1])
  259. def test_get_loc_nan(level, nulls_fixture):
  260. # GH 18485 : NaN in MultiIndex
  261. levels = [['a', 'b'], ['c', 'd']]
  262. key = ['b', 'd']
  263. levels[level] = np.array([0, nulls_fixture], dtype=type(nulls_fixture))
  264. key[level] = nulls_fixture
  265. idx = MultiIndex.from_product(levels)
  266. assert idx.get_loc(tuple(key)) == 3
  267. def test_get_loc_missing_nan():
  268. # GH 8569
  269. idx = MultiIndex.from_arrays([[1.0, 2.0], [3.0, 4.0]])
  270. assert isinstance(idx.get_loc(1), slice)
  271. pytest.raises(KeyError, idx.get_loc, 3)
  272. pytest.raises(KeyError, idx.get_loc, np.nan)
  273. pytest.raises(KeyError, idx.get_loc, [np.nan])
  274. def test_get_indexer_categorical_time():
  275. # https://github.com/pandas-dev/pandas/issues/21390
  276. midx = MultiIndex.from_product(
  277. [Categorical(['a', 'b', 'c']),
  278. Categorical(date_range("2012-01-01", periods=3, freq='H'))])
  279. result = midx.get_indexer(midx)
  280. tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp))