test_integrity.py 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293
  1. # -*- coding: utf-8 -*-
  2. import re
  3. import numpy as np
  4. import pytest
  5. from pandas.compat import lrange, range
  6. from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
  7. import pandas as pd
  8. from pandas import IntervalIndex, MultiIndex, RangeIndex
  9. import pandas.util.testing as tm
  10. def test_labels_dtypes():
  11. # GH 8456
  12. i = MultiIndex.from_tuples([('A', 1), ('A', 2)])
  13. assert i.codes[0].dtype == 'int8'
  14. assert i.codes[1].dtype == 'int8'
  15. i = MultiIndex.from_product([['a'], range(40)])
  16. assert i.codes[1].dtype == 'int8'
  17. i = MultiIndex.from_product([['a'], range(400)])
  18. assert i.codes[1].dtype == 'int16'
  19. i = MultiIndex.from_product([['a'], range(40000)])
  20. assert i.codes[1].dtype == 'int32'
  21. i = pd.MultiIndex.from_product([['a'], range(1000)])
  22. assert (i.codes[0] >= 0).all()
  23. assert (i.codes[1] >= 0).all()
  24. def test_values_boxed():
  25. tuples = [(1, pd.Timestamp('2000-01-01')), (2, pd.NaT),
  26. (3, pd.Timestamp('2000-01-03')),
  27. (1, pd.Timestamp('2000-01-04')),
  28. (2, pd.Timestamp('2000-01-02')),
  29. (3, pd.Timestamp('2000-01-03'))]
  30. result = pd.MultiIndex.from_tuples(tuples)
  31. expected = construct_1d_object_array_from_listlike(tuples)
  32. tm.assert_numpy_array_equal(result.values, expected)
  33. # Check that code branches for boxed values produce identical results
  34. tm.assert_numpy_array_equal(result.values[:4], result[:4].values)
  35. def test_values_multiindex_datetimeindex():
  36. # Test to ensure we hit the boxing / nobox part of MI.values
  37. ints = np.arange(10 ** 18, 10 ** 18 + 5)
  38. naive = pd.DatetimeIndex(ints)
  39. # TODO(GH-24559): Remove the FutureWarning
  40. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  41. aware = pd.DatetimeIndex(ints, tz='US/Central')
  42. idx = pd.MultiIndex.from_arrays([naive, aware])
  43. result = idx.values
  44. outer = pd.DatetimeIndex([x[0] for x in result])
  45. tm.assert_index_equal(outer, naive)
  46. inner = pd.DatetimeIndex([x[1] for x in result])
  47. tm.assert_index_equal(inner, aware)
  48. # n_lev > n_lab
  49. result = idx[:2].values
  50. outer = pd.DatetimeIndex([x[0] for x in result])
  51. tm.assert_index_equal(outer, naive[:2])
  52. inner = pd.DatetimeIndex([x[1] for x in result])
  53. tm.assert_index_equal(inner, aware[:2])
  54. def test_values_multiindex_periodindex():
  55. # Test to ensure we hit the boxing / nobox part of MI.values
  56. ints = np.arange(2007, 2012)
  57. pidx = pd.PeriodIndex(ints, freq='D')
  58. idx = pd.MultiIndex.from_arrays([ints, pidx])
  59. result = idx.values
  60. outer = pd.Int64Index([x[0] for x in result])
  61. tm.assert_index_equal(outer, pd.Int64Index(ints))
  62. inner = pd.PeriodIndex([x[1] for x in result])
  63. tm.assert_index_equal(inner, pidx)
  64. # n_lev > n_lab
  65. result = idx[:2].values
  66. outer = pd.Int64Index([x[0] for x in result])
  67. tm.assert_index_equal(outer, pd.Int64Index(ints[:2]))
  68. inner = pd.PeriodIndex([x[1] for x in result])
  69. tm.assert_index_equal(inner, pidx[:2])
  70. def test_consistency():
  71. # need to construct an overflow
  72. major_axis = lrange(70000)
  73. minor_axis = lrange(10)
  74. major_codes = np.arange(70000)
  75. minor_codes = np.repeat(lrange(10), 7000)
  76. # the fact that is works means it's consistent
  77. index = MultiIndex(levels=[major_axis, minor_axis],
  78. codes=[major_codes, minor_codes])
  79. # inconsistent
  80. major_codes = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3])
  81. minor_codes = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1])
  82. index = MultiIndex(levels=[major_axis, minor_axis],
  83. codes=[major_codes, minor_codes])
  84. assert index.is_unique is False
  85. def test_hash_collisions():
  86. # non-smoke test that we don't get hash collisions
  87. index = MultiIndex.from_product([np.arange(1000), np.arange(1000)],
  88. names=['one', 'two'])
  89. result = index.get_indexer(index.values)
  90. tm.assert_numpy_array_equal(result, np.arange(
  91. len(index), dtype='intp'))
  92. for i in [0, 1, len(index) - 2, len(index) - 1]:
  93. result = index.get_loc(index[i])
  94. assert result == i
  95. def test_dims():
  96. pass
  97. def take_invalid_kwargs():
  98. vals = [['A', 'B'],
  99. [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')]]
  100. idx = pd.MultiIndex.from_product(vals, names=['str', 'dt'])
  101. indices = [1, 2]
  102. msg = r"take\(\) got an unexpected keyword argument 'foo'"
  103. with pytest.raises(TypeError, match=msg):
  104. idx.take(indices, foo=2)
  105. msg = "the 'out' parameter is not supported"
  106. with pytest.raises(ValueError, match=msg):
  107. idx.take(indices, out=indices)
  108. msg = "the 'mode' parameter is not supported"
  109. with pytest.raises(ValueError, match=msg):
  110. idx.take(indices, mode='clip')
  111. def test_isna_behavior(idx):
  112. # should not segfault GH5123
  113. # NOTE: if MI representation changes, may make sense to allow
  114. # isna(MI)
  115. with pytest.raises(NotImplementedError):
  116. pd.isna(idx)
  117. def test_large_multiindex_error():
  118. # GH12527
  119. df_below_1000000 = pd.DataFrame(
  120. 1, index=pd.MultiIndex.from_product([[1, 2], range(499999)]),
  121. columns=['dest'])
  122. with pytest.raises(KeyError):
  123. df_below_1000000.loc[(-1, 0), 'dest']
  124. with pytest.raises(KeyError):
  125. df_below_1000000.loc[(3, 0), 'dest']
  126. df_above_1000000 = pd.DataFrame(
  127. 1, index=pd.MultiIndex.from_product([[1, 2], range(500001)]),
  128. columns=['dest'])
  129. with pytest.raises(KeyError):
  130. df_above_1000000.loc[(-1, 0), 'dest']
  131. with pytest.raises(KeyError):
  132. df_above_1000000.loc[(3, 0), 'dest']
  133. def test_million_record_attribute_error():
  134. # GH 18165
  135. r = list(range(1000000))
  136. df = pd.DataFrame({'a': r, 'b': r},
  137. index=pd.MultiIndex.from_tuples([(x, x) for x in r]))
  138. msg = "'Series' object has no attribute 'foo'"
  139. with pytest.raises(AttributeError, match=msg):
  140. df['a'].foo()
  141. def test_can_hold_identifiers(idx):
  142. key = idx[0]
  143. assert idx._can_hold_identifiers_and_holds_name(key) is True
  144. def test_metadata_immutable(idx):
  145. levels, codes = idx.levels, idx.codes
  146. # shouldn't be able to set at either the top level or base level
  147. mutable_regex = re.compile('does not support mutable operations')
  148. with pytest.raises(TypeError, match=mutable_regex):
  149. levels[0] = levels[0]
  150. with pytest.raises(TypeError, match=mutable_regex):
  151. levels[0][0] = levels[0][0]
  152. # ditto for labels
  153. with pytest.raises(TypeError, match=mutable_regex):
  154. codes[0] = codes[0]
  155. with pytest.raises(TypeError, match=mutable_regex):
  156. codes[0][0] = codes[0][0]
  157. # and for names
  158. names = idx.names
  159. with pytest.raises(TypeError, match=mutable_regex):
  160. names[0] = names[0]
  161. def test_level_setting_resets_attributes():
  162. ind = pd.MultiIndex.from_arrays([
  163. ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3]
  164. ])
  165. assert ind.is_monotonic
  166. ind.set_levels([['A', 'B'], [1, 3, 2]], inplace=True)
  167. # if this fails, probably didn't reset the cache correctly.
  168. assert not ind.is_monotonic
  169. def test_rangeindex_fallback_coercion_bug():
  170. # GH 12893
  171. foo = pd.DataFrame(np.arange(100).reshape((10, 10)))
  172. bar = pd.DataFrame(np.arange(100).reshape((10, 10)))
  173. df = pd.concat({'foo': foo.stack(), 'bar': bar.stack()}, axis=1)
  174. df.index.names = ['fizz', 'buzz']
  175. str(df)
  176. expected = pd.DataFrame({'bar': np.arange(100),
  177. 'foo': np.arange(100)},
  178. index=pd.MultiIndex.from_product(
  179. [range(10), range(10)],
  180. names=['fizz', 'buzz']))
  181. tm.assert_frame_equal(df, expected, check_like=True)
  182. result = df.index.get_level_values('fizz')
  183. expected = pd.Int64Index(np.arange(10), name='fizz').repeat(10)
  184. tm.assert_index_equal(result, expected)
  185. result = df.index.get_level_values('buzz')
  186. expected = pd.Int64Index(np.tile(np.arange(10), 10), name='buzz')
  187. tm.assert_index_equal(result, expected)
  188. def test_hash_error(indices):
  189. index = indices
  190. with pytest.raises(TypeError, match=("unhashable type: %r" %
  191. type(index).__name__)):
  192. hash(indices)
  193. def test_mutability(indices):
  194. if not len(indices):
  195. return
  196. pytest.raises(TypeError, indices.__setitem__, 0, indices[0])
  197. def test_wrong_number_names(indices):
  198. with pytest.raises(ValueError, match="^Length"):
  199. indices.names = ["apple", "banana", "carrot"]
  200. def test_memory_usage(idx):
  201. result = idx.memory_usage()
  202. if len(idx):
  203. idx.get_loc(idx[0])
  204. result2 = idx.memory_usage()
  205. result3 = idx.memory_usage(deep=True)
  206. # RangeIndex, IntervalIndex
  207. # don't have engines
  208. if not isinstance(idx, (RangeIndex, IntervalIndex)):
  209. assert result2 > result
  210. if idx.inferred_type == 'object':
  211. assert result3 > result2
  212. else:
  213. # we report 0 for no-length
  214. assert result == 0
  215. def test_nlevels(idx):
  216. assert idx.nlevels == 2