test_hashing.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328
  1. import datetime
  2. import numpy as np
  3. import pytest
  4. import pandas as pd
  5. from pandas import DataFrame, Index, MultiIndex, Series
  6. from pandas.core.util.hashing import _hash_scalar, hash_tuple, hash_tuples
  7. from pandas.util import hash_array, hash_pandas_object
  8. import pandas.util.testing as tm
  9. @pytest.fixture(params=[
  10. Series([1, 2, 3] * 3, dtype="int32"),
  11. Series([None, 2.5, 3.5] * 3, dtype="float32"),
  12. Series(["a", "b", "c"] * 3, dtype="category"),
  13. Series(["d", "e", "f"] * 3),
  14. Series([True, False, True] * 3),
  15. Series(pd.date_range("20130101", periods=9)),
  16. Series(pd.date_range("20130101", periods=9, tz="US/Eastern")),
  17. Series(pd.timedelta_range("2000", periods=9))])
  18. def series(request):
  19. return request.param
  20. @pytest.fixture(params=[True, False])
  21. def index(request):
  22. return request.param
  23. def _check_equal(obj, **kwargs):
  24. """
  25. Check that hashing an objects produces the same value each time.
  26. Parameters
  27. ----------
  28. obj : object
  29. The object to hash.
  30. kwargs : kwargs
  31. Keyword arguments to pass to the hashing function.
  32. """
  33. a = hash_pandas_object(obj, **kwargs)
  34. b = hash_pandas_object(obj, **kwargs)
  35. tm.assert_series_equal(a, b)
  36. def _check_not_equal_with_index(obj):
  37. """
  38. Check the hash of an object with and without its index is not the same.
  39. Parameters
  40. ----------
  41. obj : object
  42. The object to hash.
  43. """
  44. if not isinstance(obj, Index):
  45. a = hash_pandas_object(obj, index=True)
  46. b = hash_pandas_object(obj, index=False)
  47. if len(obj):
  48. assert not (a == b).all()
  49. def test_consistency():
  50. # Check that our hash doesn't change because of a mistake
  51. # in the actual code; this is the ground truth.
  52. result = hash_pandas_object(Index(["foo", "bar", "baz"]))
  53. expected = Series(np.array([3600424527151052760, 1374399572096150070,
  54. 477881037637427054], dtype="uint64"),
  55. index=["foo", "bar", "baz"])
  56. tm.assert_series_equal(result, expected)
  57. def test_hash_array(series):
  58. arr = series.values
  59. tm.assert_numpy_array_equal(hash_array(arr), hash_array(arr))
  60. @pytest.mark.parametrize("arr2", [
  61. np.array([3, 4, "All"]),
  62. np.array([3, 4, "All"], dtype=object),
  63. ])
  64. def test_hash_array_mixed(arr2):
  65. result1 = hash_array(np.array(["3", "4", "All"]))
  66. result2 = hash_array(arr2)
  67. tm.assert_numpy_array_equal(result1, result2)
  68. @pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")])
  69. def test_hash_array_errors(val):
  70. msg = "must pass a ndarray-like"
  71. with pytest.raises(TypeError, match=msg):
  72. hash_array(val)
  73. def test_hash_tuples():
  74. tuples = [(1, "one"), (1, "two"), (2, "one")]
  75. result = hash_tuples(tuples)
  76. expected = hash_pandas_object(MultiIndex.from_tuples(tuples)).values
  77. tm.assert_numpy_array_equal(result, expected)
  78. result = hash_tuples(tuples[0])
  79. assert result == expected[0]
  80. @pytest.mark.parametrize("tup", [
  81. (1, "one"), (1, np.nan), (1.0, pd.NaT, "A"),
  82. ("A", pd.Timestamp("2012-01-01"))])
  83. def test_hash_tuple(tup):
  84. # Test equivalence between
  85. # hash_tuples and hash_tuple.
  86. result = hash_tuple(tup)
  87. expected = hash_tuples([tup])[0]
  88. assert result == expected
  89. @pytest.mark.parametrize("val", [
  90. 1, 1.4, "A", b"A", u"A", pd.Timestamp("2012-01-01"),
  91. pd.Timestamp("2012-01-01", tz="Europe/Brussels"),
  92. datetime.datetime(2012, 1, 1),
  93. pd.Timestamp("2012-01-01", tz="EST").to_pydatetime(),
  94. pd.Timedelta("1 days"), datetime.timedelta(1),
  95. pd.Period("2012-01-01", freq="D"), pd.Interval(0, 1),
  96. np.nan, pd.NaT, None])
  97. def test_hash_scalar(val):
  98. result = _hash_scalar(val)
  99. expected = hash_array(np.array([val], dtype=object), categorize=True)
  100. assert result[0] == expected[0]
  101. @pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")])
  102. def test_hash_tuples_err(val):
  103. msg = "must be convertible to a list-of-tuples"
  104. with pytest.raises(TypeError, match=msg):
  105. hash_tuples(val)
  106. def test_multiindex_unique():
  107. mi = MultiIndex.from_tuples([(118, 472), (236, 118),
  108. (51, 204), (102, 51)])
  109. assert mi.is_unique is True
  110. result = hash_pandas_object(mi)
  111. assert result.is_unique is True
  112. def test_multiindex_objects():
  113. mi = MultiIndex(levels=[["b", "d", "a"], [1, 2, 3]],
  114. codes=[[0, 1, 0, 2], [2, 0, 0, 1]],
  115. names=["col1", "col2"])
  116. recons = mi._sort_levels_monotonic()
  117. # These are equal.
  118. assert mi.equals(recons)
  119. assert Index(mi.values).equals(Index(recons.values))
  120. # _hashed_values and hash_pandas_object(..., index=False) equivalency.
  121. expected = hash_pandas_object(mi, index=False).values
  122. result = mi._hashed_values
  123. tm.assert_numpy_array_equal(result, expected)
  124. expected = hash_pandas_object(recons, index=False).values
  125. result = recons._hashed_values
  126. tm.assert_numpy_array_equal(result, expected)
  127. expected = mi._hashed_values
  128. result = recons._hashed_values
  129. # Values should match, but in different order.
  130. tm.assert_numpy_array_equal(np.sort(result), np.sort(expected))
  131. @pytest.mark.parametrize("obj", [
  132. Series([1, 2, 3]),
  133. Series([1.0, 1.5, 3.2]),
  134. Series([1.0, 1.5, np.nan]),
  135. Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
  136. Series(["a", "b", "c"]),
  137. Series(["a", np.nan, "c"]),
  138. Series(["a", None, "c"]),
  139. Series([True, False, True]),
  140. Series(),
  141. Index([1, 2, 3]),
  142. Index([True, False, True]),
  143. DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}),
  144. DataFrame(),
  145. tm.makeMissingDataframe(),
  146. tm.makeMixedDataFrame(),
  147. tm.makeTimeDataFrame(),
  148. tm.makeTimeSeries(),
  149. tm.makeTimedeltaIndex(),
  150. tm.makePeriodIndex(),
  151. Series(tm.makePeriodIndex()),
  152. Series(pd.date_range("20130101", periods=3, tz="US/Eastern")),
  153. MultiIndex.from_product([range(5), ["foo", "bar", "baz"],
  154. pd.date_range("20130101", periods=2)]),
  155. MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)])
  156. ])
  157. def test_hash_pandas_object(obj, index):
  158. _check_equal(obj, index=index)
  159. _check_not_equal_with_index(obj)
  160. def test_hash_pandas_object2(series, index):
  161. _check_equal(series, index=index)
  162. _check_not_equal_with_index(series)
  163. @pytest.mark.parametrize("obj", [
  164. Series([], dtype="float64"), Series([], dtype="object"), Index([])])
  165. def test_hash_pandas_empty_object(obj, index):
  166. # These are by-definition the same with
  167. # or without the index as the data is empty.
  168. _check_equal(obj, index=index)
  169. @pytest.mark.parametrize("s1", [
  170. Series(["a", "b", "c", "d"]),
  171. Series([1000, 2000, 3000, 4000]),
  172. Series(pd.date_range(0, periods=4))])
  173. @pytest.mark.parametrize("categorize", [True, False])
  174. def test_categorical_consistency(s1, categorize):
  175. # see gh-15143
  176. #
  177. # Check that categoricals hash consistent with their values,
  178. # not codes. This should work for categoricals of any dtype.
  179. s2 = s1.astype("category").cat.set_categories(s1)
  180. s3 = s2.cat.set_categories(list(reversed(s1)))
  181. # These should all hash identically.
  182. h1 = hash_pandas_object(s1, categorize=categorize)
  183. h2 = hash_pandas_object(s2, categorize=categorize)
  184. h3 = hash_pandas_object(s3, categorize=categorize)
  185. tm.assert_series_equal(h1, h2)
  186. tm.assert_series_equal(h1, h3)
  187. def test_categorical_with_nan_consistency():
  188. c = pd.Categorical.from_codes(
  189. [-1, 0, 1, 2, 3, 4],
  190. categories=pd.date_range("2012-01-01", periods=5, name="B"))
  191. expected = hash_array(c, categorize=False)
  192. c = pd.Categorical.from_codes(
  193. [-1, 0],
  194. categories=[pd.Timestamp("2012-01-01")])
  195. result = hash_array(c, categorize=False)
  196. assert result[0] in expected
  197. assert result[1] in expected
  198. @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
  199. @pytest.mark.parametrize("obj", [pd.Timestamp("20130101"), tm.makePanel()])
  200. def test_pandas_errors(obj):
  201. msg = "Unexpected type for hashing"
  202. with pytest.raises(TypeError, match=msg):
  203. hash_pandas_object(obj)
  204. def test_hash_keys():
  205. # Using different hash keys, should have
  206. # different hashes for the same data.
  207. #
  208. # This only matters for object dtypes.
  209. obj = Series(list("abc"))
  210. a = hash_pandas_object(obj, hash_key="9876543210123456")
  211. b = hash_pandas_object(obj, hash_key="9876543210123465")
  212. assert (a != b).all()
  213. def test_invalid_key():
  214. # This only matters for object dtypes.
  215. msg = "key should be a 16-byte string encoded"
  216. with pytest.raises(ValueError, match=msg):
  217. hash_pandas_object(Series(list("abc")), hash_key="foo")
  218. def test_already_encoded(index):
  219. # If already encoded, then ok.
  220. obj = Series(list("abc")).str.encode("utf8")
  221. _check_equal(obj, index=index)
  222. def test_alternate_encoding(index):
  223. obj = Series(list("abc"))
  224. _check_equal(obj, index=index, encoding="ascii")
  225. @pytest.mark.parametrize("l_exp", range(8))
  226. @pytest.mark.parametrize("l_add", [0, 1])
  227. def test_same_len_hash_collisions(l_exp, l_add):
  228. length = 2**(l_exp + 8) + l_add
  229. s = tm.rands_array(length, 2)
  230. result = hash_array(s, "utf8")
  231. assert not result[0] == result[1]
  232. def test_hash_collisions():
  233. # Hash collisions are bad.
  234. #
  235. # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
  236. hashes = ["Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9", # noqa
  237. "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe"] # noqa
  238. # These should be different.
  239. result1 = hash_array(np.asarray(hashes[0:1], dtype=object), "utf8")
  240. expected1 = np.array([14963968704024874985], dtype=np.uint64)
  241. tm.assert_numpy_array_equal(result1, expected1)
  242. result2 = hash_array(np.asarray(hashes[1:2], dtype=object), "utf8")
  243. expected2 = np.array([16428432627716348016], dtype=np.uint64)
  244. tm.assert_numpy_array_equal(result2, expected2)
  245. result = hash_array(np.asarray(hashes, dtype=object), "utf8")
  246. tm.assert_numpy_array_equal(result, np.concatenate([expected1,
  247. expected2], axis=0))