hashing.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. """
  2. data hash pandas / numpy objects
  3. """
  4. import itertools
  5. import numpy as np
  6. from pandas._libs import hashing, tslibs
  7. from pandas.core.dtypes.cast import infer_dtype_from_scalar
  8. from pandas.core.dtypes.common import (
  9. is_categorical_dtype, is_extension_array_dtype, is_list_like)
  10. from pandas.core.dtypes.generic import (
  11. ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCSeries)
  12. from pandas.core.dtypes.missing import isna
# Default hashing key: must be exactly 16 bytes long; used by the string
# hashing routines whenever the caller does not supply a hash_key.
_default_hash_key = '0123456789123456'
  15. def _combine_hash_arrays(arrays, num_items):
  16. """
  17. Parameters
  18. ----------
  19. arrays : generator
  20. num_items : int
  21. Should be the same as CPython's tupleobject.c
  22. """
  23. try:
  24. first = next(arrays)
  25. except StopIteration:
  26. return np.array([], dtype=np.uint64)
  27. arrays = itertools.chain([first], arrays)
  28. mult = np.uint64(1000003)
  29. out = np.zeros_like(first) + np.uint64(0x345678)
  30. for i, a in enumerate(arrays):
  31. inverse_i = num_items - i
  32. out ^= a
  33. out *= mult
  34. mult += np.uint64(82520 + inverse_i + inverse_i)
  35. assert i + 1 == num_items, 'Fed in wrong num_items'
  36. out += np.uint64(97531)
  37. return out
  38. def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
  39. categorize=True):
  40. """
  41. Return a data hash of the Index/Series/DataFrame
  42. .. versionadded:: 0.19.2
  43. Parameters
  44. ----------
  45. index : boolean, default True
  46. include the index in the hash (if Series/DataFrame)
  47. encoding : string, default 'utf8'
  48. encoding for data & key when strings
  49. hash_key : string key to encode, default to _default_hash_key
  50. categorize : bool, default True
  51. Whether to first categorize object arrays before hashing. This is more
  52. efficient when the array contains duplicate values.
  53. .. versionadded:: 0.20.0
  54. Returns
  55. -------
  56. Series of uint64, same length as the object
  57. """
  58. from pandas import Series
  59. if hash_key is None:
  60. hash_key = _default_hash_key
  61. if isinstance(obj, ABCMultiIndex):
  62. return Series(hash_tuples(obj, encoding, hash_key),
  63. dtype='uint64', copy=False)
  64. if isinstance(obj, ABCIndexClass):
  65. h = hash_array(obj.values, encoding, hash_key,
  66. categorize).astype('uint64', copy=False)
  67. h = Series(h, index=obj, dtype='uint64', copy=False)
  68. elif isinstance(obj, ABCSeries):
  69. h = hash_array(obj.values, encoding, hash_key,
  70. categorize).astype('uint64', copy=False)
  71. if index:
  72. index_iter = (hash_pandas_object(obj.index,
  73. index=False,
  74. encoding=encoding,
  75. hash_key=hash_key,
  76. categorize=categorize).values
  77. for _ in [None])
  78. arrays = itertools.chain([h], index_iter)
  79. h = _combine_hash_arrays(arrays, 2)
  80. h = Series(h, index=obj.index, dtype='uint64', copy=False)
  81. elif isinstance(obj, ABCDataFrame):
  82. hashes = (hash_array(series.values) for _, series in obj.iteritems())
  83. num_items = len(obj.columns)
  84. if index:
  85. index_hash_generator = (hash_pandas_object(obj.index,
  86. index=False,
  87. encoding=encoding,
  88. hash_key=hash_key,
  89. categorize=categorize).values # noqa
  90. for _ in [None])
  91. num_items += 1
  92. hashes = itertools.chain(hashes, index_hash_generator)
  93. h = _combine_hash_arrays(hashes, num_items)
  94. h = Series(h, index=obj.index, dtype='uint64', copy=False)
  95. else:
  96. raise TypeError("Unexpected type for hashing %s" % type(obj))
  97. return h
  98. def hash_tuples(vals, encoding='utf8', hash_key=None):
  99. """
  100. Hash an MultiIndex / list-of-tuples efficiently
  101. .. versionadded:: 0.20.0
  102. Parameters
  103. ----------
  104. vals : MultiIndex, list-of-tuples, or single tuple
  105. encoding : string, default 'utf8'
  106. hash_key : string key to encode, default to _default_hash_key
  107. Returns
  108. -------
  109. ndarray of hashed values array
  110. """
  111. is_tuple = False
  112. if isinstance(vals, tuple):
  113. vals = [vals]
  114. is_tuple = True
  115. elif not is_list_like(vals):
  116. raise TypeError("must be convertible to a list-of-tuples")
  117. from pandas import Categorical, MultiIndex
  118. if not isinstance(vals, ABCMultiIndex):
  119. vals = MultiIndex.from_tuples(vals)
  120. # create a list-of-Categoricals
  121. vals = [Categorical(vals.codes[level],
  122. vals.levels[level],
  123. ordered=False,
  124. fastpath=True)
  125. for level in range(vals.nlevels)]
  126. # hash the list-of-ndarrays
  127. hashes = (_hash_categorical(cat,
  128. encoding=encoding,
  129. hash_key=hash_key)
  130. for cat in vals)
  131. h = _combine_hash_arrays(hashes, len(vals))
  132. if is_tuple:
  133. h = h[0]
  134. return h
  135. def hash_tuple(val, encoding='utf8', hash_key=None):
  136. """
  137. Hash a single tuple efficiently
  138. Parameters
  139. ----------
  140. val : single tuple
  141. encoding : string, default 'utf8'
  142. hash_key : string key to encode, default to _default_hash_key
  143. Returns
  144. -------
  145. hash
  146. """
  147. hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key)
  148. for v in val)
  149. h = _combine_hash_arrays(hashes, len(val))[0]
  150. return h
  151. def _hash_categorical(c, encoding, hash_key):
  152. """
  153. Hash a Categorical by hashing its categories, and then mapping the codes
  154. to the hashes
  155. Parameters
  156. ----------
  157. c : Categorical
  158. encoding : string, default 'utf8'
  159. hash_key : string key to encode, default to _default_hash_key
  160. Returns
  161. -------
  162. ndarray of hashed values array, same size as len(c)
  163. """
  164. # Convert ExtensionArrays to ndarrays
  165. values = np.asarray(c.categories.values)
  166. hashed = hash_array(values, encoding, hash_key,
  167. categorize=False)
  168. # we have uint64, as we don't directly support missing values
  169. # we don't want to use take_nd which will coerce to float
  170. # instead, directly construct the result with a
  171. # max(np.uint64) as the missing value indicator
  172. #
  173. # TODO: GH 15362
  174. mask = c.isna()
  175. if len(hashed):
  176. result = hashed.take(c.codes)
  177. else:
  178. result = np.zeros(len(mask), dtype='uint64')
  179. if mask.any():
  180. result[mask] = np.iinfo(np.uint64).max
  181. return result
  182. def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
  183. """
  184. Given a 1d array, return an array of deterministic integers.
  185. .. versionadded:: 0.19.2
  186. Parameters
  187. ----------
  188. vals : ndarray, Categorical
  189. encoding : string, default 'utf8'
  190. encoding for data & key when strings
  191. hash_key : string key to encode, default to _default_hash_key
  192. categorize : bool, default True
  193. Whether to first categorize object arrays before hashing. This is more
  194. efficient when the array contains duplicate values.
  195. .. versionadded:: 0.20.0
  196. Returns
  197. -------
  198. 1d uint64 numpy array of hash values, same length as the vals
  199. """
  200. if not hasattr(vals, 'dtype'):
  201. raise TypeError("must pass a ndarray-like")
  202. dtype = vals.dtype
  203. if hash_key is None:
  204. hash_key = _default_hash_key
  205. # For categoricals, we hash the categories, then remap the codes to the
  206. # hash values. (This check is above the complex check so that we don't ask
  207. # numpy if categorical is a subdtype of complex, as it will choke).
  208. if is_categorical_dtype(dtype):
  209. return _hash_categorical(vals, encoding, hash_key)
  210. elif is_extension_array_dtype(dtype):
  211. vals, _ = vals._values_for_factorize()
  212. dtype = vals.dtype
  213. # we'll be working with everything as 64-bit values, so handle this
  214. # 128-bit value early
  215. if np.issubdtype(dtype, np.complex128):
  216. return hash_array(vals.real) + 23 * hash_array(vals.imag)
  217. # First, turn whatever array this is into unsigned 64-bit ints, if we can
  218. # manage it.
  219. elif isinstance(dtype, np.bool):
  220. vals = vals.astype('u8')
  221. elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
  222. vals = vals.view('i8').astype('u8', copy=False)
  223. elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
  224. vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
  225. else:
  226. # With repeated values, its MUCH faster to categorize object dtypes,
  227. # then hash and rename categories. We allow skipping the categorization
  228. # when the values are known/likely to be unique.
  229. if categorize:
  230. from pandas import factorize, Categorical, Index
  231. codes, categories = factorize(vals, sort=False)
  232. cat = Categorical(codes, Index(categories),
  233. ordered=False, fastpath=True)
  234. return _hash_categorical(cat, encoding, hash_key)
  235. try:
  236. vals = hashing.hash_object_array(vals, hash_key, encoding)
  237. except TypeError:
  238. # we have mixed types
  239. vals = hashing.hash_object_array(vals.astype(str).astype(object),
  240. hash_key, encoding)
  241. # Then, redistribute these 64-bit ints within the space of 64-bit ints
  242. vals ^= vals >> 30
  243. vals *= np.uint64(0xbf58476d1ce4e5b9)
  244. vals ^= vals >> 27
  245. vals *= np.uint64(0x94d049bb133111eb)
  246. vals ^= vals >> 31
  247. return vals
  248. def _hash_scalar(val, encoding='utf8', hash_key=None):
  249. """
  250. Hash scalar value
  251. Returns
  252. -------
  253. 1d uint64 numpy array of hash value, of length 1
  254. """
  255. if isna(val):
  256. # this is to be consistent with the _hash_categorical implementation
  257. return np.array([np.iinfo(np.uint64).max], dtype='u8')
  258. if getattr(val, 'tzinfo', None) is not None:
  259. # for tz-aware datetimes, we need the underlying naive UTC value and
  260. # not the tz aware object or pd extension type (as
  261. # infer_dtype_from_scalar would do)
  262. if not isinstance(val, tslibs.Timestamp):
  263. val = tslibs.Timestamp(val)
  264. val = val.tz_convert(None)
  265. dtype, val = infer_dtype_from_scalar(val)
  266. vals = np.array([val], dtype=dtype)
  267. return hash_array(vals, hash_key=hash_key, encoding=encoding,
  268. categorize=False)