integer.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706
  1. import copy
  2. import sys
  3. import warnings
  4. import numpy as np
  5. from pandas._libs import lib
  6. from pandas.compat import range, set_function_name, string_types
  7. from pandas.util._decorators import cache_readonly
  8. from pandas.core.dtypes.base import ExtensionDtype
  9. from pandas.core.dtypes.cast import astype_nansafe
  10. from pandas.core.dtypes.common import (
  11. is_bool_dtype, is_float, is_float_dtype, is_integer, is_integer_dtype,
  12. is_list_like, is_object_dtype, is_scalar)
  13. from pandas.core.dtypes.dtypes import register_extension_dtype
  14. from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
  15. from pandas.core.dtypes.missing import isna, notna
  16. from pandas.core import nanops
  17. from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
  18. from pandas.core.tools.numeric import to_numeric
  19. class _IntegerDtype(ExtensionDtype):
  20. """
  21. An ExtensionDtype to hold a single size & kind of integer dtype.
  22. These specific implementations are subclasses of the non-public
  23. _IntegerDtype. For example we have Int8Dtype to represnt signed int 8s.
  24. The attributes name & type are set when these subclasses are created.
  25. """
  26. name = None
  27. base = None
  28. type = None
  29. na_value = np.nan
  30. def __repr__(self):
  31. sign = 'U' if self.is_unsigned_integer else ''
  32. return "{sign}Int{size}Dtype()".format(sign=sign,
  33. size=8 * self.itemsize)
  34. @cache_readonly
  35. def is_signed_integer(self):
  36. return self.kind == 'i'
  37. @cache_readonly
  38. def is_unsigned_integer(self):
  39. return self.kind == 'u'
  40. @property
  41. def _is_numeric(self):
  42. return True
  43. @cache_readonly
  44. def numpy_dtype(self):
  45. """ Return an instance of our numpy dtype """
  46. return np.dtype(self.type)
  47. @cache_readonly
  48. def kind(self):
  49. return self.numpy_dtype.kind
  50. @cache_readonly
  51. def itemsize(self):
  52. """ Return the number of bytes in this dtype """
  53. return self.numpy_dtype.itemsize
  54. @classmethod
  55. def construct_array_type(cls):
  56. """Return the array type associated with this dtype
  57. Returns
  58. -------
  59. type
  60. """
  61. return IntegerArray
  62. @classmethod
  63. def construct_from_string(cls, string):
  64. """
  65. Construction from a string, raise a TypeError if not
  66. possible
  67. """
  68. if string == cls.name:
  69. return cls()
  70. raise TypeError("Cannot construct a '{}' from "
  71. "'{}'".format(cls, string))
  72. def integer_array(values, dtype=None, copy=False):
  73. """
  74. Infer and return an integer array of the values.
  75. Parameters
  76. ----------
  77. values : 1D list-like
  78. dtype : dtype, optional
  79. dtype to coerce
  80. copy : boolean, default False
  81. Returns
  82. -------
  83. IntegerArray
  84. Raises
  85. ------
  86. TypeError if incompatible types
  87. """
  88. values, mask = coerce_to_array(values, dtype=dtype, copy=copy)
  89. return IntegerArray(values, mask)
  90. def safe_cast(values, dtype, copy):
  91. """
  92. Safely cast the values to the dtype if they
  93. are equivalent, meaning floats must be equivalent to the
  94. ints.
  95. """
  96. try:
  97. return values.astype(dtype, casting='safe', copy=copy)
  98. except TypeError:
  99. casted = values.astype(dtype, copy=copy)
  100. if (casted == values).all():
  101. return casted
  102. raise TypeError("cannot safely cast non-equivalent {} to {}".format(
  103. values.dtype, np.dtype(dtype)))
  104. def coerce_to_array(values, dtype, mask=None, copy=False):
  105. """
  106. Coerce the input values array to numpy arrays with a mask
  107. Parameters
  108. ----------
  109. values : 1D list-like
  110. dtype : integer dtype
  111. mask : boolean 1D array, optional
  112. copy : boolean, default False
  113. if True, copy the input
  114. Returns
  115. -------
  116. tuple of (values, mask)
  117. """
  118. # if values is integer numpy array, preserve it's dtype
  119. if dtype is None and hasattr(values, 'dtype'):
  120. if is_integer_dtype(values.dtype):
  121. dtype = values.dtype
  122. if dtype is not None:
  123. if (isinstance(dtype, string_types) and
  124. (dtype.startswith("Int") or dtype.startswith("UInt"))):
  125. # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
  126. # https://github.com/numpy/numpy/pull/7476
  127. dtype = dtype.lower()
  128. if not issubclass(type(dtype), _IntegerDtype):
  129. try:
  130. dtype = _dtypes[str(np.dtype(dtype))]
  131. except KeyError:
  132. raise ValueError("invalid dtype specified {}".format(dtype))
  133. if isinstance(values, IntegerArray):
  134. values, mask = values._data, values._mask
  135. if dtype is not None:
  136. values = values.astype(dtype.numpy_dtype, copy=False)
  137. if copy:
  138. values = values.copy()
  139. mask = mask.copy()
  140. return values, mask
  141. values = np.array(values, copy=copy)
  142. if is_object_dtype(values):
  143. inferred_type = lib.infer_dtype(values, skipna=True)
  144. if inferred_type == 'empty':
  145. values = np.empty(len(values))
  146. values.fill(np.nan)
  147. elif inferred_type not in ['floating', 'integer',
  148. 'mixed-integer', 'mixed-integer-float']:
  149. raise TypeError("{} cannot be converted to an IntegerDtype".format(
  150. values.dtype))
  151. elif not (is_integer_dtype(values) or is_float_dtype(values)):
  152. raise TypeError("{} cannot be converted to an IntegerDtype".format(
  153. values.dtype))
  154. if mask is None:
  155. mask = isna(values)
  156. else:
  157. assert len(mask) == len(values)
  158. if not values.ndim == 1:
  159. raise TypeError("values must be a 1D list-like")
  160. if not mask.ndim == 1:
  161. raise TypeError("mask must be a 1D list-like")
  162. # infer dtype if needed
  163. if dtype is None:
  164. dtype = np.dtype('int64')
  165. else:
  166. dtype = dtype.type
  167. # if we are float, let's make sure that we can
  168. # safely cast
  169. # we copy as need to coerce here
  170. if mask.any():
  171. values = values.copy()
  172. values[mask] = 1
  173. values = safe_cast(values, dtype, copy=False)
  174. else:
  175. values = safe_cast(values, dtype, copy=False)
  176. return values, mask
  177. class IntegerArray(ExtensionArray, ExtensionOpsMixin):
  178. """
  179. Array of integer (optional missing) values.
  180. .. versionadded:: 0.24.0
  181. .. warning::
  182. IntegerArray is currently experimental, and its API or internal
  183. implementation may change without warning.
  184. We represent an IntegerArray with 2 numpy arrays:
  185. - data: contains a numpy integer array of the appropriate dtype
  186. - mask: a boolean array holding a mask on the data, True is missing
  187. To construct an IntegerArray from generic array-like input, use
  188. :func:`pandas.array` with one of the integer dtypes (see examples).
  189. See :ref:`integer_na` for more.
  190. Parameters
  191. ----------
  192. values : numpy.ndarray
  193. A 1-d integer-dtype array.
  194. mask : numpy.ndarray
  195. A 1-d boolean-dtype array indicating missing values.
  196. copy : bool, default False
  197. Whether to copy the `values` and `mask`.
  198. Returns
  199. -------
  200. IntegerArray
  201. Examples
  202. --------
  203. Create an IntegerArray with :func:`pandas.array`.
  204. >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
  205. >>> int_array
  206. <IntegerArray>
  207. [1, NaN, 3]
  208. Length: 3, dtype: Int32
  209. String aliases for the dtypes are also available. They are capitalized.
  210. >>> pd.array([1, None, 3], dtype='Int32')
  211. <IntegerArray>
  212. [1, NaN, 3]
  213. Length: 3, dtype: Int32
  214. >>> pd.array([1, None, 3], dtype='UInt16')
  215. <IntegerArray>
  216. [1, NaN, 3]
  217. Length: 3, dtype: UInt16
  218. """
  219. @cache_readonly
  220. def dtype(self):
  221. return _dtypes[str(self._data.dtype)]
  222. def __init__(self, values, mask, copy=False):
  223. if not (isinstance(values, np.ndarray)
  224. and is_integer_dtype(values.dtype)):
  225. raise TypeError("values should be integer numpy array. Use "
  226. "the 'integer_array' function instead")
  227. if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)):
  228. raise TypeError("mask should be boolean numpy array. Use "
  229. "the 'integer_array' function instead")
  230. if copy:
  231. values = values.copy()
  232. mask = mask.copy()
  233. self._data = values
  234. self._mask = mask
  235. @classmethod
  236. def _from_sequence(cls, scalars, dtype=None, copy=False):
  237. return integer_array(scalars, dtype=dtype, copy=copy)
  238. @classmethod
  239. def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
  240. scalars = to_numeric(strings, errors="raise")
  241. return cls._from_sequence(scalars, dtype, copy)
  242. @classmethod
  243. def _from_factorized(cls, values, original):
  244. return integer_array(values, dtype=original.dtype)
  245. def _formatter(self, boxed=False):
  246. def fmt(x):
  247. if isna(x):
  248. return 'NaN'
  249. return str(x)
  250. return fmt
  251. def __getitem__(self, item):
  252. if is_integer(item):
  253. if self._mask[item]:
  254. return self.dtype.na_value
  255. return self._data[item]
  256. return type(self)(self._data[item], self._mask[item])
  257. def _coerce_to_ndarray(self):
  258. """
  259. coerce to an ndarary of object dtype
  260. """
  261. # TODO(jreback) make this better
  262. data = self._data.astype(object)
  263. data[self._mask] = self._na_value
  264. return data
  265. __array_priority__ = 1000 # higher than ndarray so ops dispatch to us
  266. def __array__(self, dtype=None):
  267. """
  268. the array interface, return my values
  269. We return an object array here to preserve our scalar values
  270. """
  271. return self._coerce_to_ndarray()
  272. def __iter__(self):
  273. for i in range(len(self)):
  274. if self._mask[i]:
  275. yield self.dtype.na_value
  276. else:
  277. yield self._data[i]
  278. def take(self, indexer, allow_fill=False, fill_value=None):
  279. from pandas.api.extensions import take
  280. # we always fill with 1 internally
  281. # to avoid upcasting
  282. data_fill_value = 1 if isna(fill_value) else fill_value
  283. result = take(self._data, indexer, fill_value=data_fill_value,
  284. allow_fill=allow_fill)
  285. mask = take(self._mask, indexer, fill_value=True,
  286. allow_fill=allow_fill)
  287. # if we are filling
  288. # we only fill where the indexer is null
  289. # not existing missing values
  290. # TODO(jreback) what if we have a non-na float as a fill value?
  291. if allow_fill and notna(fill_value):
  292. fill_mask = np.asarray(indexer) == -1
  293. result[fill_mask] = fill_value
  294. mask = mask ^ fill_mask
  295. return type(self)(result, mask, copy=False)
  296. def copy(self, deep=False):
  297. data, mask = self._data, self._mask
  298. if deep:
  299. data = copy.deepcopy(data)
  300. mask = copy.deepcopy(mask)
  301. else:
  302. data = data.copy()
  303. mask = mask.copy()
  304. return type(self)(data, mask, copy=False)
  305. def __setitem__(self, key, value):
  306. _is_scalar = is_scalar(value)
  307. if _is_scalar:
  308. value = [value]
  309. value, mask = coerce_to_array(value, dtype=self.dtype)
  310. if _is_scalar:
  311. value = value[0]
  312. mask = mask[0]
  313. self._data[key] = value
  314. self._mask[key] = mask
  315. def __len__(self):
  316. return len(self._data)
  317. @property
  318. def nbytes(self):
  319. return self._data.nbytes + self._mask.nbytes
  320. def isna(self):
  321. return self._mask
  322. @property
  323. def _na_value(self):
  324. return np.nan
  325. @classmethod
  326. def _concat_same_type(cls, to_concat):
  327. data = np.concatenate([x._data for x in to_concat])
  328. mask = np.concatenate([x._mask for x in to_concat])
  329. return cls(data, mask)
  330. def astype(self, dtype, copy=True):
  331. """
  332. Cast to a NumPy array or IntegerArray with 'dtype'.
  333. Parameters
  334. ----------
  335. dtype : str or dtype
  336. Typecode or data-type to which the array is cast.
  337. copy : bool, default True
  338. Whether to copy the data, even if not necessary. If False,
  339. a copy is made only if the old dtype does not match the
  340. new dtype.
  341. Returns
  342. -------
  343. array : ndarray or IntegerArray
  344. NumPy ndarray or IntergerArray with 'dtype' for its dtype.
  345. Raises
  346. ------
  347. TypeError
  348. if incompatible type with an IntegerDtype, equivalent of same_kind
  349. casting
  350. """
  351. # if we are astyping to an existing IntegerDtype we can fastpath
  352. if isinstance(dtype, _IntegerDtype):
  353. result = self._data.astype(dtype.numpy_dtype, copy=False)
  354. return type(self)(result, mask=self._mask, copy=False)
  355. # coerce
  356. data = self._coerce_to_ndarray()
  357. return astype_nansafe(data, dtype, copy=None)
  358. @property
  359. def _ndarray_values(self):
  360. # type: () -> np.ndarray
  361. """Internal pandas method for lossy conversion to a NumPy ndarray.
  362. This method is not part of the pandas interface.
  363. The expectation is that this is cheap to compute, and is primarily
  364. used for interacting with our indexers.
  365. """
  366. return self._data
  367. def value_counts(self, dropna=True):
  368. """
  369. Returns a Series containing counts of each category.
  370. Every category will have an entry, even those with a count of 0.
  371. Parameters
  372. ----------
  373. dropna : boolean, default True
  374. Don't include counts of NaN.
  375. Returns
  376. -------
  377. counts : Series
  378. See Also
  379. --------
  380. Series.value_counts
  381. """
  382. from pandas import Index, Series
  383. # compute counts on the data with no nans
  384. data = self._data[~self._mask]
  385. value_counts = Index(data).value_counts()
  386. array = value_counts.values
  387. # TODO(extension)
  388. # if we have allow Index to hold an ExtensionArray
  389. # this is easier
  390. index = value_counts.index.astype(object)
  391. # if we want nans, count the mask
  392. if not dropna:
  393. # TODO(extension)
  394. # appending to an Index *always* infers
  395. # w/o passing the dtype
  396. array = np.append(array, [self._mask.sum()])
  397. index = Index(np.concatenate(
  398. [index.values,
  399. np.array([np.nan], dtype=object)]), dtype=object)
  400. return Series(array, index=index)
  401. def _values_for_argsort(self):
  402. # type: () -> ndarray
  403. """Return values for sorting.
  404. Returns
  405. -------
  406. ndarray
  407. The transformed values should maintain the ordering between values
  408. within the array.
  409. See Also
  410. --------
  411. ExtensionArray.argsort
  412. """
  413. data = self._data.copy()
  414. data[self._mask] = data.min() - 1
  415. return data
  416. @classmethod
  417. def _create_comparison_method(cls, op):
  418. def cmp_method(self, other):
  419. op_name = op.__name__
  420. mask = None
  421. if isinstance(other, (ABCSeries, ABCIndexClass)):
  422. # Rely on pandas to unbox and dispatch to us.
  423. return NotImplemented
  424. if isinstance(other, IntegerArray):
  425. other, mask = other._data, other._mask
  426. elif is_list_like(other):
  427. other = np.asarray(other)
  428. if other.ndim > 0 and len(self) != len(other):
  429. raise ValueError('Lengths must match to compare')
  430. other = lib.item_from_zerodim(other)
  431. # numpy will show a DeprecationWarning on invalid elementwise
  432. # comparisons, this will raise in the future
  433. with warnings.catch_warnings():
  434. warnings.filterwarnings("ignore", "elementwise", FutureWarning)
  435. with np.errstate(all='ignore'):
  436. result = op(self._data, other)
  437. # nans propagate
  438. if mask is None:
  439. mask = self._mask
  440. else:
  441. mask = self._mask | mask
  442. result[mask] = True if op_name == 'ne' else False
  443. return result
  444. name = '__{name}__'.format(name=op.__name__)
  445. return set_function_name(cmp_method, name, cls)
  446. def _reduce(self, name, skipna=True, **kwargs):
  447. data = self._data
  448. mask = self._mask
  449. # coerce to a nan-aware float if needed
  450. if mask.any():
  451. data = self._data.astype('float64')
  452. data[mask] = self._na_value
  453. op = getattr(nanops, 'nan' + name)
  454. result = op(data, axis=0, skipna=skipna, mask=mask)
  455. # if we have a boolean op, don't coerce
  456. if name in ['any', 'all']:
  457. pass
  458. # if we have a preservable numeric op,
  459. # provide coercion back to an integer type if possible
  460. elif name in ['sum', 'min', 'max', 'prod'] and notna(result):
  461. int_result = int(result)
  462. if int_result == result:
  463. result = int_result
  464. return result
  465. def _maybe_mask_result(self, result, mask, other, op_name):
  466. """
  467. Parameters
  468. ----------
  469. result : array-like
  470. mask : array-like bool
  471. other : scalar or array-like
  472. op_name : str
  473. """
  474. # may need to fill infs
  475. # and mask wraparound
  476. if is_float_dtype(result):
  477. mask |= (result == np.inf) | (result == -np.inf)
  478. # if we have a float operand we are by-definition
  479. # a float result
  480. # or our op is a divide
  481. if ((is_float_dtype(other) or is_float(other)) or
  482. (op_name in ['rtruediv', 'truediv', 'rdiv', 'div'])):
  483. result[mask] = np.nan
  484. return result
  485. return type(self)(result, mask, copy=False)
  486. @classmethod
  487. def _create_arithmetic_method(cls, op):
  488. def integer_arithmetic_method(self, other):
  489. op_name = op.__name__
  490. mask = None
  491. if isinstance(other, (ABCSeries, ABCIndexClass)):
  492. # Rely on pandas to unbox and dispatch to us.
  493. return NotImplemented
  494. if getattr(other, 'ndim', 0) > 1:
  495. raise NotImplementedError(
  496. "can only perform ops with 1-d structures")
  497. if isinstance(other, IntegerArray):
  498. other, mask = other._data, other._mask
  499. elif getattr(other, 'ndim', None) == 0:
  500. other = other.item()
  501. elif is_list_like(other):
  502. other = np.asarray(other)
  503. if not other.ndim:
  504. other = other.item()
  505. elif other.ndim == 1:
  506. if not (is_float_dtype(other) or is_integer_dtype(other)):
  507. raise TypeError(
  508. "can only perform ops with numeric values")
  509. else:
  510. if not (is_float(other) or is_integer(other)):
  511. raise TypeError("can only perform ops with numeric values")
  512. # nans propagate
  513. if mask is None:
  514. mask = self._mask
  515. else:
  516. mask = self._mask | mask
  517. # 1 ** np.nan is 1. So we have to unmask those.
  518. if op_name == 'pow':
  519. mask = np.where(self == 1, False, mask)
  520. elif op_name == 'rpow':
  521. mask = np.where(other == 1, False, mask)
  522. with np.errstate(all='ignore'):
  523. result = op(self._data, other)
  524. # divmod returns a tuple
  525. if op_name == 'divmod':
  526. div, mod = result
  527. return (self._maybe_mask_result(div, mask, other, 'floordiv'),
  528. self._maybe_mask_result(mod, mask, other, 'mod'))
  529. return self._maybe_mask_result(result, mask, other, op_name)
  530. name = '__{name}__'.format(name=op.__name__)
  531. return set_function_name(integer_arithmetic_method, name, cls)
  532. IntegerArray._add_arithmetic_ops()
  533. IntegerArray._add_comparison_ops()
  534. module = sys.modules[__name__]
  535. # create the Dtype
  536. _dtypes = {}
  537. for dtype in ['int8', 'int16', 'int32', 'int64',
  538. 'uint8', 'uint16', 'uint32', 'uint64']:
  539. if dtype.startswith('u'):
  540. name = "U{}".format(dtype[1:].capitalize())
  541. else:
  542. name = dtype.capitalize()
  543. classname = "{}Dtype".format(name)
  544. numpy_dtype = getattr(np, dtype)
  545. attributes_dict = {'type': numpy_dtype,
  546. 'name': name}
  547. dtype_type = register_extension_dtype(
  548. type(classname, (_IntegerDtype, ), attributes_dict)
  549. )
  550. setattr(module, classname, dtype_type)
  551. _dtypes[dtype] = dtype_type()