base.py 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120
  1. """An interface for extending pandas with custom arrays.
  2. .. warning::
  3. This is an experimental API and subject to breaking changes
  4. without warning.
  5. """
  6. import operator
  7. import numpy as np
  8. from pandas.compat import PY3, set_function_name
  9. from pandas.compat.numpy import function as nv
  10. from pandas.errors import AbstractMethodError
  11. from pandas.util._decorators import Appender, Substitution
  12. from pandas.core.dtypes.common import is_list_like
  13. from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
  14. from pandas.core.dtypes.missing import isna
  15. from pandas.core import ops
  # Error-message template for optional operations a subclass has not
  # provided; formatted with the array type and the missing method's name.
  _not_implemented_message = "{} does not implement {}."

  # Docstring templates, keyed by method name, that are attached to methods
  # through the ``Appender`` / ``Substitution`` decorators.
  _extension_array_shared_docs = {}
  18. class ExtensionArray(object):
  19. """
  20. Abstract base class for custom 1-D array types.
  21. pandas will recognize instances of this class as proper arrays
  22. with a custom type and will not attempt to coerce them to objects. They
  23. may be stored directly inside a :class:`DataFrame` or :class:`Series`.
  24. .. versionadded:: 0.23.0
  25. Notes
  26. -----
  27. The interface includes the following abstract methods that must be
  28. implemented by subclasses:
  29. * _from_sequence
  30. * _from_factorized
  31. * __getitem__
  32. * __len__
  33. * dtype
  34. * nbytes
  35. * isna
  36. * take
  37. * copy
  38. * _concat_same_type
  39. A default repr displaying the type, (truncated) data, length,
  40. and dtype is provided. It can be customized or replaced by
  41. overriding:
  42. * __repr__ : A default repr for the ExtensionArray.
  43. * _formatter : Print scalars inside a Series or DataFrame.
  44. Some methods require casting the ExtensionArray to an ndarray of Python
  45. objects with ``self.astype(object)``, which may be expensive. When
  46. performance is a concern, we highly recommend overriding the following
  47. methods:
  48. * fillna
  49. * dropna
  50. * unique
  51. * factorize / _values_for_factorize
  52. * argsort / _values_for_argsort
  53. * searchsorted
  54. The remaining methods implemented on this class should be performant,
  55. as they only compose abstract methods. Still, a more efficient
  56. implementation may be available, and these methods can be overridden.
  57. One can implement methods to handle array reductions.
  58. * _reduce
  59. One can implement methods to handle parsing from strings that will be used
  60. in methods such as ``pandas.io.parsers.read_csv``.
  61. * _from_sequence_of_strings
  62. This class does not inherit from 'abc.ABCMeta' for performance reasons.
  63. Methods and properties required by the interface raise
  64. ``pandas.errors.AbstractMethodError`` and no ``register`` method is
  65. provided for registering virtual subclasses.
  66. ExtensionArrays are limited to 1 dimension.
  67. They may be backed by none, one, or many NumPy arrays. For example,
  68. ``pandas.Categorical`` is an extension array backed by two arrays,
  69. one for codes and one for categories. An array of IPv6 address may
  70. be backed by a NumPy structured array with two fields, one for the
  71. lower 64 bits and one for the upper 64 bits. Or they may be backed
  72. by some other storage type, like Python lists. Pandas makes no
  73. assumptions on how the data are stored, just that it can be converted
  74. to a NumPy array.
  75. The ExtensionArray interface does not impose any rules on how this data
  76. is stored. However, currently, the backing data cannot be stored in
  77. attributes called ``.values`` or ``._values`` to ensure full compatibility
  78. with pandas internals. But other names such as ``.data``, ``._data``,
  79. ``._items``, ... can be freely used.
  80. """
  # '_typ' is the type tag used by
  # pandas.core.dtypes.generic.ABCExtensionArray.
  # Subclasses must not override this.
  _typ = 'extension'
  84. # ------------------------------------------------------------------------
  85. # Constructors
  86. # ------------------------------------------------------------------------
  @classmethod
  def _from_sequence(cls, scalars, dtype=None, copy=False):
      """
      Construct a new ExtensionArray from a sequence of scalars.

      This is an abstract constructor: the base implementation always
      raises, so subclasses must provide their own.

      Parameters
      ----------
      scalars : Sequence
          Each element will be an instance of the scalar type for this
          array, ``cls.dtype.type``.
      dtype : dtype, optional
          Construct for this particular dtype. This should be a Dtype
          compatible with the ExtensionArray.
      copy : boolean, default False
          If True, copy the underlying data.

      Returns
      -------
      ExtensionArray

      Raises
      ------
      AbstractMethodError
          Always, when called on a class that has not overridden it.
      """
      raise AbstractMethodError(cls)
  @classmethod
  def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
      """
      Construct a new ExtensionArray from a sequence of strings.

      The base implementation always raises; a subclass implements this
      to support parsing from strings.

      .. versionadded:: 0.24.0

      Parameters
      ----------
      strings : Sequence
          The strings to be parsed into elements of the array.
      dtype : dtype, optional
          Construct for this particular dtype. This should be a Dtype
          compatible with the ExtensionArray.
      copy : boolean, default False
          If True, copy the underlying data.

      Returns
      -------
      ExtensionArray

      Raises
      ------
      AbstractMethodError
          Always, when called on a class that has not overridden it.
      """
      raise AbstractMethodError(cls)
  @classmethod
  def _from_factorized(cls, values, original):
      """
      Reconstruct an ExtensionArray after factorization.

      The base implementation always raises; subclasses must override it.

      Parameters
      ----------
      values : ndarray
          An integer ndarray with the factorized values.
      original : ExtensionArray
          The original ExtensionArray that factorize was called on.

      Raises
      ------
      AbstractMethodError
          Always, when called on a class that has not overridden it.

      See Also
      --------
      pandas.factorize
      ExtensionArray.factorize
      """
      raise AbstractMethodError(cls)
  141. # ------------------------------------------------------------------------
  142. # Must be a Sequence
  143. # ------------------------------------------------------------------------
  def __getitem__(self, item):
      # type: (Any) -> Any
      """
      Select a subset of self.

      The base implementation always raises; subclasses must override it.

      Parameters
      ----------
      item : int, slice, or ndarray
          * int: The position in 'self' to get.
          * slice: A slice object, where 'start', 'stop', and 'step' are
            integers or None
          * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

      Returns
      -------
      item : scalar or ExtensionArray

      Raises
      ------
      AbstractMethodError
          Always, when called on a class that has not overridden it.

      Notes
      -----
      For scalar ``item``, return a scalar value suitable for the array's
      type. This should be an instance of ``self.dtype.type``.

      For slice ``item``, return an instance of ``ExtensionArray``, even
      if the slice is length 0 or 1.

      For a boolean mask, return an instance of ``ExtensionArray``, filtered
      to the values where ``item`` is True.
      """
      raise AbstractMethodError(self)
  def __setitem__(self, key, value):
      # type: (Union[int, np.ndarray], Any) -> None
      """
      Set one or more values inplace.

      Supporting item assignment is optional for the pandas extension
      array interface; this default simply reports that it is missing.

      Parameters
      ----------
      key : int, ndarray, or slice
          When called from, e.g. ``Series.__setitem__``, ``key`` will be
          one of

          * scalar int
          * ndarray of integers.
          * boolean ndarray
          * slice object

      value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
          Value or values to be set at ``key``.

      Returns
      -------
      None

      Raises
      ------
      NotImplementedError
          Always, unless a subclass overrides this method.
      """
      # Guidance for implementors who do choose to support assignment:
      #
      # * Multiple values : when 'key' is a sequence of integers, 'value'
      #   is a sequence of the same length and all positions are set at
      #   once.
      #
      # * Broadcasting : a scalar 'value' combined with a sequence 'key'
      #   is written to every position in 'key'.
      #
      # * Coercion : users expect basic coercion, e.g. the string
      #   '2018-01-01' becoming a datetime when set on a datetime64ns
      #   array. As a rule, whatever __init__ coerces, __setitem__ should
      #   coerce too.
      #
      # Also note that Series/DataFrame.where internally use __setitem__
      # on a copy of the data.
      message = _not_implemented_message.format(type(self), '__setitem__')
      raise NotImplementedError(message)
  def __len__(self):
      # type: () -> int
      """
      Length of this array.

      The base implementation always raises; subclasses must override it.

      Returns
      -------
      length : int

      Raises
      ------
      AbstractMethodError
          Always, when called on a class that has not overridden it.
      """
      raise AbstractMethodError(self)
  219. def __iter__(self):
  220. """
  221. Iterate over elements of the array.
  222. """
  223. # This needs to be implemented so that pandas recognizes extension
  224. # arrays as list-like. The default implementation makes successive
  225. # calls to ``__getitem__``, which may be slower than necessary.
  226. for i in range(len(self)):
  227. yield self[i]
  228. # ------------------------------------------------------------------------
  229. # Required attributes
  230. # ------------------------------------------------------------------------
  @property
  def dtype(self):
      # type: () -> ExtensionDtype
      """
      An instance of 'ExtensionDtype'.

      The base implementation always raises ``AbstractMethodError``;
      subclasses must override this property.
      """
      raise AbstractMethodError(self)
  238. @property
  239. def shape(self):
  240. # type: () -> Tuple[int, ...]
  241. """
  242. Return a tuple of the array dimensions.
  243. """
  244. return (len(self),)
  @property
  def ndim(self):
      # type: () -> int
      """
      Number of dimensions, which is always 1.

      Extension Arrays are only allowed to be 1-dimensional.
      """
      return 1
  @property
  def nbytes(self):
      # type: () -> int
      """
      The number of bytes needed to store this object in memory.

      The base implementation always raises ``AbstractMethodError``;
      subclasses must override this property.
      """
      # If this is expensive to compute, return an approximate lower bound
      # on the number of bytes needed.
      raise AbstractMethodError(self)
  261. # ------------------------------------------------------------------------
  262. # Additional Methods
  263. # ------------------------------------------------------------------------
  264. def astype(self, dtype, copy=True):
  265. """
  266. Cast to a NumPy array with 'dtype'.
  267. Parameters
  268. ----------
  269. dtype : str or dtype
  270. Typecode or data-type to which the array is cast.
  271. copy : bool, default True
  272. Whether to copy the data, even if not necessary. If False,
  273. a copy is made only if the old dtype does not match the
  274. new dtype.
  275. Returns
  276. -------
  277. array : ndarray
  278. NumPy ndarray with 'dtype' for its dtype.
  279. """
  280. return np.array(self, dtype=dtype, copy=copy)
  def isna(self):
      # type: () -> Union[ExtensionArray, np.ndarray]
      """
      A 1-D array indicating if each value is missing.

      The base implementation always raises; subclasses must override it.

      Returns
      -------
      na_values : Union[np.ndarray, ExtensionArray]
          In most cases, this should return a NumPy ndarray. For
          exceptional cases like ``SparseArray``, where returning
          an ndarray would be expensive, an ExtensionArray may be
          returned.

      Raises
      ------
      AbstractMethodError
          Always, when called on a class that has not overridden it.

      Notes
      -----
      If returning an ExtensionArray, then

      * ``na_values._is_boolean`` should be True
      * `na_values` should implement :func:`ExtensionArray._reduce`
      * ``na_values.any`` and ``na_values.all`` should be implemented
      """
      raise AbstractMethodError(self)
  300. def _values_for_argsort(self):
  301. # type: () -> ndarray
  302. """
  303. Return values for sorting.
  304. Returns
  305. -------
  306. ndarray
  307. The transformed values should maintain the ordering between values
  308. within the array.
  309. See Also
  310. --------
  311. ExtensionArray.argsort
  312. """
  313. # Note: this is used in `ExtensionArray.argsort`.
  314. return np.array(self)
  315. def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
  316. """
  317. Return the indices that would sort this array.
  318. Parameters
  319. ----------
  320. ascending : bool, default True
  321. Whether the indices should result in an ascending
  322. or descending sort.
  323. kind : {'quicksort', 'mergesort', 'heapsort'}, optional
  324. Sorting algorithm.
  325. *args, **kwargs:
  326. passed through to :func:`numpy.argsort`.
  327. Returns
  328. -------
  329. index_array : ndarray
  330. Array of indices that sort ``self``.
  331. See Also
  332. --------
  333. numpy.argsort : Sorting implementation used internally.
  334. """
  335. # Implementor note: You have two places to override the behavior of
  336. # argsort.
  337. # 1. _values_for_argsort : construct the values passed to np.argsort
  338. # 2. argsort : total control over sorting.
  339. ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)
  340. values = self._values_for_argsort()
  341. result = np.argsort(values, kind=kind, **kwargs)
  342. if not ascending:
  343. result = result[::-1]
  344. return result
  def fillna(self, value=None, method=None, limit=None):
      """
      Fill NA/NaN values using the specified method.

      Parameters
      ----------
      value : scalar, array-like
          If a scalar value is passed it is used to fill all missing values.
          Alternatively, an array-like 'value' can be given. It's expected
          that the array-like have the same length as 'self'.
      method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
          Method to use for filling holes in reindexed Series
          pad / ffill: propagate last valid observation forward to next valid
          backfill / bfill: use NEXT valid observation to fill gap
      limit : int, default None
          If method is specified, this is the maximum number of consecutive
          NaN values to forward/backward fill. In other words, if there is
          a gap with more than this number of consecutive NaNs, it will only
          be partially filled. If method is not specified, this is the
          maximum number of entries along the entire axis where NaNs will be
          filled.

      Returns
      -------
      filled : ExtensionArray with NA/NaN filled

      Raises
      ------
      ValueError
          If an array-like `value` does not have the same length as `self`.
      """
      from pandas.api.types import is_array_like
      from pandas.util._validators import validate_fillna_kwargs
      from pandas.core.missing import pad_1d, backfill_1d

      value, method = validate_fillna_kwargs(value, method)
      na_mask = self.isna()

      if is_array_like(value):
          if len(value) != len(self):
              raise ValueError("Length of 'value' does not match. Got ({}) "
                               " expected {}".format(len(value), len(self)))
          # Keep only the replacements that line up with missing slots.
          value = value[na_mask]

      if not na_mask.any():
          # Nothing is missing: hand back a copy.
          return self.copy()

      if method is None:
          # Fill with the supplied value(s).
          filled = self.copy()
          filled[na_mask] = value
          return filled

      # Propagate neighbouring observations over an object-dtype view and
      # rebuild an array of the original dtype from the result.
      fill_func = pad_1d if method == 'pad' else backfill_1d
      filled = fill_func(self.astype(object), limit=limit, mask=na_mask)
      return self._from_sequence(filled, dtype=self.dtype)
  392. def dropna(self):
  393. """
  394. Return ExtensionArray without NA values
  395. Returns
  396. -------
  397. valid : ExtensionArray
  398. """
  399. return self[~self.isna()]
  400. def shift(self, periods=1, fill_value=None):
  401. # type: (int, object) -> ExtensionArray
  402. """
  403. Shift values by desired number.
  404. Newly introduced missing values are filled with
  405. ``self.dtype.na_value``.
  406. .. versionadded:: 0.24.0
  407. Parameters
  408. ----------
  409. periods : int, default 1
  410. The number of periods to shift. Negative values are allowed
  411. for shifting backwards.
  412. fill_value : object, optional
  413. The scalar value to use for newly introduced missing values.
  414. The default is ``self.dtype.na_value``
  415. .. versionadded:: 0.24.0
  416. Returns
  417. -------
  418. shifted : ExtensionArray
  419. Notes
  420. -----
  421. If ``self`` is empty or ``periods`` is 0, a copy of ``self`` is
  422. returned.
  423. If ``periods > len(self)``, then an array of size
  424. len(self) is returned, with all values filled with
  425. ``self.dtype.na_value``.
  426. """
  427. # Note: this implementation assumes that `self.dtype.na_value` can be
  428. # stored in an instance of your ExtensionArray with `self.dtype`.
  429. if not len(self) or periods == 0:
  430. return self.copy()
  431. if isna(fill_value):
  432. fill_value = self.dtype.na_value
  433. empty = self._from_sequence(
  434. [fill_value] * min(abs(periods), len(self)),
  435. dtype=self.dtype
  436. )
  437. if periods > 0:
  438. a = empty
  439. b = self[:-periods]
  440. else:
  441. a = self[abs(periods):]
  442. b = empty
  443. return self._concat_same_type([a, b])
  444. def unique(self):
  445. """
  446. Compute the ExtensionArray of unique values.
  447. Returns
  448. -------
  449. uniques : ExtensionArray
  450. """
  451. from pandas import unique
  452. uniques = unique(self.astype(object))
  453. return self._from_sequence(uniques, dtype=self.dtype)
  454. def searchsorted(self, value, side="left", sorter=None):
  455. """
  456. Find indices where elements should be inserted to maintain order.
  457. .. versionadded:: 0.24.0
  458. Find the indices into a sorted array `self` (a) such that, if the
  459. corresponding elements in `v` were inserted before the indices, the
  460. order of `self` would be preserved.
  461. Assuming that `a` is sorted:
  462. ====== ============================
  463. `side` returned index `i` satisfies
  464. ====== ============================
  465. left ``self[i-1] < v <= self[i]``
  466. right ``self[i-1] <= v < self[i]``
  467. ====== ============================
  468. Parameters
  469. ----------
  470. value : array_like
  471. Values to insert into `self`.
  472. side : {'left', 'right'}, optional
  473. If 'left', the index of the first suitable location found is given.
  474. If 'right', return the last such index. If there is no suitable
  475. index, return either 0 or N (where N is the length of `self`).
  476. sorter : 1-D array_like, optional
  477. Optional array of integer indices that sort array a into ascending
  478. order. They are typically the result of argsort.
  479. Returns
  480. -------
  481. indices : array of ints
  482. Array of insertion points with the same shape as `value`.
  483. See Also
  484. --------
  485. numpy.searchsorted : Similar method from NumPy.
  486. """
  487. # Note: the base tests provided by pandas only test the basics.
  488. # We do not test
  489. # 1. Values outside the range of the `data_for_sorting` fixture
  490. # 2. Values between the values in the `data_for_sorting` fixture
  491. # 3. Missing values.
  492. arr = self.astype(object)
  493. return arr.searchsorted(value, side=side, sorter=sorter)
  494. def _values_for_factorize(self):
  495. # type: () -> Tuple[ndarray, Any]
  496. """
  497. Return an array and missing value suitable for factorization.
  498. Returns
  499. -------
  500. values : ndarray
  501. An array suitable for factorization. This should maintain order
  502. and be a supported dtype (Float64, Int64, UInt64, String, Object).
  503. By default, the extension array is cast to object dtype.
  504. na_value : object
  505. The value in `values` to consider missing. This will be treated
  506. as NA in the factorization routines, so it will be coded as
  507. `na_sentinal` and not included in `uniques`. By default,
  508. ``np.nan`` is used.
  509. Notes
  510. -----
  511. The values returned by this method are also used in
  512. :func:`pandas.util.hash_pandas_object`.
  513. """
  514. return self.astype(object), np.nan
  def factorize(self, na_sentinel=-1):
      # type: (int) -> Tuple[ndarray, ExtensionArray]
      """
      Encode the extension array as an enumerated type.

      Parameters
      ----------
      na_sentinel : int, default -1
          Value to use in the `labels` array to indicate missing values.

      Returns
      -------
      labels : ndarray
          An integer NumPy array that's an indexer into the original
          ExtensionArray.
      uniques : ExtensionArray
          An ExtensionArray containing the unique values of `self`.

          .. note::

             uniques will *not* contain an entry for the NA value of
             the ExtensionArray if there are any missing values present
             in `self`.

      See Also
      --------
      pandas.factorize : Top-level factorize method that dispatches here.

      Notes
      -----
      :meth:`pandas.factorize` offers a `sort` keyword as well.
      """
      # Implementor note: there are two ways to customize what
      # pandas.factorize does.
      # 1. _values_for_factorize and _from_factorized.
      #    Choose the values handed to pandas' internal factorization
      #    routines, and how those values are turned back into the
      #    original ExtensionArray.
      # 2. ExtensionArray.factorize.
      #    Complete control over factorization.
      from pandas.core.algorithms import _factorize_array

      values, na_value = self._values_for_factorize()
      codes, unique_values = _factorize_array(
          values, na_sentinel=na_sentinel, na_value=na_value)
      return codes, self._from_factorized(unique_values, self)
  # Shared docstring template for ``repeat``; ``%(klass)s`` is filled in by
  # the ``Substitution`` decorator on each implementation.
  _extension_array_shared_docs['repeat'] = """
      Repeat elements of a %(klass)s.

      Returns a new %(klass)s where each element of the current %(klass)s
      is repeated consecutively a given number of times.

      Parameters
      ----------
      repeats : int or array of ints
          The number of repetitions for each element. This should be a
          non-negative integer. Repeating 0 times will return an empty
          %(klass)s.
      axis : None
          Must be ``None``. Has no effect but is accepted for compatibility
          with numpy.

      Returns
      -------
      repeated_array : %(klass)s
          Newly created %(klass)s with repeated elements.

      See Also
      --------
      Series.repeat : Equivalent function for Series.
      Index.repeat : Equivalent function for Index.
      numpy.repeat : Similar method for :class:`numpy.ndarray`.
      ExtensionArray.take : Take arbitrary positions.

      Examples
      --------
      >>> cat = pd.Categorical(['a', 'b', 'c'])
      >>> cat
      [a, b, c]
      Categories (3, object): [a, b, c]
      >>> cat.repeat(2)
      [a, a, b, b, c, c]
      Categories (3, object): [a, b, c]
      >>> cat.repeat([1, 2, 3])
      [a, b, b, c, c, c]
      Categories (3, object): [a, b, c]
      """

  @Substitution(klass='ExtensionArray')
  @Appender(_extension_array_shared_docs['repeat'])
  def repeat(self, repeats, axis=None):
      # Validate the numpy-compatibility ``axis`` argument.
      nv.validate_repeat((), {'axis': axis})
      # Repeat the positions, then gather the elements with ``take``.
      positions = np.repeat(np.arange(len(self)), repeats)
      return self.take(positions)
  597. # ------------------------------------------------------------------------
  598. # Indexing methods
  599. # ------------------------------------------------------------------------
  def take(self, indices, allow_fill=False, fill_value=None):
      # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray
      """
      Take elements from an array.

      The base implementation always raises ``AbstractMethodError``;
      subclasses must override it.

      Parameters
      ----------
      indices : sequence of integers
          Indices to be taken.
      allow_fill : bool, default False
          How to handle negative values in `indices`.

          * False: negative values in `indices` indicate positional indices
            from the right (the default). This is similar to
            :func:`numpy.take`.

          * True: negative values in `indices` indicate
            missing values. These values are set to `fill_value`. Any
            other negative values raise a ``ValueError``.

      fill_value : any, optional
          Fill value to use for NA-indices when `allow_fill` is True.
          This may be ``None``, in which case the default NA value for
          the type, ``self.dtype.na_value``, is used.

          For many ExtensionArrays, there will be two representations of
          `fill_value`: a user-facing "boxed" scalar, and a low-level
          physical NA value. `fill_value` should be the user-facing version,
          and the implementation should handle translating that to the
          physical version for processing the take if necessary.

      Returns
      -------
      ExtensionArray

      Raises
      ------
      IndexError
          When the indices are out of bounds for the array.
      ValueError
          When `indices` contains negative values other than ``-1``
          and `allow_fill` is True.

      Notes
      -----
      ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
      ``iloc``, when `indices` is a sequence of values. Additionally,
      it's called by :meth:`Series.reindex`, or any other method
      that causes realignment, with a `fill_value`.

      See Also
      --------
      numpy.take
      pandas.api.extensions.take

      Examples
      --------
      Here's an example implementation, which relies on casting the
      extension array to object dtype. This uses the helper method
      :func:`pandas.api.extensions.take`.

      .. code-block:: python

         def take(self, indices, allow_fill=False, fill_value=None):
             from pandas.core.algorithms import take

             # If the ExtensionArray is backed by an ndarray, then
             # just pass that here instead of coercing to object.
             data = self.astype(object)

             if allow_fill and fill_value is None:
                 fill_value = self.dtype.na_value

             # fill value should always be translated from the scalar
             # type for the array, to the physical storage type for
             # the data, before passing to take.

             result = take(data, indices, fill_value=fill_value,
                           allow_fill=allow_fill)
             return self._from_sequence(result, dtype=self.dtype)
      """
      # Implementer note: The `fill_value` parameter should be a user-facing
      # value, an instance of self.dtype.type. When passed `fill_value=None`,
      # the default of `self.dtype.na_value` should be used.
      # This may differ from the physical storage type your ExtensionArray
      # uses. In this case, your implementation is responsible for casting
      # the user-facing type to the storage type, before using
      # pandas.api.extensions.take
      raise AbstractMethodError(self)
  def copy(self, deep=False):
      # type: (bool) -> ExtensionArray
      """
      Return a copy of the array.

      The base implementation always raises; subclasses must override it.

      Parameters
      ----------
      deep : bool, default False
          Also copy the underlying data backing this array.

      Returns
      -------
      ExtensionArray

      Raises
      ------
      AbstractMethodError
          Always, when called on a class that has not overridden it.
      """
      raise AbstractMethodError(self)
  686. # ------------------------------------------------------------------------
  687. # Printing
  688. # ------------------------------------------------------------------------
  689. def __repr__(self):
  690. from pandas.io.formats.printing import format_object_summary
  691. template = (
  692. u'{class_name}'
  693. u'{data}\n'
  694. u'Length: {length}, dtype: {dtype}'
  695. )
  696. # the short repr has no trailing newline, while the truncated
  697. # repr does. So we include a newline in our template, and strip
  698. # any trailing newlines from format_object_summary
  699. data = format_object_summary(self, self._formatter(),
  700. indent_for_name=False).rstrip(', \n')
  701. class_name = u'<{}>\n'.format(self.__class__.__name__)
  702. return template.format(class_name=class_name, data=data,
  703. length=len(self),
  704. dtype=self.dtype)
  705. def _formatter(self, boxed=False):
  706. # type: (bool) -> Callable[[Any], Optional[str]]
  707. """Formatting function for scalar values.
  708. This is used in the default '__repr__'. The returned formatting
  709. function receives instances of your scalar type.
  710. Parameters
  711. ----------
  712. boxed: bool, default False
  713. An indicated for whether or not your array is being printed
  714. within a Series, DataFrame, or Index (True), or just by
  715. itself (False). This may be useful if you want scalar values
  716. to appear differently within a Series versus on its own (e.g.
  717. quoted or not).
  718. Returns
  719. -------
  720. Callable[[Any], str]
  721. A callable that gets instances of the scalar type and
  722. returns a string. By default, :func:`repr` is used
  723. when ``boxed=False`` and :func:`str` is used when
  724. ``boxed=True``.
  725. """
  726. if boxed:
  727. return str
  728. return repr
  729. def _formatting_values(self):
  730. # type: () -> np.ndarray
  731. # At the moment, this has to be an array since we use result.dtype
  732. """
  733. An array of values to be printed in, e.g. the Series repr
  734. .. deprecated:: 0.24.0
  735. Use :meth:`ExtensionArray._formatter` instead.
  736. """
  737. return np.array(self)
  738. # ------------------------------------------------------------------------
  739. # Reshaping
  740. # ------------------------------------------------------------------------
  741. @classmethod
  742. def _concat_same_type(cls, to_concat):
  743. # type: (Sequence[ExtensionArray]) -> ExtensionArray
  744. """
  745. Concatenate multiple array
  746. Parameters
  747. ----------
  748. to_concat : sequence of this type
  749. Returns
  750. -------
  751. ExtensionArray
  752. """
  753. raise AbstractMethodError(cls)
  # ``_can_hold_na`` is set to True so that pandas internals use
  # ExtensionDtype.na_value as the NA value in operations such as
  # take(), reindex() and shift(). The results of those operations are
  # then instances of the ExtensionArray subclass rather than arrays
  # of objects.
  _can_hold_na = True
  760. @property
  761. def _ndarray_values(self):
  762. # type: () -> np.ndarray
  763. """
  764. Internal pandas method for lossy conversion to a NumPy ndarray.
  765. This method is not part of the pandas interface.
  766. The expectation is that this is cheap to compute, and is primarily
  767. used for interacting with our indexers.
  768. """
  769. return np.array(self)
  770. def _reduce(self, name, skipna=True, **kwargs):
  771. """
  772. Return a scalar result of performing the reduction operation.
  773. Parameters
  774. ----------
  775. name : str
  776. Name of the function, supported values are:
  777. { any, all, min, max, sum, mean, median, prod,
  778. std, var, sem, kurt, skew }.
  779. skipna : bool, default True
  780. If True, skip NaN values.
  781. **kwargs
  782. Additional keyword arguments passed to the reduction function.
  783. Currently, `ddof` is the only supported kwarg.
  784. Returns
  785. -------
  786. scalar
  787. Raises
  788. ------
  789. TypeError : subclass does not define reductions
  790. """
  791. raise TypeError("cannot perform {name} with type {dtype}".format(
  792. name=name, dtype=self.dtype))
  class ExtensionOpsMixin(object):
      """
      A base class for linking the operators to their dunder names.

      The two ``_add_*_ops`` classmethods attach operator methods to the
      class. Each method is built by ``cls._create_arithmetic_method`` or
      ``cls._create_comparison_method``, which this class does not define
      itself.

      .. note::

         You may want to set ``__array_priority__`` if you want your
         implementation to be called when involved in binary operations
         with NumPy arrays.
      """

      @classmethod
      def _add_arithmetic_ops(cls):
          """Attach the arithmetic dunder methods (and their reflected
          variants) to ``cls``."""
          dunder_ops = [
              ('__add__', operator.add),
              ('__radd__', ops.radd),
              ('__sub__', operator.sub),
              ('__rsub__', ops.rsub),
              ('__mul__', operator.mul),
              ('__rmul__', ops.rmul),
              ('__pow__', operator.pow),
              ('__rpow__', ops.rpow),
              ('__mod__', operator.mod),
              ('__rmod__', ops.rmod),
              ('__floordiv__', operator.floordiv),
              ('__rfloordiv__', ops.rfloordiv),
              ('__truediv__', operator.truediv),
              ('__rtruediv__', ops.rtruediv),
          ]
          if not PY3:
              # classic division is only linked when PY3 is false
              dunder_ops.append(('__div__', operator.div))
              dunder_ops.append(('__rdiv__', ops.rdiv))
          dunder_ops.append(('__divmod__', divmod))
          dunder_ops.append(('__rdivmod__', ops.rdivmod))

          for dunder, func in dunder_ops:
              setattr(cls, dunder, cls._create_arithmetic_method(func))

      @classmethod
      def _add_comparison_ops(cls):
          """Attach the six rich-comparison dunder methods to ``cls``."""
          dunder_ops = (
              ('__eq__', operator.eq),
              ('__ne__', operator.ne),
              ('__lt__', operator.lt),
              ('__gt__', operator.gt),
              ('__le__', operator.le),
              ('__ge__', operator.ge),
          )
          for dunder, func in dunder_ops:
              setattr(cls, dunder, cls._create_comparison_method(func))
  class ExtensionScalarOpsMixin(ExtensionOpsMixin):
      """
      A mixin for defining ops on an ExtensionArray.

      It is assumed that the underlying scalar objects have the operators
      already defined.

      Notes
      -----
      If you have defined a subclass MyExtensionArray(ExtensionArray), then
      use MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin) to
      get the arithmetic operators.  After the definition of MyExtensionArray,
      insert the lines

      MyExtensionArray._add_arithmetic_ops()
      MyExtensionArray._add_comparison_ops()

      to link the operators to your class.

      .. note::

         You may want to set ``__array_priority__`` if you want your
         implementation to be called when involved in binary operations
         with NumPy arrays.
      """

      @classmethod
      def _create_method(cls, op, coerce_to_dtype=True):
          """
          A class method that returns a method that will correspond to an
          operator for an ExtensionArray subclass, by dispatching to the
          relevant operator defined on the individual elements of the
          ExtensionArray.

          Parameters
          ----------
          op : function
              An operator that takes arguments op(a, b)
          coerce_to_dtype : bool, default True
              boolean indicating whether to attempt to convert
              the result to the underlying ExtensionArray dtype.
              If it's not possible to create a new ExtensionArray with the
              values, an ndarray is returned instead.

          Returns
          -------
          Callable[[Any, Any], Union[ndarray, ExtensionArray]]
              A method that can be bound to a class. When used, the method
              receives the two arguments, one of which is the instance of
              this class, and should return an ExtensionArray or an ndarray.

              Returning an ndarray may be necessary when the result of the
              `op` cannot be stored in the ExtensionArray. The dtype of the
              ndarray uses NumPy's normal inference rules.

              For ``divmod`` and ``rdivmod`` the method returns a 2-tuple
              of such results (quotients, remainders).

          Example
          -------
          Given an ExtensionArray subclass called MyExtensionArray, use

          >>> __add__ = cls._create_method(operator.add)

          in the class definition of MyExtensionArray to create the operator
          for addition, that will be based on the operator implementation
          of the underlying elements of the ExtensionArray
          """

          def _binop(self, other):
              def convert_values(param):
                  # Array-likes are paired element by element; anything
                  # else is treated as a scalar and repeated per element.
                  if (isinstance(param, ExtensionArray) or
                          is_list_like(param)):
                      ovalues = param
                  else:  # Assume it's an object
                      ovalues = [param] * len(self)
                  return ovalues

              if isinstance(other, (ABCSeries, ABCIndexClass)):
                  # rely on pandas to unbox and dispatch to us
                  return NotImplemented

              lvalues = self
              rvalues = convert_values(other)

              # If the operator is not defined for the underlying objects,
              # a TypeError should be raised
              res = [op(a, b) for (a, b) in zip(lvalues, rvalues)]

              def _maybe_convert(arr):
                  if coerce_to_dtype:
                      # https://github.com/pandas-dev/pandas/issues/22850
                      # We catch all regular exceptions here, and fall back
                      # to an ndarray.
                      try:
                          res = self._from_sequence(arr)
                      except Exception:
                          res = np.asarray(arr)
                  else:
                      res = np.asarray(arr)
                  return res

              if op.__name__ in {'divmod', 'rdivmod'}:
                  if res:
                      a, b = zip(*res)
                  else:
                      # zip(*[]) yields nothing, so unpacking it into two
                      # names would raise ValueError for an empty array;
                      # use two empty sequences instead.
                      a, b = (), ()
                  res = _maybe_convert(a), _maybe_convert(b)
              else:
                  res = _maybe_convert(res)
              return res

          op_name = ops._get_op_name(op, True)
          return set_function_name(_binop, op_name, cls)

      @classmethod
      def _create_arithmetic_method(cls, op):
          """Create an operator method whose result is coerced back to the
          array's own type when possible."""
          return cls._create_method(op)

      @classmethod
      def _create_comparison_method(cls, op):
          """Create an operator method whose result is always converted
          with ``np.asarray`` (no coercion to the array's own type)."""
          return cls._create_method(op, coerce_to_dtype=False)