"""
Data structures for sparse float data. Life is made simpler by dealing only
with float64 data.
"""

# pylint: disable=E1101,E1103,W0231
import warnings

import numpy as np

import pandas._libs.index as libindex
import pandas._libs.sparse as splib
from pandas._libs.sparse import BlockIndex, IntIndex
import pandas.compat as compat
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender, Substitution

from pandas.core.dtypes.common import is_integer, is_scalar
from pandas.core.dtypes.generic import ABCSeries, ABCSparseSeries
from pandas.core.dtypes.missing import isna, notna

from pandas.core import generic
from pandas.core.arrays import SparseArray
from pandas.core.arrays.sparse import SparseAccessor
from pandas.core.index import Index
from pandas.core.internals import SingleBlockManager
import pandas.core.ops as ops
from pandas.core.series import Series
from pandas.core.sparse.scipy_sparse import (
    _coo_to_sparse_series, _sparse_series_to_coo)

  26. _shared_doc_kwargs = dict(axes='index', klass='SparseSeries',
  27. axes_single_arg="{0, 'index'}",
  28. optional_labels='', optional_axis='')
  29. class SparseSeries(Series):
  30. """Data structure for labeled, sparse floating point data
  31. Parameters
  32. ----------
  33. data : {array-like, Series, SparseSeries, dict}
  34. .. versionchanged :: 0.23.0
  35. If data is a dict, argument order is maintained for Python 3.6
  36. and later.
  37. kind : {'block', 'integer'}
  38. fill_value : float
  39. Code for missing value. Defaults depends on dtype.
  40. 0 for int dtype, False for bool dtype, and NaN for other dtypes
  41. sparse_index : {BlockIndex, IntIndex}, optional
  42. Only if you have one. Mainly used internally
  43. Notes
  44. -----
  45. SparseSeries objects are immutable via the typical Python means. If you
  46. must change values, convert to dense, make your changes, then convert back
  47. to sparse
  48. """
  49. _subtyp = 'sparse_series'
  50. def __init__(self, data=None, index=None, sparse_index=None, kind='block',
  51. fill_value=None, name=None, dtype=None, copy=False,
  52. fastpath=False):
  53. # TODO: Most of this should be refactored and shared with Series
  54. # 1. BlockManager -> array
  55. # 2. Series.index, Series.name, index, name reconciliation
  56. # 3. Implicit reindexing
  57. # 4. Implicit broadcasting
  58. # 5. Dict construction
  59. if data is None:
  60. data = []
  61. elif isinstance(data, SingleBlockManager):
  62. index = data.index
  63. data = data.blocks[0].values
  64. elif isinstance(data, (ABCSeries, ABCSparseSeries)):
  65. index = data.index if index is None else index
  66. dtype = data.dtype if dtype is None else dtype
  67. name = data.name if name is None else name
  68. if index is not None:
  69. data = data.reindex(index)
  70. elif isinstance(data, compat.Mapping):
  71. data, index = Series()._init_dict(data, index=index)
  72. elif is_scalar(data) and index is not None:
  73. data = np.full(len(index), fill_value=data)
  74. super(SparseSeries, self).__init__(
  75. SparseArray(data,
  76. sparse_index=sparse_index,
  77. kind=kind,
  78. dtype=dtype,
  79. fill_value=fill_value,
  80. copy=copy),
  81. index=index, name=name,
  82. copy=False, fastpath=fastpath
  83. )
  84. def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
  85. # avoid infinite recursion for other SparseSeries inputs
  86. inputs = tuple(
  87. x.values if isinstance(x, type(self)) else x
  88. for x in inputs
  89. )
  90. result = self.values.__array_ufunc__(ufunc, method, *inputs, **kwargs)
  91. return self._constructor(result, index=self.index,
  92. sparse_index=self.sp_index,
  93. fill_value=result.fill_value,
  94. copy=False).__finalize__(self)
  95. def __array_wrap__(self, result, context=None):
  96. """
  97. Gets called prior to a ufunc (and after)
  98. See SparseArray.__array_wrap__ for detail.
  99. """
  100. result = self.values.__array_wrap__(result, context=context)
  101. return self._constructor(result, index=self.index,
  102. sparse_index=self.sp_index,
  103. fill_value=result.fill_value,
  104. copy=False).__finalize__(self)
  105. def __array_finalize__(self, obj):
  106. """
  107. Gets called after any ufunc or other array operations, necessary
  108. to pass on the index.
  109. """
  110. self.name = getattr(obj, 'name', None)
  111. self.fill_value = getattr(obj, 'fill_value', None)
  112. # unary ops
  113. # TODO: See if this can be shared
  114. def __pos__(self):
  115. result = self.values.__pos__()
  116. return self._constructor(result, index=self.index,
  117. sparse_index=self.sp_index,
  118. fill_value=result.fill_value,
  119. copy=False).__finalize__(self)
  120. def __neg__(self):
  121. result = self.values.__neg__()
  122. return self._constructor(result, index=self.index,
  123. sparse_index=self.sp_index,
  124. fill_value=result.fill_value,
  125. copy=False).__finalize__(self)
  126. def __invert__(self):
  127. result = self.values.__invert__()
  128. return self._constructor(result, index=self.index,
  129. sparse_index=self.sp_index,
  130. fill_value=result.fill_value,
  131. copy=False).__finalize__(self)
  132. @property
  133. def block(self):
  134. warnings.warn("SparseSeries.block is deprecated.", FutureWarning,
  135. stacklevel=2)
  136. return self._data._block
  137. @property
  138. def fill_value(self):
  139. return self.values.fill_value
  140. @fill_value.setter
  141. def fill_value(self, v):
  142. self.values.fill_value = v
  143. @property
  144. def sp_index(self):
  145. return self.values.sp_index
  146. @property
  147. def sp_values(self):
  148. return self.values.sp_values
  149. @property
  150. def npoints(self):
  151. return self.values.npoints
  152. @classmethod
  153. def from_array(cls, arr, index=None, name=None, copy=False,
  154. fill_value=None, fastpath=False):
  155. """Construct SparseSeries from array.
  156. .. deprecated:: 0.23.0
  157. Use the pd.SparseSeries(..) constructor instead.
  158. """
  159. warnings.warn("'from_array' is deprecated and will be removed in a "
  160. "future version. Please use the pd.SparseSeries(..) "
  161. "constructor instead.", FutureWarning, stacklevel=2)
  162. return cls(arr, index=index, name=name, copy=copy,
  163. fill_value=fill_value, fastpath=fastpath)
  164. @property
  165. def _constructor(self):
  166. return SparseSeries
  167. @property
  168. def _constructor_expanddim(self):
  169. from pandas.core.sparse.api import SparseDataFrame
  170. return SparseDataFrame
  171. @property
  172. def kind(self):
  173. if isinstance(self.sp_index, BlockIndex):
  174. return 'block'
  175. elif isinstance(self.sp_index, IntIndex):
  176. return 'integer'
  177. def as_sparse_array(self, kind=None, fill_value=None, copy=False):
  178. """ return my self as a sparse array, do not copy by default """
  179. if fill_value is None:
  180. fill_value = self.fill_value
  181. if kind is None:
  182. kind = self.kind
  183. return SparseArray(self.values, sparse_index=self.sp_index,
  184. fill_value=fill_value, kind=kind, copy=copy)
  185. def __unicode__(self):
  186. # currently, unicode is same as repr...fixes infinite loop
  187. series_rep = Series.__unicode__(self)
  188. rep = '{series}\n{index!r}'.format(series=series_rep,
  189. index=self.sp_index)
  190. return rep
  191. def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
  192. filter_type=None, **kwds):
  193. """ perform a reduction operation """
  194. return op(self.get_values(), skipna=skipna, **kwds)
  195. def __getstate__(self):
  196. # pickling
  197. return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data,
  198. fill_value=self.fill_value, name=self.name)
  199. def _unpickle_series_compat(self, state):
  200. nd_state, own_state = state
  201. # recreate the ndarray
  202. data = np.empty(nd_state[1], dtype=nd_state[2])
  203. np.ndarray.__setstate__(data, nd_state)
  204. index, fill_value, sp_index = own_state[:3]
  205. name = None
  206. if len(own_state) > 3:
  207. name = own_state[3]
  208. # create a sparse array
  209. if not isinstance(data, SparseArray):
  210. data = SparseArray(data, sparse_index=sp_index,
  211. fill_value=fill_value, copy=False)
  212. # recreate
  213. data = SingleBlockManager(data, index, fastpath=True)
  214. generic.NDFrame.__init__(self, data)
  215. self._set_axis(0, index)
  216. self.name = name
  217. def _set_subtyp(self, is_all_dates):
  218. if is_all_dates:
  219. object.__setattr__(self, '_subtyp', 'sparse_time_series')
  220. else:
  221. object.__setattr__(self, '_subtyp', 'sparse_series')
  222. def _ixs(self, i, axis=0):
  223. """
  224. Return the i-th value or values in the SparseSeries by location
  225. Parameters
  226. ----------
  227. i : int, slice, or sequence of integers
  228. Returns
  229. -------
  230. value : scalar (int) or Series (slice, sequence)
  231. """
  232. label = self.index[i]
  233. if isinstance(label, Index):
  234. return self.take(i, axis=axis)
  235. else:
  236. return self._get_val_at(i)
  237. def _get_val_at(self, loc):
  238. """ forward to the array """
  239. return self.values._get_val_at(loc)
  240. def __getitem__(self, key):
  241. # TODO: Document difference from Series.__getitem__, deprecate,
  242. # and remove!
  243. if is_integer(key) and key not in self.index:
  244. return self._get_val_at(key)
  245. else:
  246. return super(SparseSeries, self).__getitem__(key)
  247. def _get_values(self, indexer):
  248. try:
  249. return self._constructor(self._data.get_slice(indexer),
  250. fastpath=True).__finalize__(self)
  251. except Exception:
  252. return self[indexer]
  253. def _set_with_engine(self, key, value):
  254. return self._set_value(key, value)
  255. def abs(self):
  256. """
  257. Return an object with absolute value taken. Only applicable to objects
  258. that are all numeric
  259. Returns
  260. -------
  261. abs: same type as caller
  262. """
  263. return self._constructor(np.abs(self.values),
  264. index=self.index).__finalize__(self)
  265. def get(self, label, default=None):
  266. """
  267. Returns value occupying requested label, default to specified
  268. missing value if not present. Analogous to dict.get
  269. Parameters
  270. ----------
  271. label : object
  272. Label value looking for
  273. default : object, optional
  274. Value to return if label not in index
  275. Returns
  276. -------
  277. y : scalar
  278. """
  279. if label in self.index:
  280. loc = self.index.get_loc(label)
  281. return self._get_val_at(loc)
  282. else:
  283. return default
  284. def get_value(self, label, takeable=False):
  285. """
  286. Retrieve single value at passed index label
  287. .. deprecated:: 0.21.0
  288. Please use .at[] or .iat[] accessors.
  289. Parameters
  290. ----------
  291. index : label
  292. takeable : interpret the index as indexers, default False
  293. Returns
  294. -------
  295. value : scalar value
  296. """
  297. warnings.warn("get_value is deprecated and will be removed "
  298. "in a future release. Please use "
  299. ".at[] or .iat[] accessors instead", FutureWarning,
  300. stacklevel=2)
  301. return self._get_value(label, takeable=takeable)
  302. def _get_value(self, label, takeable=False):
  303. loc = label if takeable is True else self.index.get_loc(label)
  304. return self._get_val_at(loc)
  305. _get_value.__doc__ = get_value.__doc__
  306. def set_value(self, label, value, takeable=False):
  307. """
  308. Quickly set single value at passed label. If label is not contained, a
  309. new object is created with the label placed at the end of the result
  310. index
  311. .. deprecated:: 0.21.0
  312. Please use .at[] or .iat[] accessors.
  313. Parameters
  314. ----------
  315. label : object
  316. Partial indexing with MultiIndex not allowed
  317. value : object
  318. Scalar value
  319. takeable : interpret the index as indexers, default False
  320. Notes
  321. -----
  322. This method *always* returns a new object. It is not particularly
  323. efficient but is provided for API compatibility with Series
  324. Returns
  325. -------
  326. series : SparseSeries
  327. """
  328. warnings.warn("set_value is deprecated and will be removed "
  329. "in a future release. Please use "
  330. ".at[] or .iat[] accessors instead", FutureWarning,
  331. stacklevel=2)
  332. return self._set_value(label, value, takeable=takeable)
  333. def _set_value(self, label, value, takeable=False):
  334. values = self.to_dense()
  335. # if the label doesn't exist, we will create a new object here
  336. # and possibly change the index
  337. new_values = values._set_value(label, value, takeable=takeable)
  338. if new_values is not None:
  339. values = new_values
  340. new_index = values.index
  341. values = SparseArray(values, fill_value=self.fill_value,
  342. kind=self.kind)
  343. self._data = SingleBlockManager(values, new_index)
  344. self._index = new_index
  345. _set_value.__doc__ = set_value.__doc__
  346. def _set_values(self, key, value):
  347. # this might be inefficient as we have to recreate the sparse array
  348. # rather than setting individual elements, but have to convert
  349. # the passed slice/boolean that's in dense space into a sparse indexer
  350. # not sure how to do that!
  351. if isinstance(key, Series):
  352. key = key.values
  353. values = self.values.to_dense()
  354. values[key] = libindex.convert_scalar(values, value)
  355. values = SparseArray(values, fill_value=self.fill_value,
  356. kind=self.kind)
  357. self._data = SingleBlockManager(values, self.index)
  358. def to_dense(self):
  359. """
  360. Convert SparseSeries to a Series.
  361. Returns
  362. -------
  363. s : Series
  364. """
  365. return Series(self.values.to_dense(), index=self.index,
  366. name=self.name)
  367. @property
  368. def density(self):
  369. return self.values.density
  370. def copy(self, deep=True):
  371. """
  372. Make a copy of the SparseSeries. Only the actual sparse values need to
  373. be copied
  374. """
  375. # TODO: https://github.com/pandas-dev/pandas/issues/22314
  376. # We skip the block manager till that is resolved.
  377. new_data = self.values.copy(deep=deep)
  378. return self._constructor(new_data, sparse_index=self.sp_index,
  379. fill_value=self.fill_value,
  380. index=self.index.copy(),
  381. name=self.name).__finalize__(self)
  382. @Substitution(**_shared_doc_kwargs)
  383. @Appender(generic.NDFrame.reindex.__doc__)
  384. def reindex(self, index=None, method=None, copy=True, limit=None,
  385. **kwargs):
  386. # TODO: remove?
  387. return super(SparseSeries, self).reindex(index=index, method=method,
  388. copy=copy, limit=limit,
  389. **kwargs)
  390. def sparse_reindex(self, new_index):
  391. """
  392. Conform sparse values to new SparseIndex
  393. Parameters
  394. ----------
  395. new_index : {BlockIndex, IntIndex}
  396. Returns
  397. -------
  398. reindexed : SparseSeries
  399. """
  400. if not isinstance(new_index, splib.SparseIndex):
  401. raise TypeError("new index must be a SparseIndex")
  402. values = self.values
  403. values = values.sp_index.to_int_index().reindex(
  404. values.sp_values.astype('float64'), values.fill_value, new_index)
  405. values = SparseArray(values,
  406. sparse_index=new_index,
  407. fill_value=self.values.fill_value)
  408. return self._constructor(values, index=self.index).__finalize__(self)
  409. def cumsum(self, axis=0, *args, **kwargs):
  410. """
  411. Cumulative sum of non-NA/null values.
  412. When performing the cumulative summation, any non-NA/null values will
  413. be skipped. The resulting SparseSeries will preserve the locations of
  414. NaN values, but the fill value will be `np.nan` regardless.
  415. Parameters
  416. ----------
  417. axis : {0}
  418. Returns
  419. -------
  420. cumsum : SparseSeries
  421. """
  422. nv.validate_cumsum(args, kwargs)
  423. # Validate axis
  424. if axis is not None:
  425. self._get_axis_number(axis)
  426. new_array = self.values.cumsum()
  427. return self._constructor(
  428. new_array, index=self.index,
  429. sparse_index=new_array.sp_index).__finalize__(self)
  430. # TODO: SparseSeries.isna is Sparse, while Series.isna is dense
  431. @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs)
  432. def isna(self):
  433. arr = SparseArray(isna(self.values.sp_values),
  434. sparse_index=self.values.sp_index,
  435. fill_value=isna(self.fill_value))
  436. return self._constructor(arr, index=self.index).__finalize__(self)
  437. isnull = isna
  438. @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs)
  439. def notna(self):
  440. arr = SparseArray(notna(self.values.sp_values),
  441. sparse_index=self.values.sp_index,
  442. fill_value=notna(self.fill_value))
  443. return self._constructor(arr, index=self.index).__finalize__(self)
  444. notnull = notna
  445. def dropna(self, axis=0, inplace=False, **kwargs):
  446. """
  447. Analogous to Series.dropna. If fill_value=NaN, returns a dense Series
  448. """
  449. # TODO: make more efficient
  450. # Validate axis
  451. self._get_axis_number(axis or 0)
  452. dense_valid = self.to_dense().dropna()
  453. if inplace:
  454. raise NotImplementedError("Cannot perform inplace dropna"
  455. " operations on a SparseSeries")
  456. if isna(self.fill_value):
  457. return dense_valid
  458. else:
  459. dense_valid = dense_valid[dense_valid != self.fill_value]
  460. return dense_valid.to_sparse(fill_value=self.fill_value)
  461. def combine_first(self, other):
  462. """
  463. Combine Series values, choosing the calling Series's values
  464. first. Result index will be the union of the two indexes
  465. Parameters
  466. ----------
  467. other : Series
  468. Returns
  469. -------
  470. y : Series
  471. """
  472. if isinstance(other, SparseSeries):
  473. other = other.to_dense()
  474. dense_combined = self.to_dense().combine_first(other)
  475. return dense_combined.to_sparse(fill_value=self.fill_value)
  476. @Appender(SparseAccessor.to_coo.__doc__)
  477. def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
  478. A, rows, columns = _sparse_series_to_coo(self, row_levels,
  479. column_levels,
  480. sort_labels=sort_labels)
  481. return A, rows, columns
  482. @classmethod
  483. @Appender(SparseAccessor.from_coo.__doc__)
  484. def from_coo(cls, A, dense_index=False):
  485. return _coo_to_sparse_series(A, dense_index=dense_index)
  486. # overwrite series methods with unaccelerated Sparse-specific versions
  487. ops.add_flex_arithmetic_methods(SparseSeries)
  488. ops.add_special_arithmetic_methods(SparseSeries)