- """
- Data structures for sparse float data. Life is made simpler by dealing only
- with float64 data
- """
- # pylint: disable=E1101,E1103,W0231
- import warnings
- import numpy as np
- import pandas._libs.index as libindex
- import pandas._libs.sparse as splib
- from pandas._libs.sparse import BlockIndex, IntIndex
- import pandas.compat as compat
- from pandas.compat.numpy import function as nv
- from pandas.util._decorators import Appender, Substitution
- from pandas.core.dtypes.common import is_integer, is_scalar
- from pandas.core.dtypes.generic import ABCSeries, ABCSparseSeries
- from pandas.core.dtypes.missing import isna, notna
- from pandas.core import generic
- from pandas.core.arrays import SparseArray
- from pandas.core.arrays.sparse import SparseAccessor
- from pandas.core.index import Index
- from pandas.core.internals import SingleBlockManager
- import pandas.core.ops as ops
- from pandas.core.series import Series
- from pandas.core.sparse.scipy_sparse import (
- _coo_to_sparse_series, _sparse_series_to_coo)
- _shared_doc_kwargs = dict(axes='index', klass='SparseSeries',
- axes_single_arg="{0, 'index'}",
- optional_labels='', optional_axis='')


class SparseSeries(Series):
    """Data structure for labeled, sparse floating point data

    Parameters
    ----------
    data : {array-like, Series, SparseSeries, dict}
        .. versionchanged :: 0.23.0
           If data is a dict, argument order is maintained for Python 3.6
           and later.
    kind : {'block', 'integer'}
    fill_value : float
        Code for missing value. Default depends on dtype:
        0 for int dtype, False for bool dtype, and NaN for other dtypes.
    sparse_index : {BlockIndex, IntIndex}, optional
        Only if you have one. Mainly used internally.

    Notes
    -----
    SparseSeries objects are immutable via the typical Python means. If you
    must change values, convert to dense, make your changes, then convert back
    to sparse.
    """
    _subtyp = 'sparse_series'

    def __init__(self, data=None, index=None, sparse_index=None, kind='block',
                 fill_value=None, name=None, dtype=None, copy=False,
                 fastpath=False):
        # TODO: Most of this should be refactored and shared with Series
        # 1. BlockManager -> array
        # 2. Series.index, Series.name, index, name reconciliation
        # 3. Implicit reindexing
        # 4. Implicit broadcasting
        # 5. Dict construction
        if data is None:
            data = []
        elif isinstance(data, SingleBlockManager):
            index = data.index
            data = data.blocks[0].values
        elif isinstance(data, (ABCSeries, ABCSparseSeries)):
            index = data.index if index is None else index
            dtype = data.dtype if dtype is None else dtype
            name = data.name if name is None else name

            if index is not None:
                data = data.reindex(index)

        elif isinstance(data, compat.Mapping):
            data, index = Series()._init_dict(data, index=index)

        elif is_scalar(data) and index is not None:
            data = np.full(len(index), fill_value=data)

        super(SparseSeries, self).__init__(
            SparseArray(data,
                        sparse_index=sparse_index,
                        kind=kind,
                        dtype=dtype,
                        fill_value=fill_value,
                        copy=copy),
            index=index, name=name,
            copy=False, fastpath=fastpath
        )
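
    # Editorial usage sketch (not part of the original module): direct
    # construction, assuming the 0.24-era API where NaN is the default
    # fill_value for float data and ``import pandas as pd`` / ``numpy as np``.
    #
    #   ss = pd.SparseSeries([1.0, np.nan, 3.0], index=['a', 'b', 'c'],
    #                        kind='block')
    #   ss.fill_value   # nan
    #   ss.to_dense()   # plain Series with the same index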

    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
        # avoid infinite recursion for other SparseSeries inputs
        inputs = tuple(
            x.values if isinstance(x, type(self)) else x
            for x in inputs
        )
        result = self.values.__array_ufunc__(ufunc, method, *inputs, **kwargs)
        return self._constructor(result, index=self.index,
                                 sparse_index=self.sp_index,
                                 fill_value=result.fill_value,
                                 copy=False).__finalize__(self)

    def __array_wrap__(self, result, context=None):
        """
        Gets called prior to a ufunc (and after)

        See SparseArray.__array_wrap__ for detail.
        """
        result = self.values.__array_wrap__(result, context=context)
        return self._constructor(result, index=self.index,
                                 sparse_index=self.sp_index,
                                 fill_value=result.fill_value,
                                 copy=False).__finalize__(self)

    def __array_finalize__(self, obj):
        """
        Gets called after any ufunc or other array operations, necessary
        to pass on the index.
        """
        self.name = getattr(obj, 'name', None)
        self.fill_value = getattr(obj, 'fill_value', None)

    # unary ops
    # TODO: See if this can be shared
    def __pos__(self):
        result = self.values.__pos__()
        return self._constructor(result, index=self.index,
                                 sparse_index=self.sp_index,
                                 fill_value=result.fill_value,
                                 copy=False).__finalize__(self)

    def __neg__(self):
        result = self.values.__neg__()
        return self._constructor(result, index=self.index,
                                 sparse_index=self.sp_index,
                                 fill_value=result.fill_value,
                                 copy=False).__finalize__(self)

    def __invert__(self):
        result = self.values.__invert__()
        return self._constructor(result, index=self.index,
                                 sparse_index=self.sp_index,
                                 fill_value=result.fill_value,
                                 copy=False).__finalize__(self)

    @property
    def block(self):
        warnings.warn("SparseSeries.block is deprecated.", FutureWarning,
                      stacklevel=2)
        return self._data._block

    @property
    def fill_value(self):
        return self.values.fill_value

    @fill_value.setter
    def fill_value(self, v):
        self.values.fill_value = v

    @property
    def sp_index(self):
        return self.values.sp_index

    @property
    def sp_values(self):
        return self.values.sp_values

    @property
    def npoints(self):
        return self.values.npoints
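
    # Editorial sketch of the storage-introspection properties above, with
    # illustrative values and the default NaN fill_value:
    #
    #   ss = pd.SparseSeries([1.0, np.nan, 3.0, np.nan])
    #   ss.sp_values   # array([1., 3.])  -- only the non-fill values
    #   ss.npoints     # 2                -- count of stored values
    #   ss.sp_index    # BlockIndex/IntIndex describing their positions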

    @classmethod
    def from_array(cls, arr, index=None, name=None, copy=False,
                   fill_value=None, fastpath=False):
        """Construct SparseSeries from array.

        .. deprecated:: 0.23.0
            Use the pd.SparseSeries(..) constructor instead.
        """
        warnings.warn("'from_array' is deprecated and will be removed in a "
                      "future version. Please use the pd.SparseSeries(..) "
                      "constructor instead.", FutureWarning, stacklevel=2)
        return cls(arr, index=index, name=name, copy=copy,
                   fill_value=fill_value, fastpath=fastpath)

    @property
    def _constructor(self):
        return SparseSeries

    @property
    def _constructor_expanddim(self):
        from pandas.core.sparse.api import SparseDataFrame
        return SparseDataFrame

    @property
    def kind(self):
        if isinstance(self.sp_index, BlockIndex):
            return 'block'
        elif isinstance(self.sp_index, IntIndex):
            return 'integer'

    def as_sparse_array(self, kind=None, fill_value=None, copy=False):
        """ return my self as a sparse array, do not copy by default """

        if fill_value is None:
            fill_value = self.fill_value
        if kind is None:
            kind = self.kind
        return SparseArray(self.values, sparse_index=self.sp_index,
                           fill_value=fill_value, kind=kind, copy=copy)
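
    # Editorial sketch: as_sparse_array drops the labels and returns the
    # underlying SparseArray, keeping the sparse layout and fill_value:
    #
    #   arr = ss.as_sparse_array()
    #   isinstance(arr, pd.SparseArray)   # True
    #   arr.fill_value == ss.fill_value   # True (NaN compares via isna)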

    def __unicode__(self):
        # currently, unicode is same as repr...fixes infinite loop
        series_rep = Series.__unicode__(self)
        rep = '{series}\n{index!r}'.format(series=series_rep,
                                           index=self.sp_index)
        return rep

    def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
                filter_type=None, **kwds):
        """ perform a reduction operation """
        return op(self.get_values(), skipna=skipna, **kwds)

    def __getstate__(self):
        # pickling
        return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data,
                    fill_value=self.fill_value, name=self.name)

    def _unpickle_series_compat(self, state):

        nd_state, own_state = state

        # recreate the ndarray
        data = np.empty(nd_state[1], dtype=nd_state[2])
        np.ndarray.__setstate__(data, nd_state)

        index, fill_value, sp_index = own_state[:3]
        name = None
        if len(own_state) > 3:
            name = own_state[3]

        # create a sparse array
        if not isinstance(data, SparseArray):
            data = SparseArray(data, sparse_index=sp_index,
                               fill_value=fill_value, copy=False)

        # recreate
        data = SingleBlockManager(data, index, fastpath=True)
        generic.NDFrame.__init__(self, data)

        self._set_axis(0, index)
        self.name = name

    def _set_subtyp(self, is_all_dates):
        if is_all_dates:
            object.__setattr__(self, '_subtyp', 'sparse_time_series')
        else:
            object.__setattr__(self, '_subtyp', 'sparse_series')

    def _ixs(self, i, axis=0):
        """
        Return the i-th value or values in the SparseSeries by location

        Parameters
        ----------
        i : int, slice, or sequence of integers

        Returns
        -------
        value : scalar (int) or Series (slice, sequence)
        """
        label = self.index[i]
        if isinstance(label, Index):
            return self.take(i, axis=axis)
        else:
            return self._get_val_at(i)

    def _get_val_at(self, loc):
        """ forward to the array """
        return self.values._get_val_at(loc)

    def __getitem__(self, key):
        # TODO: Document difference from Series.__getitem__, deprecate,
        # and remove!
        if is_integer(key) and key not in self.index:
            return self._get_val_at(key)
        else:
            return super(SparseSeries, self).__getitem__(key)

    def _get_values(self, indexer):
        try:
            return self._constructor(self._data.get_slice(indexer),
                                     fastpath=True).__finalize__(self)
        except Exception:
            return self[indexer]

    def _set_with_engine(self, key, value):
        return self._set_value(key, value)

    def abs(self):
        """
        Return an object with absolute value taken. Only applicable to objects
        that are all numeric.

        Returns
        -------
        abs : same type as caller
        """
        return self._constructor(np.abs(self.values),
                                 index=self.index).__finalize__(self)

    def get(self, label, default=None):
        """
        Return the value at the requested label, falling back to the
        specified default if the label is not present. Analogous to dict.get.

        Parameters
        ----------
        label : object
            Label value to look for
        default : object, optional
            Value to return if label not in index

        Returns
        -------
        y : scalar
        """
        if label in self.index:
            loc = self.index.get_loc(label)
            return self._get_val_at(loc)
        else:
            return default
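
    # Editorial sketch of label-based .get(), mirroring dict.get
    # (labels and values are illustrative):
    #
    #   ss = pd.SparseSeries([1.0, np.nan], index=['a', 'b'])
    #   ss.get('a')         # 1.0
    #   ss.get('z')         # None
    #   ss.get('z', -1.0)   # -1.0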

    def get_value(self, label, takeable=False):
        """
        Retrieve single value at passed index label

        .. deprecated:: 0.21.0
            Please use .at[] or .iat[] accessors.

        Parameters
        ----------
        index : label
        takeable : interpret the index as indexers, default False

        Returns
        -------
        value : scalar value
        """
        warnings.warn("get_value is deprecated and will be removed "
                      "in a future release. Please use "
                      ".at[] or .iat[] accessors instead", FutureWarning,
                      stacklevel=2)

        return self._get_value(label, takeable=takeable)

    def _get_value(self, label, takeable=False):
        loc = label if takeable is True else self.index.get_loc(label)
        return self._get_val_at(loc)
    _get_value.__doc__ = get_value.__doc__

    def set_value(self, label, value, takeable=False):
        """
        Quickly set single value at passed label. If label is not contained, a
        new object is created with the label placed at the end of the result
        index.

        .. deprecated:: 0.21.0
            Please use .at[] or .iat[] accessors.

        Parameters
        ----------
        label : object
            Partial indexing with MultiIndex not allowed
        value : object
            Scalar value
        takeable : interpret the index as indexers, default False

        Notes
        -----
        This method *always* returns a new object. It is not particularly
        efficient but is provided for API compatibility with Series.

        Returns
        -------
        series : SparseSeries
        """
        warnings.warn("set_value is deprecated and will be removed "
                      "in a future release. Please use "
                      ".at[] or .iat[] accessors instead", FutureWarning,
                      stacklevel=2)
        return self._set_value(label, value, takeable=takeable)

    def _set_value(self, label, value, takeable=False):
        values = self.to_dense()

        # if the label doesn't exist, we will create a new object here
        # and possibly change the index
        new_values = values._set_value(label, value, takeable=takeable)
        if new_values is not None:
            values = new_values
        new_index = values.index
        values = SparseArray(values, fill_value=self.fill_value,
                             kind=self.kind)
        self._data = SingleBlockManager(values, new_index)
        self._index = new_index
    _set_value.__doc__ = set_value.__doc__

    def _set_values(self, key, value):

        # this might be inefficient as we have to recreate the sparse array
        # rather than setting individual elements, but have to convert
        # the passed slice/boolean that's in dense space into a sparse indexer
        # not sure how to do that!
        if isinstance(key, Series):
            key = key.values

        values = self.values.to_dense()
        values[key] = libindex.convert_scalar(values, value)
        values = SparseArray(values, fill_value=self.fill_value,
                             kind=self.kind)
        self._data = SingleBlockManager(values, self.index)

    def to_dense(self):
        """
        Convert SparseSeries to a Series.

        Returns
        -------
        s : Series
        """
        return Series(self.values.to_dense(), index=self.index,
                      name=self.name)
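
    # Editorial sketch: to_dense/to_sparse round-trip. Series.to_sparse
    # existed alongside SparseSeries in this era of the API.
    #
    #   dense = ss.to_dense()                    # plain Series, same index/name
    #   sparse_again = dense.to_sparse(fill_value=np.nan)
    #   sparse_again.sp_values                   # same stored values as ss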

    @property
    def density(self):
        return self.values.density

    def copy(self, deep=True):
        """
        Make a copy of the SparseSeries. Only the actual sparse values need to
        be copied.
        """
        # TODO: https://github.com/pandas-dev/pandas/issues/22314
        # We skip the block manager till that is resolved.
        new_data = self.values.copy(deep=deep)
        return self._constructor(new_data, sparse_index=self.sp_index,
                                 fill_value=self.fill_value,
                                 index=self.index.copy(),
                                 name=self.name).__finalize__(self)

    @Substitution(**_shared_doc_kwargs)
    @Appender(generic.NDFrame.reindex.__doc__)
    def reindex(self, index=None, method=None, copy=True, limit=None,
                **kwargs):
        # TODO: remove?
        return super(SparseSeries, self).reindex(index=index, method=method,
                                                 copy=copy, limit=limit,
                                                 **kwargs)

    def sparse_reindex(self, new_index):
        """
        Conform sparse values to new SparseIndex

        Parameters
        ----------
        new_index : {BlockIndex, IntIndex}

        Returns
        -------
        reindexed : SparseSeries
        """
        if not isinstance(new_index, splib.SparseIndex):
            raise TypeError("new index must be a SparseIndex")
        values = self.values
        values = values.sp_index.to_int_index().reindex(
            values.sp_values.astype('float64'), values.fill_value, new_index)
        values = SparseArray(values,
                             sparse_index=new_index,
                             fill_value=self.values.fill_value)
        return self._constructor(values, index=self.index).__finalize__(self)
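
    # Editorial sketch: sparse_reindex expects a SparseIndex object (not a
    # label Index). IntIndex(length, indices) from pandas._libs.sparse is
    # already imported above; the indices are the positions to keep stored.
    # This is illustrative only and assumes the 0.24-era internals.
    #
    #   ss4 = pd.SparseSeries([1.0, np.nan, 3.0, np.nan])
    #   new_sp_index = IntIndex(4, np.array([0, 2], dtype=np.int32))
    #   ss4.sparse_reindex(new_sp_index)   # same labels, new sparse layout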

    def cumsum(self, axis=0, *args, **kwargs):
        """
        Cumulative sum of non-NA/null values.

        When performing the cumulative summation, any NA/null values will
        be skipped. The resulting SparseSeries will preserve the locations of
        NaN values, but the fill value will be `np.nan` regardless.

        Parameters
        ----------
        axis : {0}

        Returns
        -------
        cumsum : SparseSeries
        """
        nv.validate_cumsum(args, kwargs)
        # Validate axis
        if axis is not None:
            self._get_axis_number(axis)

        new_array = self.values.cumsum()

        return self._constructor(
            new_array, index=self.index,
            sparse_index=new_array.sp_index).__finalize__(self)
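
    # Editorial sketch: cumsum skips NaN positions but keeps them in place
    # (illustrative values):
    #
    #   pd.SparseSeries([1.0, np.nan, 2.0]).cumsum()
    #   # 0    1.0
    #   # 1    NaN
    #   # 2    3.0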

    # TODO: SparseSeries.isna is Sparse, while Series.isna is dense
    @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs)
    def isna(self):
        arr = SparseArray(isna(self.values.sp_values),
                          sparse_index=self.values.sp_index,
                          fill_value=isna(self.fill_value))
        return self._constructor(arr, index=self.index).__finalize__(self)
    isnull = isna

    @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs)
    def notna(self):
        arr = SparseArray(notna(self.values.sp_values),
                          sparse_index=self.values.sp_index,
                          fill_value=notna(self.fill_value))
        return self._constructor(arr, index=self.index).__finalize__(self)
    notnull = notna
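
    # Editorial sketch: isna/notna stay sparse -- only the stored values are
    # tested, and the fill_value is mapped through isna/notna itself:
    #
    #   ss = pd.SparseSeries([1.0, np.nan, 3.0])   # fill_value is NaN
    #   ss.isna().fill_value    # True, because isna(np.nan) is True
    #   ss.notna().sp_values    # array([ True,  True])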

    def dropna(self, axis=0, inplace=False, **kwargs):
        """
        Analogous to Series.dropna. If fill_value=NaN, returns a dense Series
        """
        # TODO: make more efficient
        # Validate axis
        self._get_axis_number(axis or 0)
        dense_valid = self.to_dense().dropna()
        if inplace:
            raise NotImplementedError("Cannot perform inplace dropna"
                                      " operations on a SparseSeries")
        if isna(self.fill_value):
            return dense_valid
        else:
            dense_valid = dense_valid[dense_valid != self.fill_value]
            return dense_valid.to_sparse(fill_value=self.fill_value)
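
    # Editorial sketch: with a NaN fill_value the result is a *dense* Series,
    # as the docstring above notes; with a non-NaN fill_value it stays sparse:
    #
    #   pd.SparseSeries([1.0, np.nan, 3.0]).dropna()           # dense Series
    #   pd.SparseSeries([1.0, 0.0], fill_value=0.0).dropna()   # SparseSeries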

    def combine_first(self, other):
        """
        Combine Series values, choosing the calling Series's values
        first. Result index will be the union of the two indexes.

        Parameters
        ----------
        other : Series

        Returns
        -------
        y : Series
        """
        if isinstance(other, SparseSeries):
            other = other.to_dense()

        dense_combined = self.to_dense().combine_first(other)
        return dense_combined.to_sparse(fill_value=self.fill_value)
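
    # Editorial sketch: the combine happens in dense space and the result is
    # re-sparsified with the caller's fill_value (labels are illustrative):
    #
    #   a = pd.SparseSeries([1.0, np.nan], index=['x', 'y'])
    #   b = pd.Series([10.0, 20.0], index=['y', 'z'])
    #   a.combine_first(b)    # SparseSeries over index ['x', 'y', 'z']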

    @Appender(SparseAccessor.to_coo.__doc__)
    def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
        A, rows, columns = _sparse_series_to_coo(self, row_levels,
                                                 column_levels,
                                                 sort_labels=sort_labels)
        return A, rows, columns

    @classmethod
    @Appender(SparseAccessor.from_coo.__doc__)
    def from_coo(cls, A, dense_index=False):
        return _coo_to_sparse_series(A, dense_index=dense_index)
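
    # Editorial sketch of the scipy.sparse round-trip, assuming scipy is
    # installed and a two-level MultiIndex (level 0 -> rows, level 1 -> cols):
    #
    #   s = pd.Series([3.0, np.nan, 2.0],
    #                 index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 1)]))
    #   ss = s.to_sparse()
    #   A, rows, cols = ss.to_coo(row_levels=[0], column_levels=[1])
    #   pd.SparseSeries.from_coo(A)   # back to a MultiIndexed SparseSeries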


# overwrite series methods with unaccelerated Sparse-specific versions
ops.add_flex_arithmetic_methods(SparseSeries)
ops.add_special_arithmetic_methods(SparseSeries)
|