- """
- SparseArray data structure
- """
- from __future__ import division
- import numbers
- import operator
- import re
- import warnings
- import numpy as np
- from pandas._libs import index as libindex, lib
- import pandas._libs.sparse as splib
- from pandas._libs.sparse import BlockIndex, IntIndex
- from pandas._libs.tslibs import NaT
- import pandas.compat as compat
- from pandas.compat.numpy import function as nv
- from pandas.errors import PerformanceWarning
- from pandas.core.dtypes.base import ExtensionDtype
- from pandas.core.dtypes.cast import (
- astype_nansafe, construct_1d_arraylike_from_scalar, find_common_type,
- infer_dtype_from_scalar, maybe_convert_platform)
- from pandas.core.dtypes.common import (
- is_array_like, is_bool_dtype, is_datetime64_any_dtype, is_dtype_equal,
- is_integer, is_list_like, is_object_dtype, is_scalar, is_string_dtype,
- pandas_dtype)
- from pandas.core.dtypes.dtypes import register_extension_dtype
- from pandas.core.dtypes.generic import (
- ABCIndexClass, ABCSeries, ABCSparseSeries)
- from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna
- from pandas.core.accessor import PandasDelegate, delegate_names
- import pandas.core.algorithms as algos
- from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
- from pandas.core.base import PandasObject
- import pandas.core.common as com
- from pandas.core.missing import interpolate_2d
- import pandas.io.formats.printing as printing
- # ----------------------------------------------------------------------------
- # Dtype
- @register_extension_dtype
- class SparseDtype(ExtensionDtype):
- """
- Dtype for data stored in :class:`SparseArray`.
- This dtype implements the pandas ExtensionDtype interface.
- .. versionadded:: 0.24.0
- Parameters
- ----------
- dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
- The dtype of the underlying array storing the non-fill value values.
- fill_value : scalar, optional
- The scalar value not stored in the SparseArray. By default, this
- depends on `dtype`.
- =========== ==========
- dtype na_value
- =========== ==========
- float ``np.nan``
- int ``0``
- bool ``False``
- datetime64 ``pd.NaT``
- timedelta64 ``pd.NaT``
- =========== ==========
- The default value may be overridden by specifying a `fill_value`.
- """
- # We include `_is_na_fill_value` in the metadata to avoid hash collisions
- # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
- # Without is_na_fill_value in the comparison, those would be equal since
- # hash(nan) is (sometimes?) 0.
- _metadata = ('_dtype', '_fill_value', '_is_na_fill_value')
- def __init__(self, dtype=np.float64, fill_value=None):
- # type: (Union[str, np.dtype, 'ExtensionDtype', type], Any) -> None
- from pandas.core.dtypes.missing import na_value_for_dtype
- from pandas.core.dtypes.common import (
- pandas_dtype, is_string_dtype, is_scalar
- )
- if isinstance(dtype, type(self)):
- if fill_value is None:
- fill_value = dtype.fill_value
- dtype = dtype.subtype
- dtype = pandas_dtype(dtype)
- if is_string_dtype(dtype):
- dtype = np.dtype('object')
- if fill_value is None:
- fill_value = na_value_for_dtype(dtype)
- if not is_scalar(fill_value):
- raise ValueError("fill_value must be a scalar. Got {} "
- "instead".format(fill_value))
- self._dtype = dtype
- self._fill_value = fill_value
- def __hash__(self):
- # Python3 doesn't inherit __hash__ when a base class overrides
- # __eq__, so we explicitly do it here.
- return super(SparseDtype, self).__hash__()
- def __eq__(self, other):
- # We have to override __eq__ to handle NA values in _metadata.
- # The base class does simple == checks, which fail for NA.
- if isinstance(other, compat.string_types):
- try:
- other = self.construct_from_string(other)
- except TypeError:
- return False
- if isinstance(other, type(self)):
- subtype = self.subtype == other.subtype
- if self._is_na_fill_value:
- # this case is complicated by two things:
- # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
- # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT)
- # i.e. we want to treat any floating-point NaN as equal, but
- # not a floating-point NaN and a datetime NaT.
- fill_value = (
- other._is_na_fill_value and
- isinstance(self.fill_value, type(other.fill_value)) or
- isinstance(other.fill_value, type(self.fill_value))
- )
- else:
- fill_value = self.fill_value == other.fill_value
- return subtype and fill_value
- return False
- @property
- def fill_value(self):
- """
- The fill value of the array.
- Converting the SparseArray to a dense ndarray will fill the
- array with this value.
- .. warning::
- It's possible to end up with a SparseArray that has ``fill_value``
- values in ``sp_values``. This can occur, for example, when setting
- ``SparseArray.fill_value`` directly.
- """
- return self._fill_value
- @property
- def _is_na_fill_value(self):
- from pandas.core.dtypes.missing import isna
- return isna(self.fill_value)
- @property
- def _is_numeric(self):
- from pandas.core.dtypes.common import is_object_dtype
- return not is_object_dtype(self.subtype)
- @property
- def _is_boolean(self):
- from pandas.core.dtypes.common import is_bool_dtype
- return is_bool_dtype(self.subtype)
- @property
- def kind(self):
- """
- The kind character of the subtype (e.g. 'i', 'f'), per NumPy's ``dtype.kind``.
- """
- return self.subtype.kind
- @property
- def type(self):
- return self.subtype.type
- @property
- def subtype(self):
- return self._dtype
- @property
- def name(self):
- return 'Sparse[{}, {}]'.format(self.subtype.name, self.fill_value)
- def __repr__(self):
- return self.name
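- # Illustrative usage, not part of the original module: constructing a
- # SparseDtype and reading its derived name. The int64 subtype shown is
- # platform-dependent (it may be int32 on Windows).
- # >>> SparseDtype(int)
- # Sparse[int64, 0]
- # >>> SparseDtype(float, fill_value=0.0) == SparseDtype(float)
- # False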
- @classmethod
- def construct_array_type(cls):
- return SparseArray
- @classmethod
- def construct_from_string(cls, string):
- """
- Construct a SparseDtype from a string form.
- Parameters
- ----------
- string : str
- Can take the following forms.
- ================ ============================
- string dtype
- ================ ============================
- 'int' SparseDtype[np.int64, 0]
- 'Sparse' SparseDtype[np.float64, nan]
- 'Sparse[int]' SparseDtype[np.int64, 0]
- 'Sparse[int, 0]' SparseDtype[np.int64, 0]
- ================ ============================
- It is not possible to specify non-default fill values
- with a string. An argument like ``'Sparse[int, 1]'``
- will raise a ``TypeError`` because the default fill value
- for integers is 0.
- Returns
- -------
- SparseDtype
- """
- msg = "Could not construct SparseDtype from '{}'".format(string)
- if string.startswith("Sparse"):
- try:
- sub_type, has_fill_value = cls._parse_subtype(string)
- result = SparseDtype(sub_type)
- except Exception:
- raise TypeError(msg)
- else:
- msg = ("Could not construct SparseDtype from '{}'.\n\nIt "
- "looks like the fill_value in the string is not "
- "the default for the dtype. Non-default fill_values "
- "are not supported. Use the 'SparseDtype()' "
- "constructor instead.")
- if has_fill_value and str(result) != string:
- raise TypeError(msg.format(string))
- return result
- else:
- raise TypeError(msg)
- @staticmethod
- def _parse_subtype(dtype):
- """
- Parse a string to get the subtype
- Parameters
- ----------
- dtype : str
- A string like
- * Sparse[subtype]
- * Sparse[subtype, fill_value]
- Returns
- -------
- subtype : str
- Raises
- ------
- ValueError
- When the subtype cannot be extracted.
- """
- xpr = re.compile(
- r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$"
- )
- m = xpr.match(dtype)
- has_fill_value = False
- if m:
- subtype = m.groupdict()['subtype']
- has_fill_value = m.groupdict()['fill_value'] or has_fill_value
- elif dtype == "Sparse":
- subtype = 'float64'
- else:
- raise ValueError("Cannot parse {}".format(dtype))
- return subtype, has_fill_value
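- # Sketch of the parse behavior above (illustrative doctest, not in the
- # original source); the second element is truthy when a fill value was
- # present in the string.
- # >>> SparseDtype._parse_subtype("Sparse[int, 0]")
- # ('int', '0')
- # >>> SparseDtype._parse_subtype("Sparse")
- # ('float64', False)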
- @classmethod
- def is_dtype(cls, dtype):
- dtype = getattr(dtype, 'dtype', dtype)
- if (isinstance(dtype, compat.string_types) and
- dtype.startswith("Sparse")):
- sub_type, _ = cls._parse_subtype(dtype)
- dtype = np.dtype(sub_type)
- elif isinstance(dtype, cls):
- return True
- return isinstance(dtype, np.dtype) or dtype == 'Sparse'
- def update_dtype(self, dtype):
- """
- Convert the SparseDtype to a new dtype.
- This takes care of converting the ``fill_value``.
- Parameters
- ----------
- dtype : Union[str, numpy.dtype, SparseDtype]
- The new dtype to use.
- * For a SparseDtype, it is simply returned
- * For a NumPy dtype (or str), the current fill value
- is converted to the new dtype, and a SparseDtype
- with `dtype` and the new fill value is returned.
- Returns
- -------
- SparseDtype
- A new SparseDtype with the correct `dtype` and fill value
- for that `dtype`.
- Raises
- ------
- ValueError
- When the current fill value cannot be converted to the
- new `dtype` (e.g. trying to convert ``np.nan`` to an
- integer dtype).
- Examples
- --------
- >>> SparseDtype(int, 0).update_dtype(float)
- Sparse[float64, 0.0]
- >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
- Sparse[float64, nan]
- """
- cls = type(self)
- dtype = pandas_dtype(dtype)
- if not isinstance(dtype, cls):
- fill_value = astype_nansafe(np.array(self.fill_value),
- dtype).item()
- dtype = cls(dtype, fill_value=fill_value)
- return dtype
- @property
- def _subtype_with_str(self):
- """
- Whether the SparseDtype's subtype should be considered ``str``.
- Typically, pandas will store string data in an object-dtype array.
- When converting values to a dtype, e.g. in ``.astype``, we need to
- be more specific: we need the actual underlying type.
- Returns
- -------
- type or numpy.dtype
- Examples
- --------
- >>> SparseDtype(int, 1)._subtype_with_str
- dtype('int64')
- >>> SparseDtype(object, 1)._subtype_with_str
- dtype('O')
- >>> dtype = SparseDtype(str, '')
- >>> dtype.subtype
- dtype('O')
- >>> dtype._subtype_with_str
- str
- """
- if isinstance(self.fill_value, compat.string_types):
- return type(self.fill_value)
- return self.subtype
- # ----------------------------------------------------------------------------
- # Array
- _sparray_doc_kwargs = dict(klass='SparseArray')
- def _get_fill(arr):
- # type: (SparseArray) -> ndarray
- """
- Create a 0-dim ndarray containing the fill value
- Parameters
- ----------
- arr : SparseArray
- Returns
- -------
- fill_value : ndarray
- 0-dim ndarray with just the fill value.
- Notes
- -----
- Coerce fill_value to arr dtype if possible; an int64 SparseArray can
- have NaN as its fill_value if there are no missing values.
- """
- try:
- return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
- except ValueError:
- return np.asarray(arr.fill_value)
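- # Illustrative: the fill value is coerced to the subtype when possible,
- # and falls back to its own dtype otherwise (e.g. NaN with an int64
- # subtype).
- # >>> _get_fill(SparseArray([1, 0, 2], fill_value=0)).dtype
- # dtype('int64')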
- def _sparse_array_op(left, right, op, name):
- """
- Perform a binary operation between two arrays.
- Parameters
- ----------
- left : Union[SparseArray, ndarray]
- right : Union[SparseArray, ndarray]
- op : Callable
- The binary operation to perform
- name : str
- Name of the callable.
- Returns
- -------
- SparseArray
- """
- # type: (SparseArray, SparseArray, Callable, str) -> Any
- if name.startswith('__'):
- # For lookups in _libs.sparse we need non-dunder op name
- name = name[2:-2]
- # dtype used to find corresponding sparse method
- ltype = left.dtype.subtype
- rtype = right.dtype.subtype
- if not is_dtype_equal(ltype, rtype):
- subtype = find_common_type([ltype, rtype])
- ltype = SparseDtype(subtype, left.fill_value)
- rtype = SparseDtype(subtype, right.fill_value)
- # TODO(GH-23092): pass copy=False. Need to fix astype_nansafe
- left = left.astype(ltype)
- right = right.astype(rtype)
- dtype = ltype.subtype
- else:
- dtype = ltype
- # dtype the result must have
- result_dtype = None
- if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
- with np.errstate(all='ignore'):
- result = op(left.get_values(), right.get_values())
- fill = op(_get_fill(left), _get_fill(right))
- if left.sp_index.ngaps == 0:
- index = left.sp_index
- else:
- index = right.sp_index
- elif left.sp_index.equals(right.sp_index):
- with np.errstate(all='ignore'):
- result = op(left.sp_values, right.sp_values)
- fill = op(_get_fill(left), _get_fill(right))
- index = left.sp_index
- else:
- if name[0] == 'r':
- left, right = right, left
- name = name[1:]
- if name in ('and', 'or') and dtype == 'bool':
- opname = 'sparse_{name}_uint8'.format(name=name)
- # to make template simple, cast here
- left_sp_values = left.sp_values.view(np.uint8)
- right_sp_values = right.sp_values.view(np.uint8)
- result_dtype = np.bool
- else:
- opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype)
- left_sp_values = left.sp_values
- right_sp_values = right.sp_values
- sparse_op = getattr(splib, opname)
- with np.errstate(all='ignore'):
- result, index, fill = sparse_op(
- left_sp_values, left.sp_index, left.fill_value,
- right_sp_values, right.sp_index, right.fill_value)
- if result_dtype is None:
- result_dtype = result.dtype
- return _wrap_result(name, result, index, fill, dtype=result_dtype)
- def _wrap_result(name, data, sparse_index, fill_value, dtype=None):
- """
- wrap op result to have correct dtype
- """
- if name.startswith('__'):
- # e.g. __eq__ --> eq
- name = name[2:-2]
- if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
- dtype = np.bool
- fill_value = lib.item_from_zerodim(fill_value)
- if is_bool_dtype(dtype):
- # fill_value may be np.bool_
- fill_value = bool(fill_value)
- return SparseArray(data,
- sparse_index=sparse_index,
- fill_value=fill_value,
- dtype=dtype)
- class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin):
- """
- An ExtensionArray for storing sparse data.
- .. versionchanged:: 0.24.0
- Implements the ExtensionArray interface.
- Parameters
- ----------
- data : array-like
- A dense array of values to store in the SparseArray. This may contain
- `fill_value`.
- sparse_index : SparseIndex, optional
- index : Index
- fill_value : scalar, optional
- Elements in `data` that are `fill_value` are not stored in the
- SparseArray. For memory savings, this should be the most common value
- in `data`. By default, `fill_value` depends on the dtype of `data`:
- =========== ==========
- data.dtype na_value
- =========== ==========
- float ``np.nan``
- int ``0``
- bool ``False``
- datetime64 ``pd.NaT``
- timedelta64 ``pd.NaT``
- =========== ==========
- The fill value is potentially specified in three ways. In order of
- precedence, these are
- 1. The `fill_value` argument
- 2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
- a ``SparseDtype``
- 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
- is not a ``SparseDtype`` and `data` is a ``SparseArray``.
- kind : {'integer', 'block'}, default 'integer'
- The type of storage for sparse locations.
- * 'block': Stores a `block` and `block_length` for each
- contiguous *span* of sparse values. This is best when
- sparse data tends to be clumped together, with large
- regions of ``fill_value`` values between sparse values.
- * 'integer': uses an integer to store the location of
- each sparse value.
- dtype : np.dtype or SparseDtype, optional
- The dtype to use for the SparseArray. For numpy dtypes, this
- determines the dtype of ``self.sp_values``. For SparseDtype,
- this determines ``self.sp_values`` and ``self.fill_value``.
- copy : bool, default False
- Whether to explicitly copy the incoming `data` array.
- """
- __array_priority__ = 15
- _pandas_ftype = 'sparse'
- _subtyp = 'sparse_array' # register ABCSparseArray
- def __init__(self, data, sparse_index=None, index=None, fill_value=None,
- kind='integer', dtype=None, copy=False):
- from pandas.core.internals import SingleBlockManager
- if isinstance(data, SingleBlockManager):
- data = data.internal_values()
- if fill_value is None and isinstance(dtype, SparseDtype):
- fill_value = dtype.fill_value
- if isinstance(data, (type(self), ABCSparseSeries)):
- # disable normal inference on dtype, sparse_index, & fill_value
- if sparse_index is None:
- sparse_index = data.sp_index
- if fill_value is None:
- fill_value = data.fill_value
- if dtype is None:
- dtype = data.dtype
- # TODO: make kind=None, and use data.kind?
- data = data.sp_values
- # Handle user-provided dtype
- if isinstance(dtype, compat.string_types):
- # Two options: dtype='int', regular numpy dtype
- # or dtype='Sparse[int]', a sparse dtype
- try:
- dtype = SparseDtype.construct_from_string(dtype)
- except TypeError:
- dtype = pandas_dtype(dtype)
- if isinstance(dtype, SparseDtype):
- if fill_value is None:
- fill_value = dtype.fill_value
- dtype = dtype.subtype
- if index is not None and not is_scalar(data):
- raise Exception("must only pass scalars with an index ")
- if is_scalar(data):
- if index is not None:
- if data is None:
- data = np.nan
- npoints = len(index)
- elif sparse_index is None:
- npoints = 1
- else:
- npoints = sparse_index.length
- dtype = infer_dtype_from_scalar(data)[0]
- data = construct_1d_arraylike_from_scalar(
- data, npoints, dtype
- )
- if dtype is not None:
- dtype = pandas_dtype(dtype)
- # TODO: disentangle the fill_value dtype inference from
- # dtype inference
- if data is None:
- # XXX: What should the empty dtype be? Object or float?
- data = np.array([], dtype=dtype)
- if not is_array_like(data):
- try:
- # probably shared code in sanitize_series
- from pandas.core.internals.construction import sanitize_array
- data = sanitize_array(data, index=None)
- except ValueError:
- # NumPy may raise a ValueError on data like [1, []]
- # we retry with object dtype here.
- if dtype is None:
- dtype = object
- data = np.atleast_1d(np.asarray(data, dtype=dtype))
- else:
- raise
- if copy:
- # TODO: avoid double copy when dtype forces cast.
- data = data.copy()
- if fill_value is None:
- fill_value_dtype = data.dtype if dtype is None else dtype
- if fill_value_dtype is None:
- fill_value = np.nan
- else:
- fill_value = na_value_for_dtype(fill_value_dtype)
- if isinstance(data, type(self)) and sparse_index is None:
- sparse_index = data._sparse_index
- sparse_values = np.asarray(data.sp_values, dtype=dtype)
- elif sparse_index is None:
- sparse_values, sparse_index, fill_value = make_sparse(
- data, kind=kind, fill_value=fill_value, dtype=dtype
- )
- else:
- sparse_values = np.asarray(data, dtype=dtype)
- if len(sparse_values) != sparse_index.npoints:
- raise AssertionError("Non array-like type {type} must "
- "have the same length as the index"
- .format(type=type(sparse_values)))
- self._sparse_index = sparse_index
- self._sparse_values = sparse_values
- self._dtype = SparseDtype(sparse_values.dtype, fill_value)
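- # Illustrative constructor behavior (not in the original source): dense
- # input is compressed against the fill value, which defaults to the na
- # value for the inferred dtype (0 for integers).
- # >>> arr = SparseArray([0, 0, 1, 2])
- # >>> arr.sp_values
- # array([1, 2])
- # >>> arr.fill_value
- # 0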
- @classmethod
- def _simple_new(cls, sparse_array, sparse_index, dtype):
- # type: (np.ndarray, SparseIndex, SparseDtype) -> 'SparseArray'
- new = cls([])
- new._sparse_index = sparse_index
- new._sparse_values = sparse_array
- new._dtype = dtype
- return new
- def __array__(self, dtype=None, copy=True):
- fill_value = self.fill_value
- if self.sp_index.ngaps == 0:
- # Compat for na dtype and int values.
- return self.sp_values
- if dtype is None:
- # Can NumPy represent this type?
- # If not, `np.result_type` will raise. We catch that
- # and return object.
- if is_datetime64_any_dtype(self.sp_values.dtype):
- # However, we *do* special-case the common case of
- # a datetime64 with pandas NaT.
- if fill_value is NaT:
- # Can't put pd.NaT in a datetime64[ns]
- fill_value = np.datetime64('NaT')
- try:
- dtype = np.result_type(self.sp_values.dtype, type(fill_value))
- except TypeError:
- dtype = object
- out = np.full(self.shape, fill_value, dtype=dtype)
- out[self.sp_index.to_int_index().indices] = self.sp_values
- return out
- def __setitem__(self, key, value):
- # I suppose we could allow setting of non-fill_value elements.
- # TODO(SparseArray.__setitem__): remove special cases in
- # ExtensionBlock.where
- msg = "SparseArray does not support item assignment via setitem"
- raise TypeError(msg)
- @classmethod
- def _from_sequence(cls, scalars, dtype=None, copy=False):
- return cls(scalars, dtype=dtype)
- @classmethod
- def _from_factorized(cls, values, original):
- return cls(values, dtype=original.dtype)
- # ------------------------------------------------------------------------
- # Data
- # ------------------------------------------------------------------------
- @property
- def sp_index(self):
- """
- The SparseIndex containing the location of non- ``fill_value`` points.
- """
- return self._sparse_index
- @property
- def sp_values(self):
- """
- An ndarray containing the non- ``fill_value`` values.
- Examples
- --------
- >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
- >>> s.sp_values
- array([1, 2])
- """
- return self._sparse_values
- @property
- def dtype(self):
- return self._dtype
- @property
- def fill_value(self):
- """
- Elements in `data` that are `fill_value` are not stored.
- For memory savings, this should be the most common value in the array.
- """
- return self.dtype.fill_value
- @fill_value.setter
- def fill_value(self, value):
- self._dtype = SparseDtype(self.dtype.subtype, value)
- @property
- def kind(self):
- """
- The kind of sparse index for this array. One of {'integer', 'block'}.
- """
- if isinstance(self.sp_index, IntIndex):
- return 'integer'
- else:
- return 'block'
- @property
- def _valid_sp_values(self):
- sp_vals = self.sp_values
- mask = notna(sp_vals)
- return sp_vals[mask]
- def __len__(self):
- return self.sp_index.length
- @property
- def _null_fill_value(self):
- return self._dtype._is_na_fill_value
- def _fill_value_matches(self, fill_value):
- if self._null_fill_value:
- return isna(fill_value)
- else:
- return self.fill_value == fill_value
- @property
- def nbytes(self):
- return self.sp_values.nbytes + self.sp_index.nbytes
- @property
- def density(self):
- """
- The percent of non- ``fill_value`` points, as decimal.
- Examples
- --------
- >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
- >>> s.density
- 0.6
- """
- r = float(self.sp_index.npoints) / float(self.sp_index.length)
- return r
- @property
- def npoints(self):
- """
- The number of non- ``fill_value`` points.
- Examples
- --------
- >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
- >>> s.npoints
- 3
- """
- return self.sp_index.npoints
- @property
- def values(self):
- """
- Dense values
- """
- return self.to_dense()
- def isna(self):
- from pandas import isna
- # If null fill value, we want SparseDtype[bool, true]
- # to preserve the same memory usage.
- dtype = SparseDtype(bool, self._null_fill_value)
- return type(self)._simple_new(isna(self.sp_values),
- self.sp_index, dtype)
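- # Illustrative: with a NaN fill value the mask reuses the sparse index,
- # so it stays as compact as the original array.
- # >>> SparseArray([1.0, np.nan]).isna().dtype
- # Sparse[bool, True]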
- def fillna(self, value=None, method=None, limit=None):
- """
- Fill missing values with `value`.
- Parameters
- ----------
- value : scalar, optional
- method : str, optional
- .. warning::
- Using 'method' will result in high memory use, because the
- array is converted to a dense in-memory ndarray before filling.
- limit : int, optional
- Returns
- -------
- SparseArray
- Notes
- -----
- When `value` is specified, the result's ``fill_value`` depends on
- ``self.fill_value``. The goal is to maintain low-memory use.
- If ``self.fill_value`` is NA, the result dtype will be
- ``SparseDtype(self.dtype, fill_value=value)``. This preserves the
- amount of memory used before and after filling.
- When ``self.fill_value`` is not NA, the result dtype will be
- ``self.dtype``. Again, this preserves the amount of memory used.
- """
- if ((method is None and value is None) or
- (method is not None and value is not None)):
- raise ValueError("Must specify one of 'method' or 'value'.")
- elif method is not None:
- msg = "fillna with 'method' requires high memory usage."
- warnings.warn(msg, PerformanceWarning)
- filled = interpolate_2d(np.asarray(self), method=method,
- limit=limit)
- return type(self)(filled, fill_value=self.fill_value)
- else:
- new_values = np.where(isna(self.sp_values), value, self.sp_values)
- if self._null_fill_value:
- # This is essentially just updating the dtype.
- new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
- else:
- new_dtype = self.dtype
- return self._simple_new(new_values, self._sparse_index, new_dtype)
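- # Illustrative fillna semantics per the Notes above: filling a
- # NaN-filled array only swaps the fill value in the dtype.
- # >>> SparseArray([1.0, np.nan, 2.0]).fillna(0.0).dtype
- # Sparse[float64, 0.0]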
- def shift(self, periods=1, fill_value=None):
- if not len(self) or periods == 0:
- return self.copy()
- if isna(fill_value):
- fill_value = self.dtype.na_value
- subtype = np.result_type(fill_value, self.dtype.subtype)
- if subtype != self.dtype.subtype:
- # just coerce up front
- arr = self.astype(SparseDtype(subtype, self.fill_value))
- else:
- arr = self
- empty = self._from_sequence(
- [fill_value] * min(abs(periods), len(self)),
- dtype=arr.dtype
- )
- if periods > 0:
- a = empty
- b = arr[:-periods]
- else:
- a = arr[abs(periods):]
- b = empty
- return arr._concat_same_type([a, b])
- def _first_fill_value_loc(self):
- """
- Get the location of the first missing value.
- Returns
- -------
- int
- """
- if len(self) == 0 or self.sp_index.npoints == len(self):
- return -1
- indices = self.sp_index.to_int_index().indices
- if not len(indices) or indices[0] > 0:
- return 0
- diff = indices[1:] - indices[:-1]
- return np.searchsorted(diff, 2) + 1
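- # Sketch of the gap search above (illustrative): the first jump of more
- # than one between stored indices marks where fill_value first occurs.
- # >>> SparseArray([0, 1, 2], fill_value=0)._first_fill_value_loc()
- # 0
- # >>> SparseArray([1, 0, 2], fill_value=0)._first_fill_value_loc()
- # 1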
- def unique(self):
- uniques = list(algos.unique(self.sp_values))
- fill_loc = self._first_fill_value_loc()
- if fill_loc >= 0:
- uniques.insert(fill_loc, self.fill_value)
- return type(self)._from_sequence(uniques, dtype=self.dtype)
- def _values_for_factorize(self):
- # Still override this for hash_pandas_object
- return np.asarray(self), self.fill_value
- def factorize(self, na_sentinel=-1):
- # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
- # The sparsity on this is backwards from what Sparse would want. Want
- # ExtensionArray.factorize -> Tuple[EA, EA]
- # Given that we have to return a dense array of labels, why bother
- # implementing an efficient factorize?
- labels, uniques = algos.factorize(np.asarray(self),
- na_sentinel=na_sentinel)
- uniques = SparseArray(uniques, dtype=self.dtype)
- return labels, uniques
- def value_counts(self, dropna=True):
- """
- Returns a Series containing counts of unique values.
- Parameters
- ----------
- dropna : boolean, default True
- Don't include counts of NaN, even if NaN is in sp_values.
- Returns
- -------
- counts : Series
- """
- from pandas import Index, Series
- keys, counts = algos._value_counts_arraylike(self.sp_values,
- dropna=dropna)
- fcounts = self.sp_index.ngaps
- if fcounts > 0:
- if self._null_fill_value and dropna:
- pass
- else:
- if self._null_fill_value:
- mask = isna(keys)
- else:
- mask = keys == self.fill_value
- if mask.any():
- counts[mask] += fcounts
- else:
- keys = np.insert(keys, 0, self.fill_value)
- counts = np.insert(counts, 0, fcounts)
- if not isinstance(keys, ABCIndexClass):
- keys = Index(keys)
- result = Series(counts, index=keys)
- return result
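- # Illustrative (exact Series repr may vary): counts for the fill value
- # come from the index gaps rather than from sp_values.
- # >>> SparseArray([0, 0, 1], fill_value=0).value_counts()
- # 0    2
- # 1    1
- # dtype: int64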
- # --------
- # Indexing
- # --------
- def __getitem__(self, key):
- if isinstance(key, tuple):
- if len(key) > 1:
- raise IndexError("too many indices for array.")
- key = key[0]
- if is_integer(key):
- return self._get_val_at(key)
- elif isinstance(key, tuple):
- data_slice = self.values[key]
- elif isinstance(key, slice):
- # special case to preserve dtypes
- if key == slice(None):
- return self.copy()
- # TODO: this logic is surely elsewhere
- # TODO: this could be more efficient
- indices = np.arange(len(self), dtype=np.int32)[key]
- return self.take(indices)
- else:
- # TODO: I think we can avoid densifying when masking a
- # boolean SparseArray with another. Need to look at the
- # key's fill_value for True / False, and then do an intersection
- # on the indices of the sp_values.
- if isinstance(key, SparseArray):
- if is_bool_dtype(key):
- key = key.to_dense()
- else:
- key = np.asarray(key)
- if com.is_bool_indexer(key) and len(self) == len(key):
- return self.take(np.arange(len(key), dtype=np.int32)[key])
- elif hasattr(key, '__len__'):
- return self.take(key)
- else:
- raise ValueError("Cannot slice with '{}'".format(key))
- return type(self)(data_slice, kind=self.kind)
- def _get_val_at(self, loc):
- n = len(self)
- if loc < 0:
- loc += n
- if loc >= n or loc < 0:
- raise IndexError('Out of bounds access')
- sp_loc = self.sp_index.lookup(loc)
- if sp_loc == -1:
- return self.fill_value
- else:
- return libindex.get_value_at(self.sp_values, sp_loc)
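- # Illustrative lookup: positions absent from the sparse index resolve to
- # the fill value.
- # >>> arr = SparseArray([0, 1, 0], fill_value=0)
- # >>> arr[1], arr[0]
- # (1, 0)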
- def take(self, indices, allow_fill=False, fill_value=None):
- if is_scalar(indices):
- raise ValueError("'indices' must be an array, not a "
- "scalar '{}'.".format(indices))
- indices = np.asarray(indices, dtype=np.int32)
- if indices.size == 0:
- result = []
- kwargs = {'dtype': self.dtype}
- elif allow_fill:
- result = self._take_with_fill(indices, fill_value=fill_value)
- kwargs = {}
- else:
- result = self._take_without_fill(indices)
- kwargs = {'dtype': self.dtype}
- return type(self)(result, fill_value=self.fill_value, kind=self.kind,
- **kwargs)
- def _take_with_fill(self, indices, fill_value=None):
- if fill_value is None:
- fill_value = self.dtype.na_value
- if indices.min() < -1:
- raise ValueError("Invalid value in 'indices'. Must be between -1 "
- "and the length of the array.")
- if indices.max() >= len(self):
- raise IndexError("out of bounds value in 'indices'.")
- if len(self) == 0:
- # Empty... Allow taking only if all empty
- if (indices == -1).all():
- dtype = np.result_type(self.sp_values, type(fill_value))
- taken = np.empty_like(indices, dtype=dtype)
- taken.fill(fill_value)
- return taken
- else:
- raise IndexError('cannot do a non-empty take from an empty '
- 'axes.')
- sp_indexer = self.sp_index.lookup_array(indices)
- if self.sp_index.npoints == 0:
- # Avoid taking from the empty self.sp_values
- taken = np.full(sp_indexer.shape, fill_value=fill_value,
- dtype=np.result_type(type(fill_value)))
- else:
- taken = self.sp_values.take(sp_indexer)
- # sp_indexer may be -1 for two reasons
- # 1.) we took for an index of -1 (new)
- # 2.) we took a value that was self.fill_value (old)
- new_fill_indices = indices == -1
- old_fill_indices = (sp_indexer == -1) & ~new_fill_indices
- # Fill in two steps.
- # Old fill values
- # New fill values
- # potentially coercing to a new dtype at each stage.
- m0 = sp_indexer[old_fill_indices] < 0
- m1 = sp_indexer[new_fill_indices] < 0
- result_type = taken.dtype
- if m0.any():
- result_type = np.result_type(result_type,
- type(self.fill_value))
- taken = taken.astype(result_type)
- taken[old_fill_indices] = self.fill_value
- if m1.any():
- result_type = np.result_type(result_type, type(fill_value))
- taken = taken.astype(result_type)
- taken[new_fill_indices] = fill_value
- return taken
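- # Illustrative take with allow_fill=True (fill_value defaults to the
- # dtype's na_value): -1 entries become the fill.
- # >>> list(SparseArray([1.0, 2.0]).take([0, -1], allow_fill=True))
- # [1.0, nan]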
- def _take_without_fill(self, indices):
- to_shift = indices < 0
- indices = indices.copy()
- n = len(self)
- if (indices.max() >= n) or (indices.min() < -n):
- if n == 0:
- raise IndexError("cannot do a non-empty take from an "
- "empty axes.")
- else:
- raise IndexError("out of bounds value in 'indices'.")
- if to_shift.any():
- indices[to_shift] += n
- if self.sp_index.npoints == 0:
- # edge case in take...
- # I think just return
- out = np.full(indices.shape, self.fill_value,
- dtype=np.result_type(type(self.fill_value)))
- arr, sp_index, fill_value = make_sparse(out,
- fill_value=self.fill_value)
- return type(self)(arr, sparse_index=sp_index,
- fill_value=fill_value)
- sp_indexer = self.sp_index.lookup_array(indices)
- taken = self.sp_values.take(sp_indexer)
- fillable = (sp_indexer < 0)
- if fillable.any():
- # TODO: may need to coerce array to fill value
- result_type = np.result_type(taken, type(self.fill_value))
- taken = taken.astype(result_type)
- taken[fillable] = self.fill_value
- return taken
- def searchsorted(self, v, side="left", sorter=None):
- msg = "searchsorted requires high memory usage."
- warnings.warn(msg, PerformanceWarning, stacklevel=2)
- v = np.asarray(v)
- return np.asarray(self, dtype=self.dtype.subtype).searchsorted(
- v, side, sorter
- )
- def copy(self, deep=False):
- if deep:
- values = self.sp_values.copy()
- else:
- values = self.sp_values
- return self._simple_new(values, self.sp_index, self.dtype)
- @classmethod
- def _concat_same_type(cls, to_concat):
- fill_values = [x.fill_value for x in to_concat]
- fill_value = fill_values[0]
- # np.nan isn't a singleton, so we may end up with multiple
- # NaNs here, so we ignore the all-NA case too.
- if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
- warnings.warn("Concatenating sparse arrays with multiple fill "
- "values: '{}'. Picking the first and "
- "converting the rest.".format(fill_values),
- PerformanceWarning,
- stacklevel=6)
- keep = to_concat[0]
- to_concat2 = [keep]
- for arr in to_concat[1:]:
- to_concat2.append(cls(np.asarray(arr), fill_value=fill_value))
- to_concat = to_concat2
- values = []
- length = 0
- if to_concat:
- sp_kind = to_concat[0].kind
- else:
- sp_kind = 'integer'
- if sp_kind == 'integer':
- indices = []
- for arr in to_concat:
- idx = arr.sp_index.to_int_index().indices.copy()
- idx += length # TODO: wraparound
- length += arr.sp_index.length
- values.append(arr.sp_values)
- indices.append(idx)
- data = np.concatenate(values)
- indices = np.concatenate(indices)
- sp_index = IntIndex(length, indices)
- else:
- # when concatenating block indices, we don't claim that you'll
- # get an identical index as concatenating the values and then
- # creating a new index. We don't want to spend the time trying
- # to merge blocks across arrays in `to_concat`, so the resulting
- # BlockIndex may have more blocks.
- blengths = []
- blocs = []
- for arr in to_concat:
- idx = arr.sp_index.to_block_index()
- values.append(arr.sp_values)
- blocs.append(idx.blocs.copy() + length)
- blengths.append(idx.blengths)
- length += arr.sp_index.length
- data = np.concatenate(values)
- blocs = np.concatenate(blocs)
- blengths = np.concatenate(blengths)
- sp_index = BlockIndex(length, blocs, blengths)
- return cls(data, sparse_index=sp_index, fill_value=fill_value)
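- # Illustrative concat (assumed inputs): with 'integer' kind the stored
- # indices of later arrays are shifted by the running length.
- # >>> a = SparseArray([0, 1], fill_value=0)
- # >>> b = SparseArray([2, 0], fill_value=0)
- # >>> SparseArray._concat_same_type([a, b]).sp_index.indices
- # array([1, 2], dtype=int32)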
- def astype(self, dtype=None, copy=True):
- """
- Change the dtype of a SparseArray.
- The output will always be a SparseArray. To convert to a dense
- ndarray with a certain dtype, use :meth:`numpy.asarray`.
- Parameters
- ----------
- dtype : np.dtype or ExtensionDtype
- For SparseDtype, this changes the dtype of
- ``self.sp_values`` and the ``self.fill_value``.
- For other dtypes, this only changes the dtype of
- ``self.sp_values``.
- copy : bool, default True
- Whether to ensure a copy is made, even if not necessary.
- Returns
- -------
- SparseArray
- Examples
- --------
- >>> arr = SparseArray([0, 0, 1, 2])
- >>> arr
- [0, 0, 1, 2]
- Fill: 0
- IntIndex
- Indices: array([2, 3], dtype=int32)
- >>> arr.astype(np.dtype('int32'))
- [0, 0, 1, 2]
- Fill: 0
- IntIndex
- Indices: array([2, 3], dtype=int32)
- Using a NumPy dtype with a different kind (e.g. float) will coerce
- just ``self.sp_values``.
- >>> arr.astype(np.dtype('float64'))
- ... # doctest: +NORMALIZE_WHITESPACE
- [0, 0, 1.0, 2.0]
- Fill: 0
- IntIndex
- Indices: array([2, 3], dtype=int32)
- Use a SparseDtype if you wish to change the fill value as well.
- >>> arr.astype(SparseDtype("float64", fill_value=np.nan))
- ... # doctest: +NORMALIZE_WHITESPACE
- [nan, nan, 1.0, 2.0]
- Fill: nan
- IntIndex
- Indices: array([2, 3], dtype=int32)
- """
- dtype = self.dtype.update_dtype(dtype)
- subtype = dtype._subtype_with_str
- sp_values = astype_nansafe(self.sp_values,
- subtype,
- copy=copy)
- if sp_values is self.sp_values and copy:
- sp_values = sp_values.copy()
- return self._simple_new(sp_values,
- self.sp_index,
- dtype)
- def map(self, mapper):
- """
- Map values using an input correspondence (dict, Series, or callable).
- Parameters
- ----------
- mapper : dict, Series, callable
- The correspondence from old values to new.
- Returns
- -------
- SparseArray
- The output array will have the same density as the input.
- The output fill value will be the result of applying the
- mapping to ``self.fill_value``
- Examples
- --------
- >>> arr = pd.SparseArray([0, 1, 2])
- >>> arr.map(lambda x: x + 10)
- [10, 11, 12]
- Fill: 10
- IntIndex
- Indices: array([1, 2], dtype=int32)
- >>> arr.map({0: 10, 1: 11, 2: 12})
- [10, 11, 12]
- Fill: 10
- IntIndex
- Indices: array([1, 2], dtype=int32)
- >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
- [10, 11, 12]
- Fill: 10
- IntIndex
- Indices: array([1, 2], dtype=int32)
- """
- # this is used in apply.
- # We get hit since we're an "is_extension_type" but regular extension
- # types are not hit. This may be worth adding to the interface.
- if isinstance(mapper, ABCSeries):
- mapper = mapper.to_dict()
- if isinstance(mapper, compat.Mapping):
- fill_value = mapper.get(self.fill_value, self.fill_value)
- sp_values = [mapper.get(x, None) for x in self.sp_values]
- else:
- fill_value = mapper(self.fill_value)
- sp_values = [mapper(x) for x in self.sp_values]
- return type(self)(sp_values, sparse_index=self.sp_index,
- fill_value=fill_value)
- def to_dense(self):
- """
- Convert SparseArray to a NumPy array.
- Returns
- -------
- arr : NumPy array
- """
- return np.asarray(self, dtype=self.sp_values.dtype)
- # TODO: Look into deprecating this in favor of `to_dense`.
- get_values = to_dense
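- # Illustrative round trip: to_dense materializes the fill value at every
- # gap position.
- # >>> SparseArray([0, 0, 1], fill_value=0).to_dense()
- # array([0, 0, 1])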
- # ------------------------------------------------------------------------
- # IO
- # ------------------------------------------------------------------------
- def __setstate__(self, state):
- """Necessary for making this object picklable"""
- if isinstance(state, tuple):
- # Compat for pandas < 0.24.0
- nd_state, (fill_value, sp_index) = state
- sparse_values = np.array([])
- sparse_values.__setstate__(nd_state)
- self._sparse_values = sparse_values
- self._sparse_index = sp_index
- self._dtype = SparseDtype(sparse_values.dtype, fill_value)
- else:
- self.__dict__.update(state)
- def nonzero(self):
- if self.fill_value == 0:
- return self.sp_index.to_int_index().indices,
- else:
- return self.sp_index.to_int_index().indices[self.sp_values != 0],
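- # Illustrative (zero fill value): nonzero can read positions straight
- # off the sparse index.
- # >>> SparseArray([0, 2, 0, 3], fill_value=0).nonzero()
- # (array([1, 3], dtype=int32),)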
- # ------------------------------------------------------------------------
- # Reductions
- # ------------------------------------------------------------------------
- def _reduce(self, name, skipna=True, **kwargs):
- method = getattr(self, name, None)
- if method is None:
- raise TypeError("cannot perform {name} with type {dtype}".format(
- name=name, dtype=self.dtype))
- if skipna:
- arr = self
- else:
- arr = self.dropna()
- # we don't support these kwargs.
- # They should only be present when called via pandas, so do it here.
- # instead of in `any` / `all` (which will raise if they're present,
- # thanks to nv.validate
- kwargs.pop('filter_type', None)
- kwargs.pop('numeric_only', None)
- kwargs.pop('op', None)
- return getattr(arr, name)(**kwargs)
- def all(self, axis=None, *args, **kwargs):
- """
- Tests whether all elements evaluate to True
- Returns
- -------
- all : bool
- See Also
- --------
- numpy.all
- """
- nv.validate_all(args, kwargs)
- values = self.sp_values
- if len(values) != len(self) and not np.all(self.fill_value):
- return False
- return values.all()
- def any(self, axis=0, *args, **kwargs):
- """
- Tests whether at least one element evaluates to True
- Returns
- -------
- any : bool
- See Also
- --------
- numpy.any
- """
- nv.validate_any(args, kwargs)
- values = self.sp_values
- if len(values) != len(self) and np.any(self.fill_value):
- return True
- return values.any().item()
- def sum(self, axis=0, *args, **kwargs):
- """
- Sum of non-NA/null values
- Returns
- -------
- sum : float
- """
- nv.validate_sum(args, kwargs)
- valid_vals = self._valid_sp_values
- sp_sum = valid_vals.sum()
- if self._null_fill_value:
- return sp_sum
- else:
- nsparse = self.sp_index.ngaps
- return sp_sum + self.fill_value * nsparse
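- # Illustrative: with a non-null fill value, the gaps contribute
- # fill_value * ngaps to the total.
- # >>> SparseArray([1, 0, 2], fill_value=0).sum()
- # 3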
- def cumsum(self, axis=0, *args, **kwargs):
- """
- Cumulative sum of non-NA/null values.
- When performing the cumulative summation, any NA/null values will
- be skipped. The resulting SparseArray will preserve the locations of
- NaN values, but the fill value will be `np.nan` regardless.
- Parameters
- ----------
- axis : int or None
- Axis over which to perform the cumulative summation. If None,
- perform cumulative summation over flattened array.
- Returns
- -------
- cumsum : SparseArray
- """
- nv.validate_cumsum(args, kwargs)
- if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour.
- raise ValueError("axis(={axis}) out of bounds".format(axis=axis))
- if not self._null_fill_value:
- return SparseArray(self.to_dense()).cumsum()
- return SparseArray(self.sp_values.cumsum(), sparse_index=self.sp_index,
- fill_value=self.fill_value)
- def mean(self, axis=0, *args, **kwargs):
- """
- Mean of non-NA/null values
- Returns
- -------
- mean : float
- """
- nv.validate_mean(args, kwargs)
- valid_vals = self._valid_sp_values
- sp_sum = valid_vals.sum()
- ct = len(valid_vals)
- if self._null_fill_value:
- return sp_sum / ct
- else:
- nsparse = self.sp_index.ngaps
- return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
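- # Illustrative: with a NaN fill value only the stored, valid points
- # enter the mean.
- # >>> SparseArray([1.0, np.nan, 2.0]).mean()
- # 1.5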
- def transpose(self, *axes):
- """
- Returns the SparseArray.
- """
- return self
- @property
- def T(self):
- """
- Returns the SparseArray.
- """
- return self
- # ------------------------------------------------------------------------
- # Ufuncs
- # ------------------------------------------------------------------------
- def __array_wrap__(self, array, context=None):
- from pandas.core.dtypes.generic import ABCSparseSeries
- ufunc, inputs, _ = context
- inputs = tuple(x.values if isinstance(x, ABCSparseSeries) else x
- for x in inputs)
- return self.__array_ufunc__(ufunc, '__call__', *inputs)
- _HANDLED_TYPES = (np.ndarray, numbers.Number)
- def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
- out = kwargs.get('out', ())
- for x in inputs + out:
- if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
- return NotImplemented
- special = {'add', 'sub', 'mul', 'pow', 'mod', 'floordiv', 'truediv',
- 'divmod', 'eq', 'ne', 'lt', 'gt', 'le', 'ge', 'remainder'}
- if compat.PY2:
- special.add('div')
- aliases = {
- 'subtract': 'sub',
- 'multiply': 'mul',
- 'floor_divide': 'floordiv',
- 'true_divide': 'truediv',
- 'power': 'pow',
- 'remainder': 'mod',
- 'divide': 'div',
- 'equal': 'eq',
- 'not_equal': 'ne',
- 'less': 'lt',
- 'less_equal': 'le',
- 'greater': 'gt',
- 'greater_equal': 'ge',
- }
- flipped = {
- 'lt': '__gt__',
- 'le': '__ge__',
- 'gt': '__lt__',
- 'ge': '__le__',
- 'eq': '__eq__',
- 'ne': '__ne__',
- }
- op_name = ufunc.__name__
- op_name = aliases.get(op_name, op_name)
- if op_name in special and kwargs.get('out') is None:
- if isinstance(inputs[0], type(self)):
- return getattr(self, '__{}__'.format(op_name))(inputs[1])
- else:
- name = flipped.get(op_name, '__r{}__'.format(op_name))
- return getattr(self, name)(inputs[0])
- if len(inputs) == 1:
- # No alignment necessary.
- sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
- fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)
- return self._simple_new(sp_values,
- self.sp_index,
- SparseDtype(sp_values.dtype, fill_value))
- result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs],
- **kwargs)
- if out:
- if len(out) == 1:
- out = out[0]
- return out
- if type(result) is tuple:
- return tuple(type(self)(x) for x in result)
- elif method == 'at':
- # no return value
- return None
- else:
- return type(self)(result)
- def __abs__(self):
- return np.abs(self)
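- # Illustrative: a one-input ufunc is applied to sp_values and to the
- # fill value separately, keeping the sparse index intact.
- # >>> np.abs(SparseArray([-1, 0, 1], fill_value=0)).sp_values
- # array([1, 1])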
- # ------------------------------------------------------------------------
- # Ops
- # ------------------------------------------------------------------------
- @classmethod
- def _create_unary_method(cls, op):
- def sparse_unary_method(self):
- fill_value = op(np.array(self.fill_value)).item()
- values = op(self.sp_values)
- dtype = SparseDtype(values.dtype, fill_value)
- return cls._simple_new(values, self.sp_index, dtype)
- name = '__{name}__'.format(name=op.__name__)
- return compat.set_function_name(sparse_unary_method, name, cls)
- @classmethod
- def _create_arithmetic_method(cls, op):
- def sparse_arithmetic_method(self, other):
- op_name = op.__name__
- if isinstance(other, (ABCSeries, ABCIndexClass)):
- # Rely on pandas to dispatch to us.
- return NotImplemented
- if isinstance(other, SparseArray):
- return _sparse_array_op(self, other, op, op_name)
- elif is_scalar(other):
- with np.errstate(all='ignore'):
- fill = op(_get_fill(self), np.asarray(other))
- result = op(self.sp_values, other)
- if op_name == 'divmod':
- left, right = result
- lfill, rfill = fill
- return (_wrap_result(op_name, left, self.sp_index, lfill),
- _wrap_result(op_name, right, self.sp_index, rfill))
- return _wrap_result(op_name, result, self.sp_index, fill)
- else:
- other = np.asarray(other)
- with np.errstate(all='ignore'):
- # TODO: delete sparse stuff in core/ops.py
- # TODO: look into _wrap_result
- if len(self) != len(other):
- raise AssertionError(
- ("length mismatch: {self} vs. {other}".format(
- self=len(self), other=len(other))))
- if not isinstance(other, SparseArray):
- dtype = getattr(other, 'dtype', None)
- other = SparseArray(other, fill_value=self.fill_value,
- dtype=dtype)
- return _sparse_array_op(self, other, op, op_name)
- name = '__{name}__'.format(name=op.__name__)
- return compat.set_function_name(sparse_arithmetic_method, name, cls)
- @classmethod
- def _create_comparison_method(cls, op):
- def cmp_method(self, other):
- op_name = op.__name__
- if op_name in {'and_', 'or_'}:
- op_name = op_name[:-1]
- if isinstance(other, (ABCSeries, ABCIndexClass)):
- # Rely on pandas to unbox and dispatch to us.
- return NotImplemented
- if not is_scalar(other) and not isinstance(other, type(self)):
- # convert list-like to ndarray
- other = np.asarray(other)
- if isinstance(other, np.ndarray):
- # TODO: make this more flexible than just ndarray...
- if len(self) != len(other):
- raise AssertionError("length mismatch: {self} vs. {other}"
- .format(self=len(self),
- other=len(other)))
- other = SparseArray(other, fill_value=self.fill_value)
- if isinstance(other, SparseArray):
- return _sparse_array_op(self, other, op, op_name)
- else:
- with np.errstate(all='ignore'):
- fill_value = op(self.fill_value, other)
- result = op(self.sp_values, other)
- return type(self)(result,
- sparse_index=self.sp_index,
- fill_value=fill_value,
- dtype=np.bool_)
- name = '__{name}__'.format(name=op.__name__)
- return compat.set_function_name(cmp_method, name, cls)
- @classmethod
- def _add_unary_ops(cls):
- cls.__pos__ = cls._create_unary_method(operator.pos)
- cls.__neg__ = cls._create_unary_method(operator.neg)
- cls.__invert__ = cls._create_unary_method(operator.invert)
- @classmethod
- def _add_comparison_ops(cls):
- cls.__and__ = cls._create_comparison_method(operator.and_)
- cls.__or__ = cls._create_comparison_method(operator.or_)
- super(SparseArray, cls)._add_comparison_ops()
- # ----------
- # Formatting
- # -----------
- def __unicode__(self):
- return '{self}\nFill: {fill}\n{index}'.format(
- self=printing.pprint_thing(self),
- fill=printing.pprint_thing(self.fill_value),
- index=printing.pprint_thing(self.sp_index))
- def _formatter(self, boxed=False):
- # Defer to the formatter from the GenericArrayFormatter calling us.
- # This will infer the correct formatter from the dtype of the values.
- return None
- SparseArray._add_arithmetic_ops()
- SparseArray._add_comparison_ops()
- SparseArray._add_unary_ops()
- def _maybe_to_dense(obj):
- """
- try to convert to dense
- """
- if hasattr(obj, 'to_dense'):
- return obj.to_dense()
- return obj
- def _maybe_to_sparse(array):
- """
- array must be SparseSeries or SparseArray
- """
- if isinstance(array, ABCSparseSeries):
- array = array.values.copy()
- return array
- def _sanitize_values(arr):
- """
- return an ndarray for our input,
- in a platform independent manner
- """
- if hasattr(arr, 'values'):
- arr = arr.values
- else:
- # scalar
- if is_scalar(arr):
- arr = [arr]
- # ndarray
- if isinstance(arr, np.ndarray):
- pass
- elif is_list_like(arr) and len(arr) > 0:
- arr = maybe_convert_platform(arr)
- else:
- arr = np.asarray(arr)
- return arr
- def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False):
- """
- Convert ndarray to sparse format
- Parameters
- ----------
- arr : ndarray
- kind : {'block', 'integer'}
- fill_value : NaN or another value
- dtype : np.dtype, optional
- copy : bool, default False
- Returns
- -------
- (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
- """
- arr = _sanitize_values(arr)
- if arr.ndim > 1:
- raise TypeError("expected dimension <= 1 data")
- if fill_value is None:
- fill_value = na_value_for_dtype(arr.dtype)
- if isna(fill_value):
- mask = notna(arr)
- else:
- # For str arrays in NumPy 1.12.0, operator!= below isn't
- # element-wise but just returns False if fill_value is not str,
- # so cast to object comparison to be safe
- if is_string_dtype(arr):
- arr = arr.astype(object)
- if is_object_dtype(arr.dtype):
- # NumPy's element-wise equality check does not distinguish
- # element types, e.g. 0, 0.0, and False are all treated as
- # equal, so we have to check both the type and the value.
- mask = splib.make_mask_object_ndarray(arr, fill_value)
- else:
- mask = arr != fill_value
- length = len(arr)
- if length != len(mask):
- # the arr is a SparseArray
- indices = mask.sp_index.indices
- else:
- indices = mask.nonzero()[0].astype(np.int32)
- index = _make_index(length, indices, kind)
- sparsified_values = arr[mask]
- if dtype is not None:
- sparsified_values = astype_nansafe(sparsified_values, dtype=dtype)
- # TODO: copy
- return sparsified_values, index, fill_value
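- # Illustrative make_sparse call (default 'block' kind): the returned
- # values exclude fill_value positions.
- # >>> vals, idx, fv = make_sparse(np.array([0, 1, 0, 2]), fill_value=0)
- # >>> vals, fv
- # (array([1, 2]), 0)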
- def _make_index(length, indices, kind):
- if kind == 'block' or isinstance(kind, BlockIndex):
- locs, lens = splib.get_blocks(indices)
- index = BlockIndex(length, locs, lens)
- elif kind == 'integer' or isinstance(kind, IntIndex):
- index = IntIndex(length, indices)
- else: # pragma: no cover
- raise ValueError('must be block or integer type')
- return index
- # ----------------------------------------------------------------------------
- # Accessor
- @delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
- 'sp_values'],
- typ='property')
- class SparseAccessor(PandasDelegate):
- """
- Accessor for sparse-specific methods and attributes of Series data.
- """
- def __init__(self, data=None):
- self._validate(data)
- # Store the Series since we need that for to_coo
- self._parent = data
- @staticmethod
- def _validate(data):
- if not isinstance(data.dtype, SparseDtype):
- msg = "Can only use the '.sparse' accessor with Sparse data."
- raise AttributeError(msg)
- def _delegate_property_get(self, name, *args, **kwargs):
- return getattr(self._parent.values, name)
- def _delegate_method(self, name, *args, **kwargs):
- if name == 'from_coo':
- return self.from_coo(*args, **kwargs)
- elif name == 'to_coo':
- return self.to_coo(*args, **kwargs)
- else:
- raise ValueError
- @classmethod
- def from_coo(cls, A, dense_index=False):
- """
- Create a SparseSeries from a scipy.sparse.coo_matrix.
- Parameters
- ----------
- A : scipy.sparse.coo_matrix
- dense_index : bool, default False
- If False (default), the SparseSeries index consists of only the
- coords of the non-null entries of the original coo_matrix.
- If True, the SparseSeries index consists of the full sorted
- (row, col) coordinates of the coo_matrix.
- Returns
- -------
- s : SparseSeries
- Examples
- --------
- >>> from scipy import sparse
- >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
- shape=(3, 4))
- >>> A
- <3x4 sparse matrix of type '<class 'numpy.float64'>'
- with 3 stored elements in COOrdinate format>
- >>> A.todense()
- matrix([[ 0., 0., 1., 2.],
- [ 3., 0., 0., 0.],
- [ 0., 0., 0., 0.]])
- >>> ss = pd.SparseSeries.from_coo(A)
- >>> ss
- 0 2 1
- 3 2
- 1 0 3
- dtype: float64
- BlockIndex
- Block locations: array([0], dtype=int32)
- Block lengths: array([3], dtype=int32)
- """
- from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series
- from pandas import Series
- result = _coo_to_sparse_series(A, dense_index=dense_index)
- # SparseSeries -> Series[sparse]
- result = Series(result.values, index=result.index, copy=False)
- return result
- def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
- """
- Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex.
- Use row_levels and column_levels to determine the row and column
- coordinates respectively. row_levels and column_levels are the names
- (labels) or numbers of the levels. {row_levels, column_levels} must be
- a partition of the MultiIndex level names (or numbers).
- Parameters
- ----------
- row_levels : tuple/list
- column_levels : tuple/list
- sort_labels : bool, default False
- Sort the row and column labels before forming the sparse matrix.
- Returns
- -------
- y : scipy.sparse.coo_matrix
- rows : list (row labels)
- columns : list (column labels)
- Examples
- --------
- >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
- >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0),
- (1, 2, 'a', 1),
- (1, 1, 'b', 0),
- (1, 1, 'b', 1),
- (2, 1, 'b', 0),
- (2, 1, 'b', 1)],
- names=['A', 'B', 'C', 'D'])
- >>> ss = s.to_sparse()
- >>> A, rows, columns = ss.to_coo(row_levels=['A', 'B'],
- column_levels=['C', 'D'],
- sort_labels=True)
- >>> A
- <3x4 sparse matrix of type '<class 'numpy.float64'>'
- with 3 stored elements in COOrdinate format>
- >>> A.todense()
- matrix([[ 0., 0., 1., 3.],
- [ 3., 0., 0., 0.],
- [ 0., 0., 0., 0.]])
- >>> rows
- [(1, 1), (1, 2), (2, 1)]
- >>> columns
- [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
- """
- from pandas.core.sparse.scipy_sparse import _sparse_series_to_coo
- A, rows, columns = _sparse_series_to_coo(self._parent,
- row_levels,
- column_levels,
- sort_labels=sort_labels)
- return A, rows, columns