sparse.py 65 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028
  1. """
  2. SparseArray data structure
  3. """
  4. from __future__ import division
  5. import numbers
  6. import operator
  7. import re
  8. import warnings
  9. import numpy as np
  10. from pandas._libs import index as libindex, lib
  11. import pandas._libs.sparse as splib
  12. from pandas._libs.sparse import BlockIndex, IntIndex
  13. from pandas._libs.tslibs import NaT
  14. import pandas.compat as compat
  15. from pandas.compat.numpy import function as nv
  16. from pandas.errors import PerformanceWarning
  17. from pandas.core.dtypes.base import ExtensionDtype
  18. from pandas.core.dtypes.cast import (
  19. astype_nansafe, construct_1d_arraylike_from_scalar, find_common_type,
  20. infer_dtype_from_scalar, maybe_convert_platform)
  21. from pandas.core.dtypes.common import (
  22. is_array_like, is_bool_dtype, is_datetime64_any_dtype, is_dtype_equal,
  23. is_integer, is_list_like, is_object_dtype, is_scalar, is_string_dtype,
  24. pandas_dtype)
  25. from pandas.core.dtypes.dtypes import register_extension_dtype
  26. from pandas.core.dtypes.generic import (
  27. ABCIndexClass, ABCSeries, ABCSparseSeries)
  28. from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna
  29. from pandas.core.accessor import PandasDelegate, delegate_names
  30. import pandas.core.algorithms as algos
  31. from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
  32. from pandas.core.base import PandasObject
  33. import pandas.core.common as com
  34. from pandas.core.missing import interpolate_2d
  35. import pandas.io.formats.printing as printing
  36. # ----------------------------------------------------------------------------
  37. # Dtype
@register_extension_dtype
class SparseDtype(ExtensionDtype):
    """
    Dtype for data stored in :class:`SparseArray`.

    This dtype implements the pandas ExtensionDtype interface.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
        The dtype of the underlying array storing the non-fill value values.
    fill_value : scalar, optional
        The scalar value not stored in the SparseArray. By default, this
        depends on `dtype`.

        =========== ==========
        dtype       na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        ``False``
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The default value may be overridden by specifying a `fill_value`.
    """
    # We include `_is_na_fill_value` in the metadata to avoid hash collisions
    # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
    # Without is_na_fill_value in the comparison, those would be equal since
    # hash(nan) is (sometimes?) 0.
    _metadata = ('_dtype', '_fill_value', '_is_na_fill_value')

    def __init__(self, dtype=np.float64, fill_value=None):
        # type: (Union[str, np.dtype, 'ExtensionDtype', type], Any) -> None
        from pandas.core.dtypes.missing import na_value_for_dtype
        from pandas.core.dtypes.common import (
            pandas_dtype, is_string_dtype, is_scalar
        )

        if isinstance(dtype, type(self)):
            # Unwrap an existing SparseDtype, inheriting its fill_value
            # unless one was given explicitly.
            if fill_value is None:
                fill_value = dtype.fill_value

            dtype = dtype.subtype

        dtype = pandas_dtype(dtype)
        if is_string_dtype(dtype):
            # String data is stored in object-dtype arrays.
            dtype = np.dtype('object')

        if fill_value is None:
            fill_value = na_value_for_dtype(dtype)

        if not is_scalar(fill_value):
            raise ValueError("fill_value must be a scalar. Got {} "
                             "instead".format(fill_value))
        self._dtype = dtype
        self._fill_value = fill_value

    def __hash__(self):
        # Python3 doesn't inherit __hash__ when a base class overrides
        # __eq__, so we explicitly do it here.
        return super(SparseDtype, self).__hash__()

    def __eq__(self, other):
        # We have to override __eq__ to handle NA values in _metadata.
        # The base class does simple == checks, which fail for NA.
        if isinstance(other, compat.string_types):
            try:
                other = self.construct_from_string(other)
            except TypeError:
                return False

        if isinstance(other, type(self)):
            subtype = self.subtype == other.subtype
            if self._is_na_fill_value:
                # this case is complicated by two things:
                # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
                # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT)
                # i.e. we want to treat any floating-point NaN as equal, but
                # not a floating-point NaN and a datetime NaT.
                # NOTE(review): `and` binds tighter than `or`, so this reads
                # as ``(other._is_na_fill_value and isinstance(...)) or
                # isinstance(...)`` — confirm the trailing isinstance branch
                # is intended to apply when `other` has a non-NA fill value.
                fill_value = (
                    other._is_na_fill_value and
                    isinstance(self.fill_value, type(other.fill_value)) or
                    isinstance(other.fill_value, type(self.fill_value))
                )
            else:
                fill_value = self.fill_value == other.fill_value

            return subtype and fill_value
        return False

    @property
    def fill_value(self):
        """
        The fill value of the array.

        Converting the SparseArray to a dense ndarray will fill the
        array with this value.

        .. warning::

           It's possible to end up with a SparseArray that has ``fill_value``
           values in ``sp_values``. This can occur, for example, when setting
           ``SparseArray.fill_value`` directly.
        """
        return self._fill_value

    @property
    def _is_na_fill_value(self):
        # Whether the fill value is an NA marker (NaN / NaT / None).
        from pandas.core.dtypes.missing import isna
        return isna(self.fill_value)

    @property
    def _is_numeric(self):
        # Anything that is not object-dtype counts as numeric here.
        from pandas.core.dtypes.common import is_object_dtype
        return not is_object_dtype(self.subtype)

    @property
    def _is_boolean(self):
        from pandas.core.dtypes.common import is_bool_dtype
        return is_bool_dtype(self.subtype)

    @property
    def kind(self):
        """
        The numpy dtype kind character of the subtype (e.g. ``'f'``, ``'i'``).
        """
        return self.subtype.kind

    @property
    def type(self):
        # Scalar type of the subtype (ExtensionDtype interface).
        return self.subtype.type

    @property
    def subtype(self):
        # The numpy dtype of the non-fill values.
        return self._dtype

    @property
    def name(self):
        # e.g. 'Sparse[float64, nan]'
        return 'Sparse[{}, {}]'.format(self.subtype.name, self.fill_value)

    def __repr__(self):
        return self.name

    @classmethod
    def construct_array_type(cls):
        # The array type associated with this dtype.
        return SparseArray

    @classmethod
    def construct_from_string(cls, string):
        """
        Construct a SparseDtype from a string form.

        Parameters
        ----------
        string : str
            Can take the following forms.

            string           dtype
            ================ ============================
            'int'            SparseDtype[np.int64, 0]
            'Sparse'         SparseDtype[np.float64, nan]
            'Sparse[int]'    SparseDtype[np.int64, 0]
            'Sparse[int, 0]' SparseDtype[np.int64, 0]
            ================ ============================

            It is not possible to specify non-default fill values
            with a string. An argument like ``'Sparse[int, 1]'``
            will raise a ``TypeError`` because the default fill value
            for integers is 0.

        Returns
        -------
        SparseDtype
        """
        msg = "Could not construct SparseDtype from '{}'".format(string)
        if string.startswith("Sparse"):
            try:
                sub_type, has_fill_value = cls._parse_subtype(string)
                result = SparseDtype(sub_type)
            except Exception:
                raise TypeError(msg)
            else:
                msg = ("Could not construct SparseDtype from '{}'.\n\nIt "
                       "looks like the fill_value in the string is not "
                       "the default for the dtype. Non-default fill_values "
                       "are not supported. Use the 'SparseDtype()' "
                       "constructor instead.")
                # A fill value appeared in the string; if the parsed dtype
                # does not round-trip to the same string, the fill value
                # must have been non-default, which is unsupported.
                if has_fill_value and str(result) != string:
                    raise TypeError(msg.format(string))
                return result
        else:
            raise TypeError(msg)

    @staticmethod
    def _parse_subtype(dtype):
        """
        Parse a string to get the subtype

        Parameters
        ----------
        dtype : str
            A string like

            * Sparse[subtype]
            * Sparse[subtype, fill_value]

        Returns
        -------
        subtype : str

        Raises
        ------
        ValueError
            When the subtype cannot be extracted.
        """
        xpr = re.compile(
            r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$"
        )
        m = xpr.match(dtype)
        has_fill_value = False
        if m:
            subtype = m.groupdict()['subtype']
            # NOTE: this is the matched fill-value string (truthy flag),
            # not a bool; callers only test its truthiness.
            has_fill_value = m.groupdict()['fill_value'] or has_fill_value
        elif dtype == "Sparse":
            subtype = 'float64'
        else:
            raise ValueError("Cannot parse {}".format(dtype))
        return subtype, has_fill_value

    @classmethod
    def is_dtype(cls, dtype):
        # Whether `dtype` (or its ``.dtype`` attribute) can be treated as
        # a sparse dtype; strings like 'Sparse[int]' are parsed first.
        dtype = getattr(dtype, 'dtype', dtype)
        if (isinstance(dtype, compat.string_types) and
                dtype.startswith("Sparse")):
            sub_type, _ = cls._parse_subtype(dtype)
            dtype = np.dtype(sub_type)
        elif isinstance(dtype, cls):
            return True
        return isinstance(dtype, np.dtype) or dtype == 'Sparse'

    def update_dtype(self, dtype):
        """
        Convert the SparseDtype to a new dtype.

        This takes care of converting the ``fill_value``.

        Parameters
        ----------
        dtype : Union[str, numpy.dtype, SparseDtype]
            The new dtype to use.

            * For a SparseDtype, it is simply returned
            * For a NumPy dtype (or str), the current fill value
              is converted to the new dtype, and a SparseDtype
              with `dtype` and the new fill value is returned.

        Returns
        -------
        SparseDtype
            A new SparseDtype with the correct `dtype` and fill value
            for that `dtype`.

        Raises
        ------
        ValueError
            When the current fill value cannot be converted to the
            new `dtype` (e.g. trying to convert ``np.nan`` to an
            integer dtype).

        Examples
        --------
        >>> SparseDtype(int, 0).update_dtype(float)
        Sparse[float64, 0.0]

        >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
        Sparse[float64, nan]
        """
        cls = type(self)
        dtype = pandas_dtype(dtype)

        if not isinstance(dtype, cls):
            fill_value = astype_nansafe(np.array(self.fill_value),
                                        dtype).item()
            dtype = cls(dtype, fill_value=fill_value)

        return dtype

    @property
    def _subtype_with_str(self):
        """
        Whether the SparseDtype's subtype should be considered ``str``.

        Typically, pandas will store string data in an object-dtype array.
        When converting values to a dtype, e.g. in ``.astype``, we need to
        be more specific, we need the actual underlying type.

        Returns
        -------
        >>> SparseDtype(int, 1)._subtype_with_str
        dtype('int64')

        >>> SparseDtype(object, 1)._subtype_with_str
        dtype('O')

        >>> dtype = SparseDtype(str, '')
        >>> dtype.subtype
        dtype('O')

        >>> dtype._subtype_with_str
        str
        """
        if isinstance(self.fill_value, compat.string_types):
            return type(self.fill_value)
        return self.subtype
  301. # ----------------------------------------------------------------------------
  302. # Array
  303. _sparray_doc_kwargs = dict(klass='SparseArray')
  304. def _get_fill(arr):
  305. # type: (SparseArray) -> ndarray
  306. """
  307. Create a 0-dim ndarray containing the fill value
  308. Parameters
  309. ----------
  310. arr : SparseArray
  311. Returns
  312. -------
  313. fill_value : ndarray
  314. 0-dim ndarray with just the fill value.
  315. Notes
  316. -----
  317. coerce fill_value to arr dtype if possible
  318. int64 SparseArray can have NaN as fill_value if there is no missing
  319. """
  320. try:
  321. return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
  322. except ValueError:
  323. return np.asarray(arr.fill_value)
def _sparse_array_op(left, right, op, name):
    """
    Perform a binary operation between two arrays.

    Parameters
    ----------
    left : Union[SparseArray, ndarray]
    right : Union[SparseArray, ndarray]
    op : Callable
        The binary operation to perform
    name str
        Name of the callable.

    Returns
    -------
    SparseArray
    """
    # type: (SparseArray, SparseArray, Callable, str) -> Any
    if name.startswith('__'):
        # For lookups in _libs.sparse we need non-dunder op name
        name = name[2:-2]

    # dtype used to find corresponding sparse method
    ltype = left.dtype.subtype
    rtype = right.dtype.subtype

    if not is_dtype_equal(ltype, rtype):
        # Cast both operands to a common subtype so the dtype-specific
        # sparse kernels below can be used.
        subtype = find_common_type([ltype, rtype])
        ltype = SparseDtype(subtype, left.fill_value)
        rtype = SparseDtype(subtype, right.fill_value)

        # TODO(GH-23092): pass copy=False. Need to fix astype_nansafe
        left = left.astype(ltype)
        right = right.astype(rtype)
        dtype = ltype.subtype
    else:
        dtype = ltype

    # dtype the result must have
    result_dtype = None

    if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
        # At least one operand is fully dense (no gaps): operate on the
        # dense values directly and keep the gap-free side's index.
        with np.errstate(all='ignore'):
            result = op(left.get_values(), right.get_values())
            fill = op(_get_fill(left), _get_fill(right))

        if left.sp_index.ngaps == 0:
            index = left.sp_index
        else:
            index = right.sp_index
    elif left.sp_index.equals(right.sp_index):
        # Identical sparsity structure: operate on sp_values pairwise.
        with np.errstate(all='ignore'):
            result = op(left.sp_values, right.sp_values)
            fill = op(_get_fill(left), _get_fill(right))
        index = left.sp_index
    else:
        # Mismatched indexes: dispatch to a dtype-specific kernel in
        # pandas._libs.sparse.
        if name[0] == 'r':
            # Reflected op: swap operands and use the forward kernel.
            left, right = right, left
            name = name[1:]

        if name in ('and', 'or') and dtype == 'bool':
            opname = 'sparse_{name}_uint8'.format(name=name)
            # to make template simple, cast here
            left_sp_values = left.sp_values.view(np.uint8)
            right_sp_values = right.sp_values.view(np.uint8)
            result_dtype = np.bool
        else:
            opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype)
            left_sp_values = left.sp_values
            right_sp_values = right.sp_values

        sparse_op = getattr(splib, opname)
        with np.errstate(all='ignore'):
            result, index, fill = sparse_op(
                left_sp_values, left.sp_index, left.fill_value,
                right_sp_values, right.sp_index, right.fill_value)

    if result_dtype is None:
        result_dtype = result.dtype

    return _wrap_result(name, result, index, fill, dtype=result_dtype)
  393. def _wrap_result(name, data, sparse_index, fill_value, dtype=None):
  394. """
  395. wrap op result to have correct dtype
  396. """
  397. if name.startswith('__'):
  398. # e.g. __eq__ --> eq
  399. name = name[2:-2]
  400. if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
  401. dtype = np.bool
  402. fill_value = lib.item_from_zerodim(fill_value)
  403. if is_bool_dtype(dtype):
  404. # fill_value may be np.bool_
  405. fill_value = bool(fill_value)
  406. return SparseArray(data,
  407. sparse_index=sparse_index,
  408. fill_value=fill_value,
  409. dtype=dtype)
  410. class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin):
  411. """
  412. An ExtensionArray for storing sparse data.
  413. .. versionchanged:: 0.24.0
  414. Implements the ExtensionArray interface.
  415. Parameters
  416. ----------
  417. data : array-like
  418. A dense array of values to store in the SparseArray. This may contain
  419. `fill_value`.
  420. sparse_index : SparseIndex, optional
  421. index : Index
  422. fill_value : scalar, optional
  423. Elements in `data` that are `fill_value` are not stored in the
  424. SparseArray. For memory savings, this should be the most common value
  425. in `data`. By default, `fill_value` depends on the dtype of `data`:
  426. =========== ==========
  427. data.dtype na_value
  428. =========== ==========
  429. float ``np.nan``
  430. int ``0``
  431. bool False
  432. datetime64 ``pd.NaT``
  433. timedelta64 ``pd.NaT``
  434. =========== ==========
  435. The fill value is potentiall specified in three ways. In order of
  436. precedence, these are
  437. 1. The `fill_value` argument
  438. 2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
  439. a ``SparseDtype``
  440. 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
  441. is not a ``SparseDtype`` and `data` is a ``SparseArray``.
  442. kind : {'integer', 'block'}, default 'integer'
  443. The type of storage for sparse locations.
  444. * 'block': Stores a `block` and `block_length` for each
  445. contiguous *span* of sparse values. This is best when
  446. sparse data tends to be clumped together, with large
  447. regsions of ``fill-value`` values between sparse values.
  448. * 'integer': uses an integer to store the location of
  449. each sparse value.
  450. dtype : np.dtype or SparseDtype, optional
  451. The dtype to use for the SparseArray. For numpy dtypes, this
  452. determines the dtype of ``self.sp_values``. For SparseDtype,
  453. this determines ``self.sp_values`` and ``self.fill_value``.
  454. copy : bool, default False
  455. Whether to explicitly copy the incoming `data` array.
  456. """
  457. __array_priority__ = 15
  458. _pandas_ftype = 'sparse'
  459. _subtyp = 'sparse_array' # register ABCSparseArray
    def __init__(self, data, sparse_index=None, index=None, fill_value=None,
                 kind='integer', dtype=None, copy=False):
        # Build a SparseArray from any of: scalar + index, another sparse
        # container, or a dense array-like. See the class docstring for the
        # fill_value / dtype precedence rules.
        from pandas.core.internals import SingleBlockManager

        if isinstance(data, SingleBlockManager):
            data = data.internal_values()

        if fill_value is None and isinstance(dtype, SparseDtype):
            fill_value = dtype.fill_value

        if isinstance(data, (type(self), ABCSparseSeries)):
            # disable normal inference on dtype, sparse_index, & fill_value
            if sparse_index is None:
                sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value
            if dtype is None:
                dtype = data.dtype
            # TODO: make kind=None, and use data.kind?
            data = data.sp_values

        # Handle use-provided dtype
        if isinstance(dtype, compat.string_types):
            # Two options: dtype='int', regular numpy dtype
            # or dtype='Sparse[int]', a sparse dtype
            try:
                dtype = SparseDtype.construct_from_string(dtype)
            except TypeError:
                dtype = pandas_dtype(dtype)

        if isinstance(dtype, SparseDtype):
            # From here on, work with the plain numpy subtype.
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        if index is not None and not is_scalar(data):
            raise Exception("must only pass scalars with an index ")

        if is_scalar(data):
            # Broadcast the scalar to the length implied by index /
            # sparse_index (defaulting to length 1).
            if index is not None:
                if data is None:
                    data = np.nan

            if index is not None:
                npoints = len(index)
            elif sparse_index is None:
                npoints = 1
            else:
                npoints = sparse_index.length

            dtype = infer_dtype_from_scalar(data)[0]
            data = construct_1d_arraylike_from_scalar(
                data, npoints, dtype
            )

        if dtype is not None:
            dtype = pandas_dtype(dtype)

        # TODO: disentangle the fill_value dtype inference from
        # dtype inference
        if data is None:
            # XXX: What should the empty dtype be? Object or float?
            data = np.array([], dtype=dtype)

        if not is_array_like(data):
            try:
                # probably shared code in sanitize_series
                from pandas.core.internals.construction import sanitize_array
                data = sanitize_array(data, index=None)
            except ValueError:
                # NumPy may raise a ValueError on data like [1, []]
                # we retry with object dtype here.
                if dtype is None:
                    dtype = object
                    data = np.atleast_1d(np.asarray(data, dtype=dtype))
                else:
                    raise

        if copy:
            # TODO: avoid double copy when dtype forces cast.
            data = data.copy()

        if fill_value is None:
            # Still no fill value: derive the NA value for the (possibly
            # inferred) dtype.
            fill_value_dtype = data.dtype if dtype is None else dtype
            if fill_value_dtype is None:
                fill_value = np.nan
            else:
                fill_value = na_value_for_dtype(fill_value_dtype)

        if isinstance(data, type(self)) and sparse_index is None:
            # Already sparse: reuse its index, just recast the values.
            sparse_index = data._sparse_index
            sparse_values = np.asarray(data.sp_values, dtype=dtype)
        elif sparse_index is None:
            # Dense input: compute the sparse representation.
            sparse_values, sparse_index, fill_value = make_sparse(
                data, kind=kind, fill_value=fill_value, dtype=dtype
            )
        else:
            # Caller supplied the sparse index; data must already be the
            # sp_values and match its length.
            sparse_values = np.asarray(data, dtype=dtype)
            if len(sparse_values) != sparse_index.npoints:
                raise AssertionError("Non array-like type {type} must "
                                     "have the same length as the index"
                                     .format(type=type(sparse_values)))
        self._sparse_index = sparse_index
        self._sparse_values = sparse_values
        self._dtype = SparseDtype(sparse_values.dtype, fill_value)
  550. @classmethod
  551. def _simple_new(cls, sparse_array, sparse_index, dtype):
  552. # type: (np.ndarray, SparseIndex, SparseDtype) -> 'SparseArray'
  553. new = cls([])
  554. new._sparse_index = sparse_index
  555. new._sparse_values = sparse_array
  556. new._dtype = dtype
  557. return new
    def __array__(self, dtype=None, copy=True):
        # NumPy array interface: materialize to a dense ndarray.
        fill_value = self.fill_value

        if self.sp_index.ngaps == 0:
            # Compat for na dtype and int values.
            return self.sp_values
        if dtype is None:
            # Can NumPy represent this type?
            # If not, `np.result_type` will raise. We catch that
            # and return object.
            if is_datetime64_any_dtype(self.sp_values.dtype):
                # However, we *do* special-case the common case of
                # a datetime64 with pandas NaT.
                if fill_value is NaT:
                    # Can't put pd.NaT in a datetime64[ns]
                    fill_value = np.datetime64('NaT')
            try:
                dtype = np.result_type(self.sp_values.dtype, type(fill_value))
            except TypeError:
                dtype = object

        # Start from an all-fill array, then scatter the stored values
        # into their integer positions.
        out = np.full(self.shape, fill_value, dtype=dtype)
        out[self.sp_index.to_int_index().indices] = self.sp_values
        return out
  580. def __setitem__(self, key, value):
  581. # I suppose we could allow setting of non-fill_value elements.
  582. # TODO(SparseArray.__setitem__): remove special cases in
  583. # ExtensionBlock.where
  584. msg = "SparseArray does not support item assignment via setitem"
  585. raise TypeError(msg)
  586. @classmethod
  587. def _from_sequence(cls, scalars, dtype=None, copy=False):
  588. return cls(scalars, dtype=dtype)
  589. @classmethod
  590. def _from_factorized(cls, values, original):
  591. return cls(values, dtype=original.dtype)
  592. # ------------------------------------------------------------------------
  593. # Data
  594. # ------------------------------------------------------------------------
  595. @property
  596. def sp_index(self):
  597. """
  598. The SparseIndex containing the location of non- ``fill_value`` points.
  599. """
  600. return self._sparse_index
  601. @property
  602. def sp_values(self):
  603. """
  604. An ndarray containing the non- ``fill_value`` values.
  605. Examples
  606. --------
  607. >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
  608. >>> s.sp_values
  609. array([1, 2])
  610. """
  611. return self._sparse_values
  612. @property
  613. def dtype(self):
  614. return self._dtype
  615. @property
  616. def fill_value(self):
  617. """
  618. Elements in `data` that are `fill_value` are not stored.
  619. For memory savings, this should be the most common value in the array.
  620. """
  621. return self.dtype.fill_value
  622. @fill_value.setter
  623. def fill_value(self, value):
  624. self._dtype = SparseDtype(self.dtype.subtype, value)
  625. @property
  626. def kind(self):
  627. """
  628. The kind of sparse index for this array. One of {'integer', 'block'}.
  629. """
  630. if isinstance(self.sp_index, IntIndex):
  631. return 'integer'
  632. else:
  633. return 'block'
  634. @property
  635. def _valid_sp_values(self):
  636. sp_vals = self.sp_values
  637. mask = notna(sp_vals)
  638. return sp_vals[mask]
  639. def __len__(self):
  640. return self.sp_index.length
  641. @property
  642. def _null_fill_value(self):
  643. return self._dtype._is_na_fill_value
  644. def _fill_value_matches(self, fill_value):
  645. if self._null_fill_value:
  646. return isna(fill_value)
  647. else:
  648. return self.fill_value == fill_value
  649. @property
  650. def nbytes(self):
  651. return self.sp_values.nbytes + self.sp_index.nbytes
  652. @property
  653. def density(self):
  654. """
  655. The percent of non- ``fill_value`` points, as decimal.
  656. Examples
  657. --------
  658. >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
  659. >>> s.density
  660. 0.6
  661. """
  662. r = float(self.sp_index.npoints) / float(self.sp_index.length)
  663. return r
    @property
    def npoints(self):
        """
        The number of non- ``fill_value`` points.

        Examples
        --------
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.npoints
        3
        """
        return self.sp_index.npoints
    @property
    def values(self):
        """
        Dense values (a materialized NumPy array; see ``to_dense``).
        """
        return self.to_dense()
  681. def isna(self):
  682. from pandas import isna
  683. # If null fill value, we want SparseDtype[bool, true]
  684. # to preserve the same memory usage.
  685. dtype = SparseDtype(bool, self._null_fill_value)
  686. return type(self)._simple_new(isna(self.sp_values),
  687. self.sp_index, dtype)
    def fillna(self, value=None, method=None, limit=None):
        """
        Fill missing values with `value`.

        Parameters
        ----------
        value : scalar, optional
        method : str, optional

            .. warning::

               Using 'method' will result in high memory use,
               as all `fill_value` methods will be converted to
               an in-memory ndarray

        limit : int, optional

        Returns
        -------
        SparseArray

        Raises
        ------
        ValueError
            If neither or both of `value` and `method` are given.

        Notes
        -----
        When `value` is specified, the result's ``fill_value`` depends on
        ``self.fill_value``. The goal is to maintain low-memory use.

        If ``self.fill_value`` is NA, the result dtype will be
        ``SparseDtype(self.dtype, fill_value=value)``. This will preserve
        amount of memory used before and after filling.

        When ``self.fill_value`` is not NA, the result dtype will be
        ``self.dtype``. Again, this preserves the amount of memory used.
        """
        # Exactly one of `value` / `method` must be provided.
        if ((method is None and value is None) or
                (method is not None and value is not None)):
            raise ValueError("Must specify one of 'method' or 'value'.")

        elif method is not None:
            # Interpolation requires the dense values; warn about the cost.
            msg = "fillna with 'method' requires high memory usage."
            warnings.warn(msg, PerformanceWarning)
            filled = interpolate_2d(np.asarray(self), method=method,
                                    limit=limit)
            return type(self)(filled, fill_value=self.fill_value)

        else:
            # Only the stored values can hold NA; replace them directly.
            new_values = np.where(isna(self.sp_values), value, self.sp_values)

            if self._null_fill_value:
                # This is essentially just updating the dtype.
                new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
            else:
                new_dtype = self.dtype

            return self._simple_new(new_values, self._sparse_index, new_dtype)
    def shift(self, periods=1, fill_value=None):
        """
        Shift values by `periods` positions, filling vacated slots.

        Parameters
        ----------
        periods : int, default 1
            Number of positions to shift; may be negative.
        fill_value : scalar, optional
            Value for the vacated positions. When NA/omitted, the dtype's
            ``na_value`` is used.

        Returns
        -------
        SparseArray
        """
        if not len(self) or periods == 0:
            return self.copy()

        if isna(fill_value):
            fill_value = self.dtype.na_value

        # Coerce up front if the fill value does not fit the current subtype.
        subtype = np.result_type(fill_value, self.dtype.subtype)

        if subtype != self.dtype.subtype:
            # just coerce up front
            arr = self.astype(SparseDtype(subtype, self.fill_value))
        else:
            arr = self

        # A run of `fill_value` as long as the shift (capped at len(self)).
        empty = self._from_sequence(
            [fill_value] * min(abs(periods), len(self)),
            dtype=arr.dtype
        )

        if periods > 0:
            a = empty
            b = arr[:-periods]
        else:
            a = arr[abs(periods):]
            b = empty
        return arr._concat_same_type([a, b])
    def _first_fill_value_loc(self):
        """
        Get the location of the first missing value.

        Returns
        -------
        int
            Position of the first ``fill_value`` slot, or -1 when the array
            is empty or fully dense.
        """
        if len(self) == 0 or self.sp_index.npoints == len(self):
            return -1

        indices = self.sp_index.to_int_index().indices
        if not len(indices) or indices[0] > 0:
            # The very first element is a gap.
            return 0

        # Otherwise the first gap sits just after the first jump (> 1)
        # between consecutive stored positions.
        diff = indices[1:] - indices[:-1]
        return np.searchsorted(diff, 2) + 1
  766. def unique(self):
  767. uniques = list(algos.unique(self.sp_values))
  768. fill_loc = self._first_fill_value_loc()
  769. if fill_loc >= 0:
  770. uniques.insert(fill_loc, self.fill_value)
  771. return type(self)._from_sequence(uniques, dtype=self.dtype)
    def _values_for_factorize(self):
        # Still override this for hash_pandas_object: densify and pair the
        # values with the fill value as the NA sentinel.
        return np.asarray(self), self.fill_value
  775. def factorize(self, na_sentinel=-1):
  776. # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
  777. # The sparsity on this is backwards from what Sparse would want. Want
  778. # ExtensionArray.factorize -> Tuple[EA, EA]
  779. # Given that we have to return a dense array of labels, why bother
  780. # implementing an efficient factorize?
  781. labels, uniques = algos.factorize(np.asarray(self),
  782. na_sentinel=na_sentinel)
  783. uniques = SparseArray(uniques, dtype=self.dtype)
  784. return labels, uniques
    def value_counts(self, dropna=True):
        """
        Returns a Series containing counts of unique values.

        Parameters
        ----------
        dropna : boolean, default True
            Don't include counts of NaN, even if NaN is in sp_values.

        Returns
        -------
        counts : Series
        """
        from pandas import Index, Series

        keys, counts = algos._value_counts_arraylike(self.sp_values,
                                                     dropna=dropna)
        fcounts = self.sp_index.ngaps
        if fcounts > 0:
            if self._null_fill_value and dropna:
                # Gaps are NA and NA is being dropped: nothing to add.
                pass
            else:
                if self._null_fill_value:
                    mask = isna(keys)
                else:
                    mask = keys == self.fill_value

                if mask.any():
                    # fill_value already appears among the stored values;
                    # bump its count by the number of gaps.
                    counts[mask] += fcounts
                else:
                    keys = np.insert(keys, 0, self.fill_value)
                    counts = np.insert(counts, 0, fcounts)

        if not isinstance(keys, ABCIndexClass):
            keys = Index(keys)
        result = Series(counts, index=keys)
        return result
  817. # --------
  818. # Indexing
  819. # --------
    def __getitem__(self, key):
        # Unwrap 1-tuples so arr[(i,)] behaves like arr[i].
        if isinstance(key, tuple):
            if len(key) > 1:
                raise IndexError("too many indices for array.")
            key = key[0]

        if is_integer(key):
            return self._get_val_at(key)
        elif isinstance(key, tuple):
            # NOTE(review): only reachable when the unwrapped element was
            # itself a tuple — confirm this densifying path is intended.
            data_slice = self.values[key]
        elif isinstance(key, slice):
            # special case to preserve dtypes
            if key == slice(None):
                return self.copy()
            # TODO: this logic is surely elsewhere
            # TODO: this could be more efficient
            indices = np.arange(len(self), dtype=np.int32)[key]
            return self.take(indices)
        else:
            # TODO: I think we can avoid densifying when masking a
            # boolean SparseArray with another. Need to look at the
            # key's fill_value for True / False, and then do an intersection
            # on the indices of the sp_values.
            if isinstance(key, SparseArray):
                if is_bool_dtype(key):
                    key = key.to_dense()
                else:
                    key = np.asarray(key)

            if com.is_bool_indexer(key) and len(self) == len(key):
                return self.take(np.arange(len(key), dtype=np.int32)[key])
            elif hasattr(key, '__len__'):
                return self.take(key)
            else:
                raise ValueError("Cannot slice with '{}'".format(key))

        return type(self)(data_slice, kind=self.kind)
  854. def _get_val_at(self, loc):
  855. n = len(self)
  856. if loc < 0:
  857. loc += n
  858. if loc >= n or loc < 0:
  859. raise IndexError('Out of bounds access')
  860. sp_loc = self.sp_index.lookup(loc)
  861. if sp_loc == -1:
  862. return self.fill_value
  863. else:
  864. return libindex.get_value_at(self.sp_values, sp_loc)
    def take(self, indices, allow_fill=False, fill_value=None):
        """
        Take elements at `indices`.

        Parameters
        ----------
        indices : array-like of int
        allow_fill : bool, default False
            When True, -1 in `indices` means "insert `fill_value`"; when
            False, negative indices count from the end of the array.
        fill_value : scalar, optional

        Returns
        -------
        SparseArray
        """
        if is_scalar(indices):
            raise ValueError("'indices' must be an array, not a "
                             "scalar '{}'.".format(indices))
        indices = np.asarray(indices, dtype=np.int32)

        if indices.size == 0:
            result = []
            kwargs = {'dtype': self.dtype}
        elif allow_fill:
            # _take_with_fill may widen the dtype; let the constructor infer.
            result = self._take_with_fill(indices, fill_value=fill_value)
            kwargs = {}
        else:
            result = self._take_without_fill(indices)
            kwargs = {'dtype': self.dtype}

        return type(self)(result, fill_value=self.fill_value, kind=self.kind,
                          **kwargs)
    def _take_with_fill(self, indices, fill_value=None):
        # Take where -1 means "insert `fill_value`". Returns a dense ndarray,
        # possibly widened to hold the fill value(s).
        if fill_value is None:
            fill_value = self.dtype.na_value

        if indices.min() < -1:
            raise ValueError("Invalid value in 'indices'. Must be between -1 "
                             "and the length of the array.")

        if indices.max() >= len(self):
            raise IndexError("out of bounds value in 'indices'.")

        if len(self) == 0:
            # Empty... Allow taking only if all empty
            if (indices == -1).all():
                dtype = np.result_type(self.sp_values, type(fill_value))
                taken = np.empty_like(indices, dtype=dtype)
                taken.fill(fill_value)
                return taken
            else:
                raise IndexError('cannot do a non-empty take from an empty '
                                 'axes.')

        sp_indexer = self.sp_index.lookup_array(indices)

        if self.sp_index.npoints == 0:
            # Avoid taking from the empty self.sp_values
            taken = np.full(sp_indexer.shape, fill_value=fill_value,
                            dtype=np.result_type(type(fill_value)))
        else:
            taken = self.sp_values.take(sp_indexer)

            # sp_indexer may be -1 for two reasons
            # 1.) we took for an index of -1 (new)
            # 2.) we took a value that was self.fill_value (old)
            new_fill_indices = indices == -1
            old_fill_indices = (sp_indexer == -1) & ~new_fill_indices

            # Fill in two steps.
            # Old fill values
            # New fill values
            # potentially coercing to a new dtype at each stage.

            m0 = sp_indexer[old_fill_indices] < 0
            m1 = sp_indexer[new_fill_indices] < 0

            result_type = taken.dtype

            if m0.any():
                # Widen if needed to hold self.fill_value, then patch gaps.
                result_type = np.result_type(result_type,
                                             type(self.fill_value))
                taken = taken.astype(result_type)
                taken[old_fill_indices] = self.fill_value

            if m1.any():
                # Widen if needed to hold the requested fill_value.
                result_type = np.result_type(result_type, type(fill_value))
                taken = taken.astype(result_type)
                taken[new_fill_indices] = fill_value

        return taken
    def _take_without_fill(self, indices):
        # Plain take: negative indices count from the end (ndarray-style).
        to_shift = indices < 0
        indices = indices.copy()

        n = len(self)

        if (indices.max() >= n) or (indices.min() < -n):
            if n == 0:
                raise IndexError("cannot do a non-empty take from an "
                                 "empty axes.")
            else:
                raise IndexError("out of bounds value in 'indices'.")

        if to_shift.any():
            # Normalize negative indices to their positive equivalents.
            indices[to_shift] += n

        if self.sp_index.npoints == 0:
            # edge case in take...
            # I think just return
            out = np.full(indices.shape, self.fill_value,
                          dtype=np.result_type(type(self.fill_value)))
            arr, sp_index, fill_value = make_sparse(out,
                                                    fill_value=self.fill_value)
            return type(self)(arr, sparse_index=sp_index,
                              fill_value=fill_value)

        sp_indexer = self.sp_index.lookup_array(indices)
        taken = self.sp_values.take(sp_indexer)
        fillable = (sp_indexer < 0)

        if fillable.any():
            # TODO: may need to coerce array to fill value
            result_type = np.result_type(taken, type(self.fill_value))
            taken = taken.astype(result_type)
            taken[fillable] = self.fill_value

        return taken
  958. def searchsorted(self, v, side="left", sorter=None):
  959. msg = "searchsorted requires high memory usage."
  960. warnings.warn(msg, PerformanceWarning, stacklevel=2)
  961. if not is_scalar(v):
  962. v = np.asarray(v)
  963. v = np.asarray(v)
  964. return np.asarray(self, dtype=self.dtype.subtype).searchsorted(
  965. v, side, sorter
  966. )
  967. def copy(self, deep=False):
  968. if deep:
  969. values = self.sp_values.copy()
  970. else:
  971. values = self.sp_values
  972. return self._simple_new(values, self.sp_index, self.dtype)
    @classmethod
    def _concat_same_type(cls, to_concat):
        """
        Concatenate multiple SparseArrays into one, keeping the result
        sparse. The first array's fill value wins; others are converted.
        """
        fill_values = [x.fill_value for x in to_concat]

        fill_value = fill_values[0]

        # np.nan isn't a singleton, so we may end up with multiple
        # NaNs here, so we ignore the all NA case too.
        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
            warnings.warn("Concatenating sparse arrays with multiple fill "
                          "values: '{}'. Picking the first and "
                          "converting the rest.".format(fill_values),
                          PerformanceWarning,
                          stacklevel=6)
            keep = to_concat[0]
            to_concat2 = [keep]

            for arr in to_concat[1:]:
                # Round-trip through dense to re-sparsify with `fill_value`.
                to_concat2.append(cls(np.asarray(arr), fill_value=fill_value))

            to_concat = to_concat2

        values = []
        length = 0

        if to_concat:
            sp_kind = to_concat[0].kind
        else:
            sp_kind = 'integer'

        if sp_kind == 'integer':
            indices = []

            for arr in to_concat:
                # Shift each array's positions past the arrays before it.
                idx = arr.sp_index.to_int_index().indices.copy()
                idx += length  # TODO: wraparound
                length += arr.sp_index.length

                values.append(arr.sp_values)
                indices.append(idx)

            data = np.concatenate(values)
            indices = np.concatenate(indices)

            sp_index = IntIndex(length, indices)

        else:
            # when concatenating block indices, we don't claim that you'll
            # get an identical index as concating the values and then
            # creating a new index. We don't want to spend the time trying
            # to merge blocks across arrays in `to_concat`, so the resulting
            # BlockIndex may have more blocks.
            blengths = []
            blocs = []

            for arr in to_concat:
                idx = arr.sp_index.to_block_index()

                values.append(arr.sp_values)
                blocs.append(idx.blocs.copy() + length)
                blengths.append(idx.blengths)
                length += arr.sp_index.length

            data = np.concatenate(values)
            blocs = np.concatenate(blocs)
            blengths = np.concatenate(blengths)

            sp_index = BlockIndex(length, blocs, blengths)

        return cls(data, sparse_index=sp_index, fill_value=fill_value)
    def astype(self, dtype=None, copy=True):
        """
        Change the dtype of a SparseArray.

        The output will always be a SparseArray. To convert to a dense
        ndarray with a certain dtype, use :meth:`numpy.asarray`.

        Parameters
        ----------
        dtype : np.dtype or ExtensionDtype
            For SparseDtype, this changes the dtype of
            ``self.sp_values`` and the ``self.fill_value``.

            For other dtypes, this only changes the dtype of
            ``self.sp_values``.

        copy : bool, default True
            Whether to ensure a copy is made, even if not necessary.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> arr = SparseArray([0, 0, 1, 2])
        >>> arr
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        >>> arr.astype(np.dtype('int32'))
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a NumPy dtype with a different kind (e.g. float) will coerce
        just ``self.sp_values``.

        >>> arr.astype(np.dtype('float64'))
        ... # doctest: +NORMALIZE_WHITESPACE
        [0, 0, 1.0, 2.0]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Use a SparseDtype if you wish to be change the fill value as well.

        >>> arr.astype(SparseDtype("float64", fill_value=np.nan))
        ... # doctest: +NORMALIZE_WHITESPACE
        [nan, nan, 1.0, 2.0]
        Fill: nan
        IntIndex
        Indices: array([2, 3], dtype=int32)
        """
        # update_dtype wraps a plain numpy dtype into a SparseDtype and
        # keeps/updates the fill value as appropriate.
        dtype = self.dtype.update_dtype(dtype)
        subtype = dtype._subtype_with_str
        sp_values = astype_nansafe(self.sp_values,
                                   subtype,
                                   copy=copy)
        if sp_values is self.sp_values and copy:
            # astype_nansafe may return the input unchanged; honor copy=True.
            sp_values = sp_values.copy()

        return self._simple_new(sp_values,
                                self.sp_index,
                                dtype)
    def map(self, mapper):
        """
        Map categories using input correspondence (dict, Series, or function).

        Parameters
        ----------
        mapper : dict, Series, callable
            The correspondence from old values to new.

        Returns
        -------
        SparseArray
            The output array will have the same density as the input.
            The output fill value will be the result of applying the
            mapping to ``self.fill_value``.

        Examples
        --------
        >>> arr = pd.SparseArray([0, 1, 2])
        >>> arr.map(lambda x: x + 10)
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map({0: 10, 1: 11, 2: 12})
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)
        """
        # this is used in apply.
        # We get hit since we're an "is_extension_type" but regular extension
        # types are not hit. This may be worth adding to the interface.
        if isinstance(mapper, ABCSeries):
            mapper = mapper.to_dict()

        if isinstance(mapper, compat.Mapping):
            # Missing keys map to None (not to the original value).
            fill_value = mapper.get(self.fill_value, self.fill_value)
            sp_values = [mapper.get(x, None) for x in self.sp_values]
        else:
            fill_value = mapper(self.fill_value)
            sp_values = [mapper(x) for x in self.sp_values]

        return type(self)(sp_values, sparse_index=self.sp_index,
                          fill_value=fill_value)
    def to_dense(self):
        """
        Convert SparseArray to a NumPy array.

        Returns
        -------
        arr : NumPy array
        """
        return np.asarray(self, dtype=self.sp_values.dtype)

    # TODO: Look into deprecating this in favor of `to_dense`.
    get_values = to_dense
  1137. # ------------------------------------------------------------------------
  1138. # IO
  1139. # ------------------------------------------------------------------------
    def __setstate__(self, state):
        """Necessary for making this object picklable"""
        if isinstance(state, tuple):
            # Compat for pandas < 0.24.0: old pickles stored a raw ndarray
            # state tuple plus (fill_value, sp_index).
            nd_state, (fill_value, sp_index) = state
            sparse_values = np.array([])
            sparse_values.__setstate__(nd_state)

            self._sparse_values = sparse_values
            self._sparse_index = sp_index
            self._dtype = SparseDtype(sparse_values.dtype, fill_value)
        else:
            self.__dict__.update(state)
  1152. def nonzero(self):
  1153. if self.fill_value == 0:
  1154. return self.sp_index.to_int_index().indices,
  1155. else:
  1156. return self.sp_index.to_int_index().indices[self.sp_values != 0],
  1157. # ------------------------------------------------------------------------
  1158. # Reductions
  1159. # ------------------------------------------------------------------------
    def _reduce(self, name, skipna=True, **kwargs):
        """
        Dispatch a named reduction (``sum``, ``mean``, ``all``, ...) to the
        corresponding method on this array.
        """
        method = getattr(self, name, None)

        if method is None:
            raise TypeError("cannot perform {name} with type {dtype}".format(
                name=name, dtype=self.dtype))

        # NOTE(review): the branches look inverted (dropna only when
        # skipna=False); presumably the reduction methods themselves already
        # skip NA — confirm intent before changing.
        if skipna:
            arr = self
        else:
            arr = self.dropna()

        # we don't support these kwargs.
        # They should only be present when called via pandas, so do it here.
        # instead of in `any` / `all` (which will raise if they're present,
        # thanks to nv.validate
        kwargs.pop('filter_type', None)
        kwargs.pop('numeric_only', None)
        kwargs.pop('op', None)
        return getattr(arr, name)(**kwargs)
    def all(self, axis=None, *args, **kwargs):
        """
        Tests whether all elements evaluate True

        Returns
        -------
        all : bool

        See Also
        --------
        numpy.all
        """
        nv.validate_all(args, kwargs)

        values = self.sp_values

        # A falsy fill value with at least one gap makes the answer False.
        if len(values) != len(self) and not np.all(self.fill_value):
            return False

        # NOTE(review): unlike `any`, this returns np.bool_ (no .item()) —
        # confirm whether the inconsistency is intentional.
        return values.all()
    def any(self, axis=0, *args, **kwargs):
        """
        Tests whether at least one of elements evaluate True

        Returns
        -------
        any : bool

        See Also
        --------
        numpy.any
        """
        nv.validate_any(args, kwargs)

        values = self.sp_values

        # A truthy fill value with at least one gap makes the answer True.
        if len(values) != len(self) and np.any(self.fill_value):
            return True

        return values.any().item()
  1207. def sum(self, axis=0, *args, **kwargs):
  1208. """
  1209. Sum of non-NA/null values
  1210. Returns
  1211. -------
  1212. sum : float
  1213. """
  1214. nv.validate_sum(args, kwargs)
  1215. valid_vals = self._valid_sp_values
  1216. sp_sum = valid_vals.sum()
  1217. if self._null_fill_value:
  1218. return sp_sum
  1219. else:
  1220. nsparse = self.sp_index.ngaps
  1221. return sp_sum + self.fill_value * nsparse
    def cumsum(self, axis=0, *args, **kwargs):
        """
        Cumulative sum of non-NA/null values.

        When performing the cumulative summation, any NA/null values will
        be skipped. The resulting SparseArray will preserve the locations of
        NaN values, but the fill value will be `np.nan` regardless.

        Parameters
        ----------
        axis : int or None
            Axis over which to perform the cumulative summation. If None,
            perform cumulative summation over flattened array.

        Returns
        -------
        cumsum : SparseArray
        """
        nv.validate_cumsum(args, kwargs)

        if axis is not None and axis >= self.ndim:  # Mimic ndarray behaviour.
            raise ValueError("axis(={axis}) out of bounds".format(axis=axis))

        if not self._null_fill_value:
            # Non-NA fill values contribute to the running sum, so the
            # computation has to happen densely.
            return SparseArray(self.to_dense()).cumsum()

        # NA gaps do not contribute; cumsum over the stored values only.
        return SparseArray(self.sp_values.cumsum(), sparse_index=self.sp_index,
                           fill_value=self.fill_value)
  1244. def mean(self, axis=0, *args, **kwargs):
  1245. """
  1246. Mean of non-NA/null values
  1247. Returns
  1248. -------
  1249. mean : float
  1250. """
  1251. nv.validate_mean(args, kwargs)
  1252. valid_vals = self._valid_sp_values
  1253. sp_sum = valid_vals.sum()
  1254. ct = len(valid_vals)
  1255. if self._null_fill_value:
  1256. return sp_sum / ct
  1257. else:
  1258. nsparse = self.sp_index.ngaps
  1259. return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
    def transpose(self, *axes):
        """
        Returns the SparseArray. (1-D, so transposition is a no-op.)
        """
        return self
    @property
    def T(self):
        """
        Returns the SparseArray. (1-D, so transposition is a no-op.)
        """
        return self
  1271. # ------------------------------------------------------------------------
  1272. # Ufuncs
  1273. # ------------------------------------------------------------------------
    def __array_wrap__(self, array, context=None):
        """
        Called by NumPy after a ufunc; re-route through __array_ufunc__ so
        sparsity is preserved.
        """
        from pandas.core.dtypes.generic import ABCSparseSeries

        # NOTE(review): assumes `context` is always supplied; a call with
        # context=None would raise here — confirm callers.
        ufunc, inputs, _ = context
        # Unbox SparseSeries to their underlying SparseArray values.
        inputs = tuple(x.values if isinstance(x, ABCSparseSeries) else x
                       for x in inputs)
        return self.__array_ufunc__(ufunc, '__call__', *inputs)
    # Types (besides SparseArray itself) that __array_ufunc__ will handle.
    _HANDLED_TYPES = (np.ndarray, numbers.Number)
    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
        """
        NumPy ufunc protocol: keep results sparse where we can, otherwise
        densify the inputs and wrap the result.
        """
        out = kwargs.get('out', ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
                return NotImplemented

        # Binary ops we implement ourselves via the dunder methods.
        special = {'add', 'sub', 'mul', 'pow', 'mod', 'floordiv', 'truediv',
                   'divmod', 'eq', 'ne', 'lt', 'gt', 'le', 'ge', 'remainder'}
        if compat.PY2:
            special.add('div')
        # Map ufunc names to the dunder-op spellings.
        aliases = {
            'subtract': 'sub',
            'multiply': 'mul',
            'floor_divide': 'floordiv',
            'true_divide': 'truediv',
            'power': 'pow',
            'remainder': 'mod',
            'divide': 'div',
            'equal': 'eq',
            'not_equal': 'ne',
            'less': 'lt',
            'less_equal': 'le',
            'greater': 'gt',
            'greater_equal': 'ge',
        }

        # Comparisons flip (a < b <=> b > a) rather than using __r*__.
        flipped = {
            'lt': '__gt__',
            'le': '__ge__',
            'gt': '__lt__',
            'ge': '__le__',
            'eq': '__eq__',
            'ne': '__ne__',
        }

        op_name = ufunc.__name__
        op_name = aliases.get(op_name, op_name)

        if op_name in special and kwargs.get('out') is None:
            # Dispatch to our dunder ops, which know how to stay sparse.
            if isinstance(inputs[0], type(self)):
                return getattr(self, '__{}__'.format(op_name))(inputs[1])
            else:
                # Reflected: flip comparisons, use __r*__ for arithmetic.
                name = flipped.get(op_name, '__r{}__'.format(op_name))
                return getattr(self, name)(inputs[0])

        if len(inputs) == 1:
            # No alignment necessary.
            sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
            fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)
            return self._simple_new(sp_values,
                                    self.sp_index,
                                    SparseDtype(sp_values.dtype, fill_value))

        # Fallback: densify every input and apply the ufunc.
        result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs],
                                        **kwargs)
        if out:
            if len(out) == 1:
                out = out[0]
            return out

        if type(result) is tuple:
            return tuple(type(self)(x) for x in result)
        elif method == 'at':
            # no return value
            return None
        else:
            return type(self)(result)
    def __abs__(self):
        # Delegates to np.abs, which routes back through __array_ufunc__.
        return np.abs(self)
  1343. # ------------------------------------------------------------------------
  1344. # Ops
  1345. # ------------------------------------------------------------------------
    @classmethod
    def _create_unary_method(cls, op):
        # Build a unary dunder (__pos__/__neg__/__invert__) that applies
        # `op` to both the stored values and the fill value, staying sparse.
        def sparse_unary_method(self):
            fill_value = op(np.array(self.fill_value)).item()
            values = op(self.sp_values)
            dtype = SparseDtype(values.dtype, fill_value)
            return cls._simple_new(values, self.sp_index, dtype)

        name = '__{name}__'.format(name=op.__name__)
        return compat.set_function_name(sparse_unary_method, name, cls)
    @classmethod
    def _create_arithmetic_method(cls, op):
        # Build an arithmetic dunder that keeps results sparse for
        # SparseArray/scalar operands and aligns array-like operands.
        def sparse_arithmetic_method(self, other):
            op_name = op.__name__

            if isinstance(other, (ABCSeries, ABCIndexClass)):
                # Rely on pandas to dispatch to us.
                return NotImplemented

            if isinstance(other, SparseArray):
                return _sparse_array_op(self, other, op, op_name)

            elif is_scalar(other):
                with np.errstate(all='ignore'):
                    # Apply `op` to the fill value and the stored values
                    # separately; sparsity is preserved.
                    fill = op(_get_fill(self), np.asarray(other))
                    result = op(self.sp_values, other)

                if op_name == 'divmod':
                    # divmod returns a pair; wrap each half.
                    left, right = result
                    lfill, rfill = fill
                    return (_wrap_result(op_name, left, self.sp_index, lfill),
                            _wrap_result(op_name, right, self.sp_index, rfill))

                return _wrap_result(op_name, result, self.sp_index, fill)

            else:
                other = np.asarray(other)
                with np.errstate(all='ignore'):
                    # TODO: delete sparse stuff in core/ops.py
                    # TODO: look into _wrap_result
                    if len(self) != len(other):
                        raise AssertionError(
                            ("length mismatch: {self} vs. {other}".format(
                                self=len(self), other=len(other))))
                    if not isinstance(other, SparseArray):
                        dtype = getattr(other, 'dtype', None)
                        other = SparseArray(other, fill_value=self.fill_value,
                                            dtype=dtype)
                    return _sparse_array_op(self, other, op, op_name)

        name = '__{name}__'.format(name=op.__name__)
        return compat.set_function_name(sparse_arithmetic_method, name, cls)
    @classmethod
    def _create_comparison_method(cls, op):
        # Build a comparison (or logical and/or) dunder returning a boolean
        # SparseArray.
        def cmp_method(self, other):
            op_name = op.__name__

            if op_name in {'and_', 'or_'}:
                # operator.and_/or_ -> 'and'/'or' for result wrapping.
                op_name = op_name[:-1]

            if isinstance(other, (ABCSeries, ABCIndexClass)):
                # Rely on pandas to unbox and dispatch to us.
                return NotImplemented

            if not is_scalar(other) and not isinstance(other, type(self)):
                # convert list-like to ndarray
                other = np.asarray(other)

            if isinstance(other, np.ndarray):
                # TODO: make this more flexible than just ndarray...
                if len(self) != len(other):
                    raise AssertionError("length mismatch: {self} vs. {other}"
                                         .format(self=len(self),
                                                 other=len(other)))
                other = SparseArray(other, fill_value=self.fill_value)

            if isinstance(other, SparseArray):
                return _sparse_array_op(self, other, op, op_name)
            else:
                # Scalar comparison: compare fill value and stored values
                # independently; the index is unchanged.
                with np.errstate(all='ignore'):
                    fill_value = op(self.fill_value, other)
                    result = op(self.sp_values, other)

                return type(self)(result,
                                  sparse_index=self.sp_index,
                                  fill_value=fill_value,
                                  dtype=np.bool_)

        name = '__{name}__'.format(name=op.__name__)
        return compat.set_function_name(cmp_method, name, cls)
    @classmethod
    def _add_unary_ops(cls):
        # Install the unary dunder operators on the class.
        cls.__pos__ = cls._create_unary_method(operator.pos)
        cls.__neg__ = cls._create_unary_method(operator.neg)
        cls.__invert__ = cls._create_unary_method(operator.invert)
    @classmethod
    def _add_comparison_ops(cls):
        # Logical and/or reuse the comparison machinery, then the base
        # class installs the standard comparison dunders.
        cls.__and__ = cls._create_comparison_method(operator.and_)
        cls.__or__ = cls._create_comparison_method(operator.or_)
        super(SparseArray, cls)._add_comparison_ops()
  1431. # ----------
  1432. # Formatting
  1433. # -----------
  1434. def __unicode__(self):
  1435. return '{self}\nFill: {fill}\n{index}'.format(
  1436. self=printing.pprint_thing(self),
  1437. fill=printing.pprint_thing(self.fill_value),
  1438. index=printing.pprint_thing(self.sp_index))
    def _formatter(self, boxed=False):
        # Defer to the formatter from the GenericArrayFormatter calling us.
        # This will infer the correct formatter from the dtype of the values.
        return None
# Attach the operator-backed dunder methods (arithmetic, comparison/logical,
# unary) now that the class body is complete.
SparseArray._add_arithmetic_ops()
SparseArray._add_comparison_ops()
SparseArray._add_unary_ops()
  1446. def _maybe_to_dense(obj):
  1447. """
  1448. try to convert to dense
  1449. """
  1450. if hasattr(obj, 'to_dense'):
  1451. return obj.to_dense()
  1452. return obj
def _maybe_to_sparse(array):
    """
    array must be SparseSeries or SparseArray
    """
    # Unwrap a SparseSeries to (a copy of) its underlying SparseArray;
    # anything else passes through unchanged.
    if isinstance(array, ABCSparseSeries):
        array = array.values.copy()
    return array
def _sanitize_values(arr):
    """
    return an ndarray for our input,
    in a platform independent manner
    """
    if hasattr(arr, 'values'):
        # Series/Index-like: unwrap to the underlying array.
        arr = arr.values
    else:

        # scalar
        if is_scalar(arr):
            arr = [arr]

        # ndarray
        if isinstance(arr, np.ndarray):
            pass

        elif is_list_like(arr) and len(arr) > 0:
            # Let pandas pick platform-independent int/float widths.
            arr = maybe_convert_platform(arr)

        else:
            arr = np.asarray(arr)

    return arr
def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
    dtype : np.dtype, optional
    copy : bool, default False

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """
    arr = _sanitize_values(arr)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isna(fill_value):
        # NA fill: stored values are exactly the non-NA entries.
        mask = notna(arr)
    else:
        # For str arrays in NumPy 1.12.0, operator!= below isn't
        # element-wise but just returns False if fill_value is not str,
        # so cast to object comparison to be safe
        if is_string_dtype(arr):
            arr = arr.astype(object)

        if is_object_dtype(arr.dtype):
            # element-wise equality check method in numpy doesn't treat
            # each element type, eg. 0, 0.0, and False are treated as
            # same. So we have to check the both of its type and value.
            mask = splib.make_mask_object_ndarray(arr, fill_value)
        else:
            mask = arr != fill_value

    length = len(arr)
    if length != len(mask):
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = _make_index(length, indices, kind)
    sparsified_values = arr[mask]
    if dtype is not None:
        sparsified_values = astype_nansafe(sparsified_values, dtype=dtype)
    # TODO: copy
    return sparsified_values, index, fill_value
  1525. def _make_index(length, indices, kind):
  1526. if kind == 'block' or isinstance(kind, BlockIndex):
  1527. locs, lens = splib.get_blocks(indices)
  1528. index = BlockIndex(length, locs, lens)
  1529. elif kind == 'integer' or isinstance(kind, IntIndex):
  1530. index = IntIndex(length, indices)
  1531. else: # pragma: no cover
  1532. raise ValueError('must be block or integer type')
  1533. return index
  1534. # ----------------------------------------------------------------------------
  1535. # Accessor
  1536. @delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
  1537. 'sp_values'],
  1538. typ='property')
  1539. class SparseAccessor(PandasDelegate):
  1540. """
  1541. Accessor for SparseSparse from other sparse matrix data types.
  1542. """
  1543. def __init__(self, data=None):
  1544. self._validate(data)
  1545. # Store the Series since we need that for to_coo
  1546. self._parent = data
  1547. @staticmethod
  1548. def _validate(data):
  1549. if not isinstance(data.dtype, SparseDtype):
  1550. msg = "Can only use the '.sparse' accessor with Sparse data."
  1551. raise AttributeError(msg)
  1552. def _delegate_property_get(self, name, *args, **kwargs):
  1553. return getattr(self._parent.values, name)
  1554. def _delegate_method(self, name, *args, **kwargs):
  1555. if name == 'from_coo':
  1556. return self.from_coo(*args, **kwargs)
  1557. elif name == 'to_coo':
  1558. return self.to_coo(*args, **kwargs)
  1559. else:
  1560. raise ValueError
  1561. @classmethod
  1562. def from_coo(cls, A, dense_index=False):
  1563. """
  1564. Create a SparseSeries from a scipy.sparse.coo_matrix.
  1565. Parameters
  1566. ----------
  1567. A : scipy.sparse.coo_matrix
  1568. dense_index : bool, default False
  1569. If False (default), the SparseSeries index consists of only the
  1570. coords of the non-null entries of the original coo_matrix.
  1571. If True, the SparseSeries index consists of the full sorted
  1572. (row, col) coordinates of the coo_matrix.
  1573. Returns
  1574. -------
  1575. s : SparseSeries
  1576. Examples
  1577. ---------
  1578. >>> from scipy import sparse
  1579. >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
  1580. shape=(3, 4))
  1581. >>> A
  1582. <3x4 sparse matrix of type '<class 'numpy.float64'>'
  1583. with 3 stored elements in COOrdinate format>
  1584. >>> A.todense()
  1585. matrix([[ 0., 0., 1., 2.],
  1586. [ 3., 0., 0., 0.],
  1587. [ 0., 0., 0., 0.]])
  1588. >>> ss = pd.SparseSeries.from_coo(A)
  1589. >>> ss
  1590. 0 2 1
  1591. 3 2
  1592. 1 0 3
  1593. dtype: float64
  1594. BlockIndex
  1595. Block locations: array([0], dtype=int32)
  1596. Block lengths: array([3], dtype=int32)
  1597. """
  1598. from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series
  1599. from pandas import Series
  1600. result = _coo_to_sparse_series(A, dense_index=dense_index)
  1601. # SparseSeries -> Series[sparse]
  1602. result = Series(result.values, index=result.index, copy=False)
  1603. return result
  1604. def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
  1605. """
  1606. Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex.
  1607. Use row_levels and column_levels to determine the row and column
  1608. coordinates respectively. row_levels and column_levels are the names
  1609. (labels) or numbers of the levels. {row_levels, column_levels} must be
  1610. a partition of the MultiIndex level names (or numbers).
  1611. Parameters
  1612. ----------
  1613. row_levels : tuple/list
  1614. column_levels : tuple/list
  1615. sort_labels : bool, default False
  1616. Sort the row and column labels before forming the sparse matrix.
  1617. Returns
  1618. -------
  1619. y : scipy.sparse.coo_matrix
  1620. rows : list (row labels)
  1621. columns : list (column labels)
  1622. Examples
  1623. --------
  1624. >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
  1625. >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0),
  1626. (1, 2, 'a', 1),
  1627. (1, 1, 'b', 0),
  1628. (1, 1, 'b', 1),
  1629. (2, 1, 'b', 0),
  1630. (2, 1, 'b', 1)],
  1631. names=['A', 'B', 'C', 'D'])
  1632. >>> ss = s.to_sparse()
  1633. >>> A, rows, columns = ss.to_coo(row_levels=['A', 'B'],
  1634. column_levels=['C', 'D'],
  1635. sort_labels=True)
  1636. >>> A
  1637. <3x4 sparse matrix of type '<class 'numpy.float64'>'
  1638. with 3 stored elements in COOrdinate format>
  1639. >>> A.todense()
  1640. matrix([[ 0., 0., 1., 3.],
  1641. [ 3., 0., 0., 0.],
  1642. [ 0., 0., 0., 0.]])
  1643. >>> rows
  1644. [(1, 1), (1, 2), (2, 1)]
  1645. >>> columns
  1646. [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
  1647. """
  1648. from pandas.core.sparse.scipy_sparse import _sparse_series_to_coo
  1649. A, rows, columns = _sparse_series_to_coo(self._parent,
  1650. row_levels,
  1651. column_levels,
  1652. sort_labels=sort_labels)
  1653. return A, rows, columns