# frame.py
  1. """
  2. Data structures for sparse float data. Life is made simpler by dealing only
  3. with float64 data
  4. """
  5. from __future__ import division
  6. import warnings
  7. import numpy as np
  8. from pandas._libs.sparse import BlockIndex, get_blocks
  9. import pandas.compat as compat
  10. from pandas.compat import lmap
  11. from pandas.compat.numpy import function as nv
  12. from pandas.util._decorators import Appender
  13. from pandas.core.dtypes.cast import find_common_type, maybe_upcast
  14. from pandas.core.dtypes.common import ensure_platform_int, is_scipy_sparse
  15. from pandas.core.dtypes.missing import isna, notna
  16. import pandas.core.algorithms as algos
  17. from pandas.core.arrays.sparse import SparseArray, SparseDtype
  18. import pandas.core.common as com
  19. from pandas.core.frame import DataFrame
  20. import pandas.core.generic as generic
  21. from pandas.core.index import Index, MultiIndex, ensure_index
  22. import pandas.core.indexes.base as ibase
  23. from pandas.core.internals import (
  24. BlockManager, create_block_manager_from_arrays)
  25. from pandas.core.internals.construction import extract_index, prep_ndarray
  26. import pandas.core.ops as ops
  27. from pandas.core.series import Series
  28. from pandas.core.sparse.series import SparseSeries
  29. # pylint: disable=E1101,E1103,W0231,E0202
  30. _shared_doc_kwargs = dict(klass='SparseDataFrame')
class SparseDataFrame(DataFrame):
    """
    DataFrame containing sparse floating point data in the form of
    SparseSeries objects

    Parameters
    ----------
    data : same types as can be passed to DataFrame or scipy.sparse.spmatrix
        .. versionchanged :: 0.23.0
           If data is a dict, argument order is maintained for Python 3.6
           and later.

    index : array-like, optional
    columns : array-like, optional
    default_kind : {'block', 'integer'}, default 'block'
        Default sparse kind for converting Series to SparseSeries. Will not
        override SparseSeries passed into constructor
    default_fill_value : float
        Default fill_value for converting Series to SparseSeries
        (default: nan). Will not override SparseSeries passed in.
    """
    # marker used by pandas internals to recognize sparse frames
    _subtyp = 'sparse_frame'
    def __init__(self, data=None, index=None, columns=None, default_kind=None,
                 default_fill_value=None, dtype=None, copy=False):
        # Dispatch on the type of ``data`` to build a BlockManager of
        # sparse columns; every branch ends in NDFrame.__init__(self, mgr).

        # pick up the defaults from the Sparse structures
        if isinstance(data, SparseDataFrame):
            if index is None:
                index = data.index
            if columns is None:
                columns = data.columns
            if default_fill_value is None:
                default_fill_value = data.default_fill_value
            if default_kind is None:
                default_kind = data.default_kind
        elif isinstance(data, (SparseSeries, SparseArray)):
            if index is None:
                index = data.index
            if default_fill_value is None:
                default_fill_value = data.fill_value
            # a single series/array becomes a one-column frame; it must
            # carry a name or columns must be given explicitly
            if columns is None and hasattr(data, 'name'):
                columns = [data.name]
            if columns is None:
                raise Exception("cannot pass a series w/o a name or columns")
            data = {columns[0]: data}

        # global defaults when nothing was inherited from ``data``
        if default_fill_value is None:
            default_fill_value = np.nan
        if default_kind is None:
            default_kind = 'block'

        self._default_kind = default_kind
        self._default_fill_value = default_fill_value

        if is_scipy_sparse(data):
            mgr = self._init_spmatrix(data, index, columns, dtype=dtype,
                                      fill_value=default_fill_value)
        elif isinstance(data, dict):
            mgr = self._init_dict(data, index, columns, dtype=dtype)
        elif isinstance(data, (np.ndarray, list)):
            mgr = self._init_matrix(data, index, columns, dtype=dtype)
        elif isinstance(data, SparseDataFrame):
            mgr = self._init_mgr(data._data,
                                 dict(index=index, columns=columns),
                                 dtype=dtype, copy=copy)
        elif isinstance(data, DataFrame):
            mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
        elif isinstance(data, Series):
            mgr = self._init_dict(data.to_frame(), data.index,
                                  columns=None, dtype=dtype)
        elif isinstance(data, BlockManager):
            mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                                 dtype=dtype, copy=copy)
        elif data is None:
            # empty frame: any requested columns become all-NaN sparse
            data = DataFrame()

            if index is None:
                index = Index([])
            else:
                index = ensure_index(index)

            if columns is None:
                columns = Index([])
            else:
                for c in columns:
                    data[c] = SparseArray(np.nan, index=index,
                                          kind=self._default_kind,
                                          fill_value=self._default_fill_value)
            mgr = to_manager(data, columns, index)
            if dtype is not None:
                mgr = mgr.astype(dtype)
        else:
            msg = ('SparseDataFrame called with unknown type "{data_type}" '
                   'for data argument')
            raise TypeError(msg.format(data_type=type(data).__name__))

        generic.NDFrame.__init__(self, mgr)
    @property
    def _constructor(self):
        # constructor used by pandas internals when building new frames
        # of this type (e.g. slicing, arithmetic results)
        return SparseDataFrame

    # single-column slices come back as SparseSeries
    _constructor_sliced = SparseSeries
    def _init_dict(self, data, index, columns, dtype=None):
        """
        Build a BlockManager from a dict of column label -> array-like.

        Each value is aligned to ``index`` and converted to a SparseArray
        using this frame's default kind/fill value; requested columns that
        are missing from ``data`` are filled with an all-NaN sparse column.
        """
        # pre-filter out columns if we passed it
        if columns is not None:
            columns = ensure_index(columns)
            data = {k: v for k, v in compat.iteritems(data) if k in columns}
        else:
            keys = com.dict_keys_to_ordered_list(data)
            columns = Index(keys)

        if index is None:
            index = extract_index(list(data.values()))

        def sp_maker(x):
            # wrap as SparseArray with this frame's sparse defaults
            return SparseArray(x, kind=self._default_kind,
                               fill_value=self._default_fill_value,
                               copy=True, dtype=dtype)

        sdict = {}
        for k, v in compat.iteritems(data):
            if isinstance(v, Series):
                # Force alignment, no copy necessary
                if not v.index.equals(index):
                    v = v.reindex(index)

                if not isinstance(v, SparseSeries):
                    v = sp_maker(v.values)
            elif isinstance(v, SparseArray):
                v = v.copy()
            else:
                if isinstance(v, dict):
                    # dict value: look up each index label, NaN if absent
                    v = [v.get(i, np.nan) for i in index]

                v = sp_maker(v)

            if index is not None and len(v) != len(index):
                msg = "Length of passed values is {}, index implies {}"
                raise ValueError(msg.format(len(v), len(index)))
            sdict[k] = v

        if len(columns.difference(sdict)):
            # TODO: figure out how to handle this case, all nan's?
            # add in any other columns we want to have (completeness)
            nan_arr = np.empty(len(index), dtype='float64')
            nan_arr.fill(np.nan)
            nan_arr = SparseArray(nan_arr, kind=self._default_kind,
                                  fill_value=self._default_fill_value,
                                  copy=False)
            sdict.update((c, nan_arr) for c in columns if c not in sdict)

        return to_manager(sdict, columns, index)
  165. def _init_matrix(self, data, index, columns, dtype=None):
  166. """ Init self from ndarray or list of lists """
  167. data = prep_ndarray(data, copy=False)
  168. index, columns = self._prep_index(data, index, columns)
  169. data = {idx: data[:, i] for i, idx in enumerate(columns)}
  170. return self._init_dict(data, index, columns, dtype)
    def _init_spmatrix(self, data, index, columns, dtype=None,
                       fill_value=None):
        """ Init self from scipy.sparse matrix """
        index, columns = self._prep_index(data, index, columns)
        # COO layout exposes parallel (row, col, data) coordinate arrays
        data = data.tocoo()
        N = len(index)

        # Construct a dict of SparseSeries
        sdict = {}
        values = Series(data.data, index=data.row, copy=False)
        for col, rowvals in values.groupby(data.col):
            # get_blocks expects int32 row indices in sorted order
            rowvals = rowvals.sort_index()
            rows = rowvals.index.values.astype(np.int32)
            blocs, blens = get_blocks(rows)

            sdict[columns[col]] = SparseSeries(
                rowvals.values, index=index,
                fill_value=fill_value,
                sparse_index=BlockIndex(N, blocs, blens))

        # Add any columns that were empty and thus not grouped on above
        sdict.update({column: SparseSeries(index=index,
                                           fill_value=fill_value,
                                           sparse_index=BlockIndex(N, [], []))
                      for column in columns
                      if column not in sdict})

        return self._init_dict(sdict, index, columns, dtype)
  196. def _prep_index(self, data, index, columns):
  197. N, K = data.shape
  198. if index is None:
  199. index = ibase.default_index(N)
  200. if columns is None:
  201. columns = ibase.default_index(K)
  202. if len(columns) != K:
  203. raise ValueError('Column length mismatch: {columns} vs. {K}'
  204. .format(columns=len(columns), K=K))
  205. if len(index) != N:
  206. raise ValueError('Index length mismatch: {index} vs. {N}'
  207. .format(index=len(index), N=N))
  208. return index, columns
    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        .. versionadded:: 0.20.0

        Returns
        -------
        coo_matrix : scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.
        """
        try:
            from scipy.sparse import coo_matrix
        except ImportError:
            raise ImportError('Scipy is not installed')

        dtype = find_common_type(self.dtypes)
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, datas = [], [], []
        for col, name in enumerate(self):
            s = self[name]
            # positions of the stored (non-fill) values within this column
            row = s.sp_index.to_int_index().indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            datas.append(s.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        datas = np.concatenate(datas)
        return coo_matrix((datas, (rows, cols)), shape=self.shape)
  245. def __array_wrap__(self, result):
  246. return self._constructor(
  247. result, index=self.index, columns=self.columns,
  248. default_kind=self._default_kind,
  249. default_fill_value=self._default_fill_value).__finalize__(self)
  250. def __getstate__(self):
  251. # pickling
  252. return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data,
  253. _default_fill_value=self._default_fill_value,
  254. _default_kind=self._default_kind)
    def _unpickle_sparse_frame_compat(self, state):
        """ original pickle format """
        # state is the legacy 5-tuple:
        # (per-column sparse data, columns, index, fill value, kind)
        series, cols, idx, fv, kind = state

        if not isinstance(cols, Index):  # pragma: no cover
            from pandas.io.pickle import _unpickle_array
            columns = _unpickle_array(cols)
        else:
            columns = cols

        if not isinstance(idx, Index):  # pragma: no cover
            from pandas.io.pickle import _unpickle_array
            index = _unpickle_array(idx)
        else:
            index = idx

        series_dict = DataFrame()
        for col, (sp_index, sp_values) in compat.iteritems(series):
            series_dict[col] = SparseSeries(sp_values, sparse_index=sp_index,
                                            fill_value=fv)

        self._data = to_manager(series_dict, columns, index)
        self._default_fill_value = fv
        self._default_kind = kind
  275. def to_dense(self):
  276. """
  277. Convert to dense DataFrame
  278. Returns
  279. -------
  280. df : DataFrame
  281. """
  282. data = {k: v.to_dense() for k, v in compat.iteritems(self)}
  283. return DataFrame(data, index=self.index, columns=self.columns)
  284. def _apply_columns(self, func):
  285. """ get new SparseDataFrame applying func to each columns """
  286. new_data = {col: func(series)
  287. for col, series in compat.iteritems(self)}
  288. return self._constructor(
  289. data=new_data, index=self.index, columns=self.columns,
  290. default_fill_value=self.default_fill_value).__finalize__(self)
  291. def astype(self, dtype):
  292. return self._apply_columns(lambda x: x.astype(dtype))
  293. def copy(self, deep=True):
  294. """
  295. Make a copy of this SparseDataFrame
  296. """
  297. result = super(SparseDataFrame, self).copy(deep=deep)
  298. result._default_fill_value = self._default_fill_value
  299. result._default_kind = self._default_kind
  300. return result
    @property
    def default_fill_value(self):
        # fill value applied when converting Series to SparseSeries
        return self._default_fill_value

    @property
    def default_kind(self):
        # sparse index kind ('block' or 'integer') used for new columns
        return self._default_kind
  307. @property
  308. def density(self):
  309. """
  310. Ratio of non-sparse points to total (dense) data points
  311. represented in the frame
  312. """
  313. tot_nonsparse = sum(ser.sp_index.npoints
  314. for _, ser in compat.iteritems(self))
  315. tot = len(self.index) * len(self.columns)
  316. return tot_nonsparse / float(tot)
    def fillna(self, value=None, method=None, axis=0, inplace=False,
               limit=None, downcast=None):
        # Delegate the actual filling to the generic implementation, then
        # record a plain-scalar fill as the frame's new default fill value.
        new_self = super(SparseDataFrame,
                         self).fillna(value=value, method=method, axis=axis,
                                      inplace=inplace, limit=limit,
                                      downcast=downcast)
        if not inplace:
            # rebind so the bookkeeping below mutates the returned copy,
            # not the original frame
            self = new_self

        # set the fill value if we are filling as a scalar with nothing
        # special going on (``value == value`` filters out NaN)
        if (value is not None and value == value and method is None and
                limit is None):
            self._default_fill_value = value

        if not inplace:
            return self
  332. # ----------------------------------------------------------------------
  333. # Support different internal representation of SparseDataFrame
    def _sanitize_column(self, key, value, **kwargs):
        """
        Creates a new SparseArray from the input value.

        Parameters
        ----------
        key : object
        value : scalar, Series, or array-like
        kwargs : dict
            Accepted for signature compatibility with DataFrame; unused.

        Returns
        -------
        sanitized_column : SparseArray
        """
        def sp_maker(x, index=None):
            # wrap with this frame's sparse defaults
            return SparseArray(x, index=index,
                               fill_value=self._default_fill_value,
                               kind=self._default_kind)
        if isinstance(value, SparseSeries):
            clean = value.reindex(self.index).as_sparse_array(
                fill_value=self._default_fill_value, kind=self._default_kind)

        elif isinstance(value, SparseArray):
            if len(value) != len(self.index):
                raise AssertionError('Length of values does not match '
                                     'length of index')
            clean = value

        elif hasattr(value, '__iter__'):
            if isinstance(value, Series):
                clean = value.reindex(self.index)
                if not isinstance(value, SparseSeries):
                    clean = sp_maker(clean)
            else:
                if len(value) != len(self.index):
                    raise AssertionError('Length of values does not match '
                                         'length of index')
                clean = sp_maker(value)

        # Scalar
        else:
            clean = sp_maker(value, self.index)

        # always return a SparseArray!
        return clean
    def get_value(self, index, col, takeable=False):
        """
        Quickly retrieve single value at passed column and index

        .. deprecated:: 0.21.0
            Please use .at[] or .iat[] accessors.

        Parameters
        ----------
        index : row label
        col : column label
        takeable : interpret the index/col as indexers, default False

        Returns
        -------
        value : scalar value
        """
        warnings.warn("get_value is deprecated and will be removed "
                      "in a future release. Please use "
                      ".at[] or .iat[] accessors instead", FutureWarning,
                      stacklevel=2)
        return self._get_value(index, col, takeable=takeable)

    def _get_value(self, index, col, takeable=False):
        # non-deprecated workhorse behind ``get_value``
        if takeable is True:
            series = self._iget_item_cache(col)
        else:
            series = self._get_item_cache(col)

        return series._get_value(index, takeable=takeable)
    _get_value.__doc__ = get_value.__doc__

    def set_value(self, index, col, value, takeable=False):
        """
        Put single value at passed column and index

        .. deprecated:: 0.21.0
            Please use .at[] or .iat[] accessors.

        Parameters
        ----------
        index : row label
        col : column label
        value : scalar value
        takeable : interpret the index/col as indexers, default False

        Notes
        -----
        This method *always* returns a new object. It is currently not
        particularly efficient (and potentially very expensive) but is
        provided for API compatibility with DataFrame

        Returns
        -------
        frame : DataFrame
        """
        warnings.warn("set_value is deprecated and will be removed "
                      "in a future release. Please use "
                      ".at[] or .iat[] accessors instead", FutureWarning,
                      stacklevel=2)
        return self._set_value(index, col, value, takeable=takeable)

    def _set_value(self, index, col, value, takeable=False):
        # densify, set the single value, then re-sparsify; correct but
        # expensive, as documented on ``set_value``
        dense = self.to_dense()._set_value(
            index, col, value, takeable=takeable)
        return dense.to_sparse(kind=self._default_kind,
                               fill_value=self._default_fill_value)
    _set_value.__doc__ = set_value.__doc__
  430. def _slice(self, slobj, axis=0, kind=None):
  431. if axis == 0:
  432. new_index = self.index[slobj]
  433. new_columns = self.columns
  434. else:
  435. new_index = self.index
  436. new_columns = self.columns[slobj]
  437. return self.reindex(index=new_index, columns=new_columns)
    def xs(self, key, axis=0, copy=False):
        """
        Returns a row (cross-section) from the SparseDataFrame as a Series
        object.

        Parameters
        ----------
        key : some index contained in the index
        axis : {0, 1}, default 0
            With axis=1 the column ``key`` is returned instead of a row.
        copy : bool, default False
            Accepted for API compatibility; not used here.

        Returns
        -------
        xs : Series
        """
        if axis == 1:
            data = self[key]
            return data

        i = self.index.get_loc(key)
        # take the single row and densify it into a plain Series
        data = self.take([i]).get_values()[0]
        return Series(data, index=self.columns)
  455. # ----------------------------------------------------------------------
  456. # Arithmetic-related methods
    def _combine_frame(self, other, func, fill_value=None, level=None):
        """
        Combine with another DataFrame column-by-column using ``func``
        after an outer align of both axes.
        """
        if level is not None:
            raise NotImplementedError("'level' argument is not supported")

        this, other = self.align(other, join='outer', level=level, copy=False)
        new_index, new_columns = this.index, this.columns

        if self.empty and other.empty:
            return self._constructor(index=new_index).__finalize__(self)

        new_data = {}
        if fill_value is not None:
            # TODO: be a bit more intelligent here
            for col in new_columns:
                if col in this and col in other:
                    # densify so the scalar fill_value applies to holes
                    dleft = this[col].to_dense()
                    dright = other[col].to_dense()
                    result = dleft._binop(dright, func, fill_value=fill_value)
                    result = result.to_sparse(fill_value=this[col].fill_value)
                    new_data[col] = result
        else:
            for col in new_columns:
                if col in this and col in other:
                    new_data[col] = func(this[col], other[col])

        new_fill_value = self._get_op_result_fill_value(other, func)

        return self._constructor(data=new_data, index=new_index,
                                 columns=new_columns,
                                 default_fill_value=new_fill_value
                                 ).__finalize__(self)

    def _combine_match_index(self, other, func, level=None):
        # combine with a Series aligned on the row axis
        new_data = {}

        if level is not None:
            raise NotImplementedError("'level' argument is not supported")

        this, other = self.align(other, join='outer', axis=0, level=level,
                                 copy=False)

        for col, series in compat.iteritems(this):
            new_data[col] = func(series.values, other.values)

        fill_value = self._get_op_result_fill_value(other, func)

        return self._constructor(
            new_data, index=this.index, columns=self.columns,
            default_fill_value=fill_value).__finalize__(self)

    def _combine_match_columns(self, other, func, level=None):
        # patched version of DataFrame._combine_match_columns to account for
        # NumPy circumventing __rsub__ with float64 types, e.g.: 3.0 - series,
        # where 3.0 is numpy.float64 and series is a SparseSeries. Still
        # possible for this to happen, which is bothersome
        if level is not None:
            raise NotImplementedError("'level' argument is not supported")

        left, right = self.align(other, join='outer', axis=1, level=level,
                                 copy=False)
        assert left.columns.equals(right.index)

        new_data = {}

        for col in left.columns:
            # float() defeats the NumPy __rsub__ circumvention noted above
            new_data[col] = func(left[col], float(right[col]))

        return self._constructor(
            new_data, index=left.index, columns=left.columns,
            default_fill_value=self.default_fill_value).__finalize__(self)

    def _combine_const(self, other, func):
        # scalar operand: apply column-wise
        return self._apply_columns(lambda x: func(x, other))

    def _get_op_result_fill_value(self, other, func):
        """
        Decide the ``default_fill_value`` for the result of a binary op,
        based on the two operands' fill values.
        """
        own_default = self.default_fill_value

        if isinstance(other, DataFrame):
            # i.e. called from _combine_frame

            other_default = getattr(other, 'default_fill_value', np.nan)

            # if the fill values are the same use them? or use a valid one
            if own_default == other_default:
                # TODO: won't this evaluate as False if both are np.nan?
                fill_value = own_default
            elif np.isnan(own_default) and not np.isnan(other_default):
                fill_value = other_default
            elif not np.isnan(own_default) and np.isnan(other_default):
                fill_value = own_default
            else:
                fill_value = None

        elif isinstance(other, SparseSeries):
            # i.e. called from _combine_match_index

            # fill_value is a function of our operator
            if isna(other.fill_value) or isna(own_default):
                fill_value = np.nan
            else:
                fill_value = func(np.float64(own_default),
                                  np.float64(other.fill_value))
        else:
            raise NotImplementedError(type(other))

        return fill_value
    def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
                       limit=None, takeable=False):
        """
        Conform this frame to a new row ``index``, padding rows missing
        from the current index with ``fill_value``.
        """
        if level is not None:
            raise TypeError('Reindex by level not supported for sparse')

        if self.index.equals(index):
            if copy:
                return self.copy()
            else:
                return self

        if len(self.index) == 0:
            return self._constructor(
                index=index, columns=self.columns).__finalize__(self)

        indexer = self.index.get_indexer(index, method, limit=limit)
        indexer = ensure_platform_int(indexer)
        mask = indexer == -1          # rows absent from the current index
        need_mask = mask.any()

        new_series = {}
        for col, series in self.iteritems():
            if mask.all():
                continue

            values = series.values
            # .take returns SparseArray
            new = values.take(indexer)
            if need_mask:
                new = new.values

                # convert integer to float if necessary. need to do a lot
                # more than that, handle boolean etc also
                new, fill_value = maybe_upcast(new, fill_value=fill_value)
                np.putmask(new, mask, fill_value)

            new_series[col] = new

        return self._constructor(
            new_series, index=index, columns=self.columns,
            default_fill_value=self._default_fill_value).__finalize__(self)

    def _reindex_columns(self, columns, method, copy, level, fill_value=None,
                         limit=None, takeable=False):
        """
        Conform this frame to a new set of ``columns``; only plain
        subsetting is supported (no fill/limit/method).
        """
        if level is not None:
            raise TypeError('Reindex by level not supported for sparse')

        if notna(fill_value):
            raise NotImplementedError("'fill_value' argument is not supported")

        if limit:
            raise NotImplementedError("'limit' argument is not supported")

        if method is not None:
            raise NotImplementedError("'method' argument is not supported")

        # TODO: fill value handling
        sdict = {k: v for k, v in compat.iteritems(self) if k in columns}
        return self._constructor(
            sdict, index=self.index, columns=columns,
            default_fill_value=self._default_fill_value).__finalize__(self)

    def _reindex_with_indexers(self, reindexers, method=None, fill_value=None,
                               limit=None, copy=False, allow_dups=False):
        """
        Low-level reindex driven by precomputed (axis -> (labels, indexer))
        pairs, as produced by the generic reindexing machinery.
        """
        if method is not None or limit is not None:
            raise NotImplementedError("cannot reindex with a method or limit "
                                      "with sparse")

        if fill_value is None:
            fill_value = np.nan

        reindexers = {self._get_axis_number(a): val
                      for (a, val) in compat.iteritems(reindexers)}

        index, row_indexer = reindexers.get(0, (None, None))
        columns, col_indexer = reindexers.get(1, (None, None))

        if columns is None:
            columns = self.columns

        new_arrays = {}
        for col in columns:
            if col not in self:
                continue
            if row_indexer is not None:
                new_arrays[col] = algos.take_1d(self[col].get_values(),
                                                row_indexer,
                                                fill_value=fill_value)
            else:
                new_arrays[col] = self[col]

        return self._constructor(new_arrays, index=index,
                                 columns=columns).__finalize__(self)
    def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
                     sort=False):
        # sparse frames only support index-on-index joins
        if on is not None:
            raise NotImplementedError("'on' keyword parameter is not yet "
                                      "implemented")
        return self._join_index(other, how, lsuffix, rsuffix)

    def _join_index(self, other, how, lsuffix, rsuffix):
        """
        Join on the index with ``other`` (frame or named Series) by
        reindexing both sides to the joined index and concatenating.
        """
        if isinstance(other, Series):
            if other.name is None:
                raise ValueError('Other Series must have a name')

            other = SparseDataFrame(
                {other.name: other},
                default_fill_value=self._default_fill_value)

        join_index = self.index.join(other.index, how=how)

        this = self.reindex(join_index)
        other = other.reindex(join_index)

        this, other = this._maybe_rename_join(other, lsuffix, rsuffix)

        from pandas import concat
        return concat([this, other], axis=1, verify_integrity=True)

    def _maybe_rename_join(self, other, lsuffix, rsuffix):
        """
        Disambiguate overlapping column names using the given suffixes;
        raises if columns overlap and no suffix was supplied.
        """
        to_rename = self.columns.intersection(other.columns)
        if len(to_rename) > 0:
            if not lsuffix and not rsuffix:
                raise ValueError('columns overlap but no suffix specified: '
                                 '{to_rename}'.format(to_rename=to_rename))

            def lrenamer(x):
                if x in to_rename:
                    return '{x}{lsuffix}'.format(x=x, lsuffix=lsuffix)
                return x

            def rrenamer(x):
                if x in to_rename:
                    return '{x}{rsuffix}'.format(x=x, rsuffix=rsuffix)
                return x

            this = self.rename(columns=lrenamer)
            other = other.rename(columns=rrenamer)
        else:
            this = self

        return this, other
    def transpose(self, *args, **kwargs):
        """
        Returns a DataFrame with the rows/columns switched.
        """
        nv.validate_transpose(args, kwargs)
        # densify to a 2-D array, transpose, then rebuild with the same
        # sparse defaults
        return self._constructor(
            self.values.T, index=self.columns, columns=self.index,
            default_fill_value=self._default_fill_value,
            default_kind=self._default_kind).__finalize__(self)
    T = property(transpose)
    @Appender(DataFrame.count.__doc__)
    def count(self, axis=0, **kwds):
        if axis is None:
            axis = self._stat_axis_number

        # count non-NA cells per column (axis=0) or per row (axis=1)
        return self.apply(lambda x: x.count(), axis=axis)
    def cumsum(self, axis=0, *args, **kwargs):
        """
        Return SparseDataFrame of cumulative sums over requested axis.

        Parameters
        ----------
        axis : {0, 1}
            0 for row-wise, 1 for column-wise

        Returns
        -------
        y : SparseDataFrame
        """
        nv.validate_cumsum(args, kwargs)

        if axis is None:
            axis = self._stat_axis_number

        return self.apply(lambda x: x.cumsum(), axis=axis)
    @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs)
    def isna(self):
        # elementwise NA mask, computed column-by-column to stay sparse
        return self._apply_columns(lambda x: x.isna())
    isnull = isna

    @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs)
    def notna(self):
        # elementwise non-NA mask, computed column-by-column to stay sparse
        return self._apply_columns(lambda x: x.notna())
    notnull = notna
    def apply(self, func, axis=0, broadcast=None, reduce=None,
              result_type=None):
        """
        Analogous to DataFrame.apply, for SparseDataFrame

        Parameters
        ----------
        func : function
            Function to apply to each column
        axis : {0, 1, 'index', 'columns'}
        broadcast : bool, default False
            For aggregation functions, return object of same size with
            values propagated

            .. deprecated:: 0.23.0
               This argument will be removed in a future version, replaced
               by result_type='broadcast'.

        reduce : boolean or None, default None
            Try to apply reduction procedures. If the DataFrame is empty,
            apply will use reduce to determine whether the result should be
            a Series or a DataFrame. If reduce is None (the default),
            apply's return value will be guessed by calling func an empty
            Series (note: while guessing, exceptions raised by func will be
            ignored). If reduce is True a Series will always be returned,
            and if False a DataFrame will always be returned.

            .. deprecated:: 0.23.0
               This argument will be removed in a future version, replaced
               by result_type='reduce'.

        result_type : {'expand', 'reduce', 'broadcast', None}
            These only act when axis=1 {columns}:

            * 'expand' : list-like results will be turned into columns.
            * 'reduce' : return a Series if possible rather than expanding
              list-like results. This is the opposite to 'expand'.
            * 'broadcast' : results will be broadcast to the original shape
              of the frame, the original index & columns will be retained.

            The default behaviour (None) depends on the return value of the
            applied function: list-like results will be returned as a
            Series of those. However if the apply function returns a Series
            these are expanded to columns.

            .. versionadded:: 0.23.0

        Returns
        -------
        applied : Series or SparseDataFrame
        """
        if not len(self.columns):
            return self
        axis = self._get_axis_number(axis)

        if isinstance(func, np.ufunc):
            # fast path for ufuncs: apply to each column's sparse values
            # and to its fill_value, preserving sparsity
            new_series = {}
            for k, v in compat.iteritems(self):
                applied = func(v)
                applied.fill_value = func(v.fill_value)
                new_series[k] = applied
            return self._constructor(
                new_series, index=self.index, columns=self.columns,
                default_fill_value=self._default_fill_value,
                default_kind=self._default_kind).__finalize__(self)

        from pandas.core.apply import frame_apply
        op = frame_apply(self,
                         func=func,
                         axis=axis,
                         reduce=reduce,
                         broadcast=broadcast,
                         result_type=result_type)
        return op.get_result()
  751. def applymap(self, func):
  752. """
  753. Apply a function to a DataFrame that is intended to operate
  754. elementwise, i.e. like doing map(func, series) for each series in the
  755. DataFrame
  756. Parameters
  757. ----------
  758. func : function
  759. Python function, returns a single value from a single value
  760. Returns
  761. -------
  762. applied : DataFrame
  763. """
  764. return self.apply(lambda x: lmap(func, x))
  765. def to_manager(sdf, columns, index):
  766. """ create and return the block manager from a dataframe of series,
  767. columns, index
  768. """
  769. # from BlockManager perspective
  770. axes = [ensure_index(columns), ensure_index(index)]
  771. return create_block_manager_from_arrays(
  772. [sdf[c] for c in columns], columns, axes)
  773. def stack_sparse_frame(frame):
  774. """
  775. Only makes sense when fill_value is NaN
  776. """
  777. lengths = [s.sp_index.npoints for _, s in compat.iteritems(frame)]
  778. nobs = sum(lengths)
  779. # this is pretty fast
  780. minor_codes = np.repeat(np.arange(len(frame.columns)), lengths)
  781. inds_to_concat = []
  782. vals_to_concat = []
  783. # TODO: Figure out whether this can be reached.
  784. # I think this currently can't be reached because you can't build a
  785. # SparseDataFrame with a non-np.NaN fill value (fails earlier).
  786. for _, series in compat.iteritems(frame):
  787. if not np.isnan(series.fill_value):
  788. raise TypeError('This routine assumes NaN fill value')
  789. int_index = series.sp_index.to_int_index()
  790. inds_to_concat.append(int_index.indices)
  791. vals_to_concat.append(series.sp_values)
  792. major_codes = np.concatenate(inds_to_concat)
  793. stacked_values = np.concatenate(vals_to_concat)
  794. index = MultiIndex(levels=[frame.index, frame.columns],
  795. codes=[major_codes, minor_codes],
  796. verify_integrity=False)
  797. lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index,
  798. columns=['foo'])
  799. return lp.sort_index(level=0)
  800. def homogenize(series_dict):
  801. """
  802. Conform a set of SparseSeries (with NaN fill_value) to a common SparseIndex
  803. corresponding to the locations where they all have data
  804. Parameters
  805. ----------
  806. series_dict : dict or DataFrame
  807. Notes
  808. -----
  809. Using the dumbest algorithm I could think of. Should put some more thought
  810. into this
  811. Returns
  812. -------
  813. homogenized : dict of SparseSeries
  814. """
  815. index = None
  816. need_reindex = False
  817. for _, series in compat.iteritems(series_dict):
  818. if not np.isnan(series.fill_value):
  819. raise TypeError('this method is only valid with NaN fill values')
  820. if index is None:
  821. index = series.sp_index
  822. elif not series.sp_index.equals(index):
  823. need_reindex = True
  824. index = index.intersect(series.sp_index)
  825. if need_reindex:
  826. output = {}
  827. for name, series in compat.iteritems(series_dict):
  828. if not series.sp_index.equals(index):
  829. series = series.sparse_reindex(index)
  830. output[name] = series
  831. else:
  832. output = series_dict
  833. return output
# use unaccelerated ops for sparse objects
# NOTE(review): these module-level calls attach the flex (add/sub/...) and
# special (__add__/__sub__/...) arithmetic methods to SparseDataFrame at
# import time.
ops.add_flex_arithmetic_methods(SparseDataFrame)
ops.add_special_arithmetic_methods(SparseDataFrame)