test_frame.py 53 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369
  1. # pylint: disable-msg=E1101,W0612
  2. import operator
  3. import numpy as np
  4. from numpy import nan
  5. import pytest
  6. from pandas._libs.sparse import BlockIndex, IntIndex
  7. from pandas.compat import lrange
  8. from pandas.errors import PerformanceWarning
  9. import pandas as pd
  10. from pandas import DataFrame, Panel, Series, bdate_range, compat
  11. from pandas.core.indexes.datetimes import DatetimeIndex
  12. from pandas.core.sparse import frame as spf
  13. from pandas.core.sparse.api import (
  14. SparseArray, SparseDataFrame, SparseDtype, SparseSeries)
  15. from pandas.tests.frame.test_api import SharedWithSparse
  16. from pandas.util import testing as tm
  17. from pandas.tseries.offsets import BDay
  18. class TestSparseDataFrame(SharedWithSparse):
  19. klass = SparseDataFrame
  20. # SharedWithSparse tests use generic, klass-agnostic assertion
  21. _assert_frame_equal = staticmethod(tm.assert_sp_frame_equal)
  22. _assert_series_equal = staticmethod(tm.assert_sp_series_equal)
  def test_iterrows(self, float_frame, float_string_frame):
      """Compare every row yielded by ``iterrows`` with its ``.loc`` lookup.

      Same as the parent test, but the sparse kind is not required to
      match (``check_kind=False``).
      """
      for frame in (float_frame, float_string_frame):
          for label, row in frame.iterrows():
              expected = frame.loc[label]
              tm.assert_sp_series_equal(row, expected, check_kind=False)
  def test_itertuples(self, float_frame):
      """Rebuild each row from its ``itertuples`` tuple and compare to iloc."""
      for position, row_tuple in enumerate(float_frame.itertuples()):
          # First tuple element is the row label, the rest are the values.
          rebuilt = self.klass._constructor_sliced(row_tuple[1:])
          rebuilt.name = row_tuple[0]
          expected = float_frame.iloc[position, :].reset_index(drop=True)
          tm.assert_sp_series_equal(rebuilt, expected, check_kind=False)
  def test_fill_value_when_combine_const(self):
      """``add(2, fill_value=0)`` must equal ``fillna(0).add(2)``."""
      # GH12723
      values = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
      frame = SparseDataFrame({'foo': values}, index=range(6))

      expected = frame.fillna(0).add(2)
      result = frame.add(2, fill_value=0)
      tm.assert_sp_frame_equal(result, expected)
  def test_values(self, empty_frame, float_frame):
      """``.values`` has the expected shape when rows or columns are absent."""
      assert empty_frame.values.shape == (0, 0)

      index_only = SparseDataFrame(index=np.arange(10))
      assert index_only.values.shape == (10, 0)

      columns_only = SparseDataFrame(columns=np.arange(10))
      assert columns_only.values.shape == (0, 10)
  def test_copy(self, float_frame):
      """A copy is a ``SparseDataFrame`` equal to the source frame."""
      duplicate = float_frame.copy()
      assert isinstance(duplicate, SparseDataFrame)
      tm.assert_sp_frame_equal(duplicate, float_frame)

      # as of v0.15.0
      # the index is now identical (but not is_a)
      assert duplicate.index.identical(float_frame.index)
  def test_constructor(self, float_frame, float_frame_int_kind,
                       float_frame_fill0):
      """Exercise the main ``SparseDataFrame`` construction paths.

      Checks the fixture frames, a frame built from labels only, a frame
      built from a nested dict, and a frame built from another frame with
      a shortened index.
      """
      for _, column in compat.iteritems(float_frame):
          assert isinstance(column, SparseSeries)

      assert isinstance(float_frame_int_kind['A'].sp_index, IntIndex)

      # constructed zframe from matrix above
      assert float_frame_fill0['A'].fill_value == 0
      # XXX: changed asarray
      expected_sparse = pd.SparseArray(
          [0, 0, 0, 0, 1., 2., 3., 4., 5., 6.],
          fill_value=0, kind='block')
      tm.assert_sp_array_equal(expected_sparse,
                               float_frame_fill0['A'].values)
      expected_dense = np.array([0., 0., 0., 0., 1., 2.,
                                 3., 4., 5., 6.])
      tm.assert_numpy_array_equal(
          expected_dense, float_frame_fill0['A'].to_dense().values)

      # construct no data
      labels_only = SparseDataFrame(columns=np.arange(10),
                                    index=np.arange(10))
      for _, column in compat.iteritems(labels_only):
          assert isinstance(column, SparseSeries)

      # construct from nested dict
      nested = {name: column.to_dict()
                for name, column in compat.iteritems(float_frame)}
      from_nested = SparseDataFrame(nested)
      tm.assert_sp_frame_equal(from_nested, float_frame)

      # TODO: test data is copied from inputs

      # init dict with different index
      short_index = float_frame.index[:5]
      rebuilt = SparseDataFrame(
          float_frame, index=short_index, columns=float_frame.columns,
          default_fill_value=float_frame.default_fill_value,
          default_kind=float_frame.default_kind, copy=True)
      reindexed = float_frame.reindex(short_index)
      tm.assert_sp_frame_equal(rebuilt, reindexed, exact_indices=False)

      # assert level parameter breaks reindex
      with pytest.raises(TypeError):
          float_frame.reindex(short_index, level=0)

      # smoke test: repr must not raise
      repr(float_frame)
  def test_constructor_dict_order(self):
      """Column order of a dict-built frame follows the documented rule."""
      # GH19018
      # initialization ordering: by insertion order if python>= 3.6, else
      # order by value
      data = {'b': [2, 3], 'a': [0, 1]}
      result = SparseDataFrame(data=data)

      column_order = list('ba') if compat.PY36 else list('ab')
      expected = SparseDataFrame(data=data, columns=column_order)
      tm.assert_sp_frame_equal(result, expected)
  def test_constructor_ndarray(self, float_frame):
      """Build from 2-D and 1-D ndarrays; wrong label lengths must raise."""
      # no index or columns
      sp = SparseDataFrame(float_frame.values)

      # 1d
      sp = SparseDataFrame(float_frame['A'].values,
                           index=float_frame.index, columns=['A'])
      expected = float_frame.reindex(columns=['A'])
      tm.assert_sp_frame_equal(sp, expected)

      # raise on level argument
      with pytest.raises(TypeError):
          float_frame.reindex(columns=['A'], level=1)

      # wrong length index / columns
      with pytest.raises(ValueError, match="^Index length"):
          SparseDataFrame(float_frame.values,
                          index=float_frame.index[:-1])

      with pytest.raises(ValueError, match="^Column length"):
          SparseDataFrame(float_frame.values,
                          columns=float_frame.columns[:-1])
  # GH 9272
  def test_constructor_empty(self):
      """A frame built with no arguments has no index and no columns."""
      empty = SparseDataFrame()
      for axis in (empty.index, empty.columns):
          assert len(axis) == 0
  def test_constructor_dataframe(self, float_frame):
      """Constructing from the densified frame reproduces the sparse one."""
      rebuilt = SparseDataFrame(float_frame.to_dense())
      tm.assert_sp_frame_equal(rebuilt, float_frame)
  def test_constructor_convert_index_once(self):
      """Columns of one frame share a single index object."""
      labels = np.array([1.5, 2.5, 3.5])
      sdf = SparseDataFrame(columns=lrange(4), index=labels)

      first_index = sdf[0].index
      second_index = sdf[1].index
      assert first_index is second_index
  def test_constructor_from_series(self):
      """Construct a ``SparseDataFrame`` from a single ``SparseSeries``.

      The second half only builds sparse series and asserts nothing; the
      frame constructions it was meant to cover are left commented out
      because, per the original notes, they currently fail.
      """
      # GH 2873
      x = Series(np.random.randn(10000), name='a')
      x = x.to_sparse(fill_value=0)
      assert isinstance(x, SparseSeries)
      df = SparseDataFrame(x)
      assert isinstance(df, SparseDataFrame)

      x = Series(np.random.randn(10000), name='a')
      y = Series(np.random.randn(10000), name='b')
      x2 = x.astype(float)
      # Use np.nan (as elsewhere in this file); the np.NaN alias is gone
      # from NumPy 2.0.
      x2.loc[:9998] = np.nan
      # TODO: x_sparse is unused...fix
      x_sparse = x2.to_sparse(fill_value=np.nan)  # noqa

      # Currently fails too with weird ufunc error
      # df1 = SparseDataFrame([x_sparse, y])

      y.loc[:9998] = 0
      # TODO: y_sparse is unused...fix
      y_sparse = y.to_sparse(fill_value=0)  # noqa
      # without sparse value raises error
      # df2 = SparseDataFrame([x_sparse, y])
  def test_constructor_from_dense_series(self):
      """A dense ``Series`` builds the same frame as ``to_frame().to_sparse()``.

      Checked once for a named series and once for an unnamed one.
      """
      # GH 19393
      for name in ('a', None):
          # name='a': series with name; name=None: series with no name
          dense = Series(np.random.randn(10000), name=name)
          result = SparseDataFrame(dense)
          expected = dense.to_frame().to_sparse()
          tm.assert_sp_frame_equal(result, expected)
  def test_constructor_from_unknown_type(self):
      """An unsupported ``data`` object raises ``TypeError`` naming its type."""
      # GH 19393
      class Unknown(object):
          pass

      expected_message = ('SparseDataFrame called with unknown type '
                          '"Unknown" for data argument')
      with pytest.raises(TypeError, match=expected_message):
          SparseDataFrame(Unknown())
  def test_constructor_preserve_attr(self):
      """dtype and fill_value survive array -> series -> frame construction."""
      # GH 13866
      def _assert_int64_zero_fill(sparse):
          assert sparse.dtype == SparseDtype(np.int64)
          assert sparse.fill_value == 0

      arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0)
      _assert_int64_zero_fill(arr)

      from_array = pd.SparseDataFrame({'x': arr})
      _assert_int64_zero_fill(from_array['x'])

      series = pd.SparseSeries(arr, name='x')
      _assert_int64_zero_fill(series)

      # The series passed directly, then wrapped in a dict.
      for source in (series, {'x': series}):
          from_series = pd.SparseDataFrame(source)
          _assert_int64_zero_fill(from_series['x'])
  def test_constructor_nan_dataframe(self):
      """An all-NaN MultiIndexed frame sparsifies like direct construction."""
      # GH 10079
      trains = np.arange(100)
      thresholds = [10, 20, 30, 40, 50, 60]
      pairs = [(train, threshold)
               for train in trains for threshold in thresholds]
      index = pd.MultiIndex.from_tuples(pairs,
                                        names=['trains', 'thresholds'])

      all_nan = np.empty((len(index), len(trains)))
      all_nan.fill(np.nan)

      dense = pd.DataFrame(all_nan, index=index, columns=trains,
                           dtype=float)
      result = dense.to_sparse()
      expected = pd.SparseDataFrame(all_nan, index=index, columns=trains,
                                    dtype=float)
      tm.assert_sp_frame_equal(result, expected)
  def test_type_coercion_at_construction(self):
      """A frame-level ``dtype`` matches per-column ``uint8`` series."""
      # GH 15682
      def _uint8(values):
          return pd.SparseSeries(values, dtype='uint8')

      result = pd.SparseDataFrame(
          {'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]},
          dtype='uint8', default_fill_value=0)
      expected = pd.SparseDataFrame(
          {'a': _uint8([1, 0, 0]),
           'b': _uint8([0, 1, 0]),
           'c': _uint8([0, 0, 1])},
          default_fill_value=0)
      tm.assert_sp_frame_equal(result, expected)
  def test_dtypes(self):
      """``get_dtype_counts`` reports four ``Sparse[float64, nan]`` columns."""
      dense = DataFrame(np.random.randn(10000, 4))
      dense.loc[:9998] = np.nan

      counts = dense.to_sparse().get_dtype_counts()
      tm.assert_series_equal(counts, Series({'Sparse[float64, nan]': 4}))
  def test_shape(self, float_frame, float_frame_int_kind,
                 float_frame_fill0, float_frame_fill2):
      """Every fixture frame reports a shape of ``(10, 4)``."""
      # see gh-10452
      fixtures = (float_frame, float_frame_int_kind,
                  float_frame_fill0, float_frame_fill2)
      for frame in fixtures:
          assert frame.shape == (10, 4)
  def test_str(self):
      """Smoke test: ``str`` of a large, mostly-NaN sparse frame works."""
      dense = DataFrame(np.random.randn(10000, 4))
      dense.loc[:9998] = np.nan
      str(dense.to_sparse())
  def test_array_interface(self, float_frame):
      """``np.sqrt`` on the sparse frame agrees with the dense computation."""
      sparse_sqrt = np.sqrt(float_frame)
      dense_sqrt = np.sqrt(float_frame.to_dense())
      tm.assert_frame_equal(sparse_sqrt.to_dense(), dense_sqrt)
  def test_pickle(self, float_frame, float_frame_int_kind, float_frame_dense,
                  float_frame_fill0, float_frame_fill0_dense,
                  float_frame_fill2, float_frame_fill2_dense):
      """Pickle round-trips keep the sparse frame and its dense form."""
      cases = [
          (SparseDataFrame(), DataFrame()),
          (float_frame, float_frame_dense),
          (float_frame_int_kind, float_frame_dense),
          (float_frame_fill0, float_frame_fill0_dense),
          (float_frame_fill2, float_frame_fill2_dense),
      ]
      for sparse, dense in cases:
          restored = tm.round_trip_pickle(sparse)
          tm.assert_sp_frame_equal(sparse, restored)
          tm.assert_frame_equal(restored.to_dense(), dense,
                                check_dtype=False)
  def test_dense_to_sparse(self):
      """``DataFrame.to_sparse`` honours ``kind`` and ``fill_value``."""
      nan_dense = DataFrame({'A': [nan, nan, nan, 1, 2],
                             'B': [1, 2, nan, nan, nan]})

      nan_sparse = nan_dense.to_sparse()
      assert isinstance(nan_sparse, SparseDataFrame)
      assert np.isnan(nan_sparse.default_fill_value)
      assert isinstance(nan_sparse['A'].sp_index, BlockIndex)
      tm.assert_frame_equal(nan_sparse.to_dense(), nan_dense)

      int_kind = nan_dense.to_sparse(kind='integer')
      assert isinstance(int_kind['A'].sp_index, IntIndex)

      zero_dense = DataFrame({'A': [0, 0, 0, 1, 2],
                              'B': [1, 2, 0, 0, 0]}, dtype=float)
      zero_sparse = zero_dense.to_sparse(fill_value=0)
      assert zero_sparse.default_fill_value == 0
      tm.assert_frame_equal(zero_sparse.to_dense(), zero_dense)
  def test_density(self):
      """``density`` is 0.7 for the sample series and 0.75 for the frame."""
      series = SparseSeries([nan, nan, nan, 0, 1, 2, 3, 4, 5, 6])
      assert series.density == 0.7

      columns = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
                 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
                 'C': np.arange(10),
                 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}
      frame = SparseDataFrame(columns)
      assert frame.density == 0.75
  def test_sparse_to_dense(self):
      """Placeholder: no assertions are implemented for this test."""
  def test_sparse_series_ops(self, float_frame):
      """Run the shared arithmetic checks on ``float_frame``."""
      check = self._check_frame_ops
      check(float_frame)
  def test_sparse_series_ops_i(self, float_frame_int_kind):
      """Run the shared arithmetic checks on ``float_frame_int_kind``."""
      check = self._check_frame_ops
      check(float_frame_int_kind)
  def test_sparse_series_ops_z(self, float_frame_fill0):
      """Run the shared arithmetic checks on ``float_frame_fill0``."""
      check = self._check_frame_ops
      check(float_frame_fill0)
  def test_sparse_series_ops_fill(self, float_frame_fill2):
      """Run the shared arithmetic checks on ``float_frame_fill2``."""
      check = self._check_frame_ops
      check(float_frame_fill2)
  def _check_frame_ops(self, frame):
      """Compare arithmetic on ``frame`` with the same arithmetic done densely.

      For add/sub/mul/truediv/floordiv the sparse result must equal the
      dense result re-sparsified with the sparse result's default fill
      value. Operands are a sliced frame, several column ("time series")
      operands aligned on the index, and several row ("cross-sectional")
      operands on either side of the operator.
      """
      def _compare_to_dense(a, b, da, db, op):
          sparse_result = op(a, b)
          expected = op(da, db).to_sparse(
              fill_value=sparse_result.default_fill_value)
          tm.assert_sp_frame_equal(sparse_result, expected,
                                   exact_indices=False)

          if not (isinstance(a, DataFrame) and isinstance(db, DataFrame)):
              return
          # Sparse left operand with a dense right operand stays sparse.
          mixed_result = op(a, db)
          assert isinstance(mixed_result, SparseDataFrame)
          tm.assert_sp_frame_equal(mixed_result, sparse_result,
                                   exact_indices=False)

      opnames = ['add', 'sub', 'mul', 'truediv', 'floordiv']
      ops = [getattr(operator, name) for name in opnames]

      fidx = frame.index

      # time series operations
      column_operands = [
          frame['A'], frame['B'], frame['C'], frame['D'],
          frame['A'].reindex(fidx[:7]), frame['A'].reindex(fidx[::2]),
          SparseSeries([], index=[]),
      ]

      for opname, op in zip(opnames, ops):
          _compare_to_dense(frame, frame[::2], frame.to_dense(),
                            frame[::2].to_dense(), op)

          # 2304, no auto-broadcasting
          def _index_aligned(left, right, _name=opname):
              return getattr(left, _name)(right, axis='index')

          for operand in column_operands:
              _compare_to_dense(frame, operand, frame.to_dense(),
                                operand.to_dense(), _index_aligned)

              # rops are not implemented
              # _compare_to_dense(operand, frame, operand.to_dense(),
              #                   frame.to_dense(), _index_aligned)

      # cross-sectional operations
      row_operands = [frame.xs(fidx[0]), frame.xs(fidx[3]),
                      frame.xs(fidx[5]), frame.xs(fidx[7]),
                      frame.xs(fidx[5])[:2]]

      for op in ops:
          for operand in row_operands:
              _compare_to_dense(frame, operand, frame.to_dense(),
                                operand, op)
              _compare_to_dense(operand, frame, operand,
                                frame.to_dense(), op)

      # it works!
      frame + frame.loc[:, ['A', 'B']]  # noqa
  def test_op_corners(self, float_frame, empty_frame):
      """Adding an empty frame gives an empty or all-NaN result."""
      both_empty = empty_frame + empty_frame
      assert both_empty.empty

      all_nan = float_frame * np.nan
      empty_on_right = float_frame + empty_frame
      assert isinstance(empty_on_right.index, DatetimeIndex)
      tm.assert_frame_equal(empty_on_right, all_nan)

      empty_on_left = empty_frame + float_frame
      tm.assert_frame_equal(empty_on_left, float_frame * np.nan)
  def test_scalar_ops(self):
      """Placeholder: no assertions are implemented for this test."""
  def test_getitem(self):
      """Selecting a list of columns equals reindexing on those columns."""
      # 1585 select multiple columns
      sdf = SparseDataFrame(index=[0, 1, 2], columns=['a', 'b', 'c'])

      selected = sdf[['a', 'b']]
      expected = sdf.reindex(columns=['a', 'b'])
      tm.assert_sp_frame_equal(selected, expected)

      # 'd' is not a column of sdf
      with pytest.raises(Exception):
          sdf[['a', 'd']]
  def test_iloc(self, float_frame):
      """``iloc`` column selection returns a matching ``SparseSeries``."""
      # GH 2227
      first_column = float_frame.iloc[:, 0]
      assert isinstance(first_column, SparseSeries)
      tm.assert_sp_series_equal(first_column, float_frame['A'])

      # preserve sparse index type. #2251
      int_frame = SparseDataFrame({'A': [0, 1]}, default_kind='integer')
      tm.assert_class_equal(int_frame['A'].sp_index,
                            int_frame.iloc[:, 0].sp_index)
  def test_set_value(self, float_frame):
      """``set_value``/``get_value`` warn and return new, enlarged frames."""
      def _future_warning():
          return tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False)

      # ok, as the index gets converted to object
      frame = float_frame.copy()
      with _future_warning():
          res = frame.set_value('foobar', 'B', 1.5)
      assert res.index.dtype == 'object'

      # Convert the fixture's own index to object before enlarging it.
      float_frame.index = float_frame.index.astype(object)

      with _future_warning():
          res = float_frame.set_value('foobar', 'B', 1.5)
      assert res is not float_frame
      assert res.index[-1] == 'foobar'
      with _future_warning():
          assert res.get_value('foobar', 'B') == 1.5

      with _future_warning():
          res2 = res.set_value('foobar', 'qux', 1.5)
      assert res2 is not res
      expected_columns = pd.Index(list(float_frame.columns) + ['qux'])
      tm.assert_index_equal(res2.columns, expected_columns)
      with _future_warning():
          assert res2.get_value('foobar', 'qux') == 1.5
  def test_fancy_index_misc(self, float_frame):
      """Negative ``iloc`` slices on either axis match a ``reindex``."""
      # axis = 0
      last_rows = float_frame.iloc[-2:, :]
      tm.assert_sp_frame_equal(
          last_rows, float_frame.reindex(index=float_frame.index[-2:]))

      # axis = 1
      last_columns = float_frame.iloc[:, -2:]
      tm.assert_sp_frame_equal(
          last_columns,
          float_frame.reindex(columns=float_frame.columns[-2:]))
  def test_getitem_overload(self, float_frame):
      """``[]`` accepts row slices and boolean masks of the right length."""
      # slicing
      head = float_frame[:20]
      expected = float_frame.reindex(float_frame.index[:20])
      tm.assert_sp_frame_equal(head, expected)

      # boolean indexing
      threshold = float_frame.index[5]
      mask = float_frame.index > threshold
      expected_index = float_frame.index[mask]
      tm.assert_index_equal(expected_index, float_frame[mask].index)

      # A mask one element too short is rejected.
      short_mask = mask[:-1]
      with pytest.raises(Exception):
          float_frame[short_mask]
  def test_setitem(self, float_frame, float_frame_int_kind,
                   float_frame_dense,
                   float_frame_fill0, float_frame_fill0_dense,
                   float_frame_fill2, float_frame_fill2_dense):
      """Column assignment from sparse/dense series, ndarrays and scalars.

      The same sequence of insertions is run against each sparse fixture
      frame; the dense counterpart is passed along but not used.
      """
      def _check_frame(frame, orig):
          n_rows = len(frame)

          # insert SparseSeries
          frame['E'] = frame['A']
          assert isinstance(frame['E'], SparseSeries)
          tm.assert_sp_series_equal(frame['E'], frame['A'],
                                    check_names=False)

          # insert SparseSeries differently-indexed
          every_other = frame['A'][::2]
          frame['E'] = every_other
          expected = every_other.to_dense().reindex(frame.index)
          result = frame['E'].to_dense()
          tm.assert_series_equal(result, expected, check_names=False)
          assert result.name == 'E'

          # insert Series
          frame['F'] = frame['A'].to_dense()
          assert isinstance(frame['F'], SparseSeries)
          tm.assert_sp_series_equal(frame['F'], frame['A'],
                                    check_names=False)

          # insert Series differently-indexed
          every_other = frame['A'].to_dense()[::2]
          frame['G'] = every_other
          expected = every_other.reindex(frame.index)
          expected.name = 'G'
          tm.assert_series_equal(frame['G'].to_dense(), expected)

          # insert ndarray
          frame['H'] = np.random.randn(n_rows)
          assert isinstance(frame['H'], SparseSeries)

          # Second half set to the fill value, so only half is stored.
          half_filled = np.random.randn(n_rows)
          half_filled[n_rows // 2:] = frame.default_fill_value
          frame['I'] = half_filled
          assert len(frame['I'].sp_values) == n_rows // 2

          # insert ndarray wrong size
          too_short = np.random.randn(n_rows - 1)
          with pytest.raises(Exception):
              frame['foo'] = too_short

          # scalar value
          frame['J'] = 5
          assert len(frame['J'].sp_values) == n_rows
          assert (frame['J'].sp_values == 5).all()

          frame['K'] = frame.default_fill_value
          assert len(frame['K'].sp_values) == 0

      cases = [
          (float_frame, float_frame_dense),
          (float_frame_int_kind, float_frame_dense),
          (float_frame_fill0, float_frame_fill0_dense),
          (float_frame_fill2, float_frame_fill2_dense),
      ]
      for sparse, dense in cases:
          _check_frame(sparse, dense)
  @pytest.mark.parametrize('values', [
      [True, False],
      [0, 1],
      [1, None],
      ['a', 'b'],
      [pd.Timestamp('2017'), pd.NaT],
      [pd.Timedelta('10s'), pd.NaT],
  ])
  def test_setitem_more(self, values):
      """Assigning a ``SparseArray`` column equals constructing with it."""
      frame = pd.DataFrame({"A": values})
      frame['A'] = pd.SparseArray(values)

      expected = pd.DataFrame({'A': pd.SparseArray(values)})
      tm.assert_frame_equal(frame, expected)
  def test_setitem_corner(self, float_frame):
      """Assigning column 'B' under the new name 'a' keeps its contents."""
      float_frame['a'] = float_frame['B']
      copied, source = float_frame['a'], float_frame['B']
      tm.assert_sp_series_equal(copied, source, check_names=False)
  def test_setitem_array(self, float_frame):
      """Assign a full and a truncated sparse column and compare them."""
      column_b = float_frame['B']

      float_frame['E'] = column_b
      tm.assert_sp_series_equal(float_frame['E'], float_frame['B'],
                                check_names=False)

      # Drop the last element; compare only on the shared labels.
      float_frame['F'] = column_b[:-1]
      shared_index = float_frame.index[:-1]
      full = float_frame['E'].reindex(shared_index)
      truncated = float_frame['F'].reindex(shared_index)
      tm.assert_sp_series_equal(full, truncated, check_names=False)
  def test_setitem_chained_no_consolidate(self):
      """A chained setitem leaves the frame with two internal blocks."""
      # https://github.com/pandas-dev/pandas/pull/19268
      # issuecomment-361696418
      # chained setitem used to cause consolidation
      sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]])
      # Disable the chained-assignment check for this one statement.
      with pd.option_context('mode.chained_assignment', None):
          sdf[0][1] = 2
      block_count = len(sdf._data.blocks)
      assert block_count == 2
  def test_delitem(self, float_frame):
      """``del frame[col]`` removes that column and leaves others intact."""
      column_a = float_frame['A']
      column_c = float_frame['C']

      del float_frame['B']
      assert 'B' not in float_frame
      tm.assert_sp_series_equal(float_frame['A'], column_a)
      tm.assert_sp_series_equal(float_frame['C'], column_c)

      for name in ('D', 'A'):
          del float_frame[name]
          assert name not in float_frame
  def test_set_columns(self, float_frame):
      """Same-length columns are accepted; a shorter set raises."""
      float_frame.columns = float_frame.columns

      too_few = float_frame.columns[:-1]
      with pytest.raises(Exception):
          float_frame.columns = too_few
  def test_set_index(self, float_frame):
      """A same-length index is accepted; a shorter one raises."""
      float_frame.index = float_frame.index

      too_few = float_frame.index[:-1]
      with pytest.raises(Exception):
          float_frame.index = too_few
  def test_ctor_reindex(self):
      """Two values with a four-label index must raise ``ValueError``."""
      idx = pd.Index([0, 1, 2, 3])
      # The previous ``match=''`` was vacuous: an empty pattern matches any
      # message, so it is dropped rather than implying a message check.
      # NOTE(review): newer pytest reportedly warns on an empty ``match``
      # -- confirm; add a real pattern once the message is pinned down.
      with pytest.raises(ValueError):
          pd.SparseDataFrame({"A": [1, 2]}, index=idx)
  def test_append(self, float_frame):
      """``append`` for matching and mismatched columns, sorted or not."""
      top = float_frame[:5]
      bottom = float_frame[5:]
      tm.assert_sp_frame_equal(top.append(bottom), float_frame,
                               exact_indices=False)

      top = float_frame.iloc[:5, :3]
      bottom = float_frame.iloc[5:]
      with tm.assert_produces_warning(FutureWarning,
                                      check_stacklevel=False):
          # Stacklevel is set for pd.concat, not append
          appended = top.append(bottom)
      tm.assert_sp_frame_equal(appended.iloc[:, :3],
                               float_frame.iloc[:, :3],
                               exact_indices=False)

      top = top[['B', 'C', 'A']].head(2)
      bottom = bottom.head(2)

      expected_data = {
          "B": [0., 1, None, 3],
          "C": [0., 1, 5, 6],
          "A": [None, None, 2, 3],
          "D": [None, None, 5, None],
      }
      expected = pd.SparseDataFrame(expected_data,
                                    index=top.index | bottom.index,
                                    columns=['B', 'C', 'A', 'D'])

      # An explicit ``sort`` argument must not produce a warning.
      with tm.assert_produces_warning(None):
          unsorted = top.append(bottom, sort=False)
      tm.assert_frame_equal(unsorted, expected)

      with tm.assert_produces_warning(None):
          sorted_result = top.append(bottom, sort=True)
      tm.assert_sp_frame_equal(sorted_result,
                               expected[['A', 'B', 'C', 'D']],
                               consolidate_block_indices=True,
                               check_kind=False)
  def test_astype(self):
      """``astype`` keeps fill_value for plain dtypes, updates for sparse."""
      def _expected_floats(fill_value):
          columns = {
              'A': SparseArray([1., 2., 3., 4.], fill_value=fill_value,
                               kind='integer'),
              'B': SparseArray([4., 5., 6., 7.], fill_value=fill_value,
                               kind='integer'),
          }
          return pd.SparseDataFrame(columns, default_fill_value=np.nan)

      sparse = pd.SparseDataFrame(
          {'A': SparseArray([1, 2, 3, 4], dtype=np.int64),
           'B': SparseArray([4, 5, 6, 7], dtype=np.int64)})
      for name in ('A', 'B'):
          assert sparse[name].dtype == SparseDtype(np.int64)

      # retain fill_value
      res = sparse.astype(np.float64)
      tm.assert_sp_frame_equal(res, _expected_floats(0))
      for name in ('A', 'B'):
          assert res[name].dtype == SparseDtype(np.float64, 0)

      # update fill_value
      res = sparse.astype(SparseDtype(np.float64, np.nan))
      tm.assert_sp_frame_equal(res, _expected_floats(np.nan))
      for name in ('A', 'B'):
          assert res[name].dtype == SparseDtype(np.float64, np.nan)
  def test_astype_bool(self):
      """Cast a zero-filled int64 sparse frame to ``SparseDtype(bool, False)``.

      Uses the builtin ``bool`` throughout: ``np.bool`` was only an alias
      for it, deprecated in NumPy 1.20 and later removed.
      """
      sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4],
                                                    fill_value=0,
                                                    dtype=np.int64),
                                   'B': SparseArray([0, 5, 0, 7],
                                                    fill_value=0,
                                                    dtype=np.int64)},
                                  default_fill_value=0)
      assert sparse['A'].dtype == SparseDtype(np.int64)
      assert sparse['B'].dtype == SparseDtype(np.int64)

      res = sparse.astype(SparseDtype(bool, False))
      exp = pd.SparseDataFrame({'A': SparseArray([False, True, False, True],
                                                 dtype=bool,
                                                 fill_value=False,
                                                 kind='integer'),
                                'B': SparseArray([False, True, False, True],
                                                 dtype=bool,
                                                 fill_value=False,
                                                 kind='integer')},
                               default_fill_value=False)
      tm.assert_sp_frame_equal(res, exp)
      assert res['A'].dtype == SparseDtype(bool)
      assert res['B'].dtype == SparseDtype(bool)
  def test_astype_object(self):
      """``astype(object)`` yields ``SparseDtype(object, 0)`` columns."""
      # This may change in GH-23125
      source = pd.DataFrame({"A": SparseArray([0, 1]),
                             "B": SparseArray([0, 1])})
      result = source.astype(object)

      object_dtype = SparseDtype(object, 0)
      expected_columns = {
          "A": SparseArray([0, 1], dtype=object_dtype),
          "B": SparseArray([0, 1], dtype=object_dtype),
      }
      tm.assert_frame_equal(result, pd.DataFrame(expected_columns))
  def test_fillna(self, float_frame_fill0, float_frame_fill0_dense):
      """``fillna(0)`` on a sparse frame and series matches the dense result.

      Covers the returning form, the in-place form on a frame copy, and
      the in-place form on column 'A'.
      """
      df = float_frame_fill0.reindex(lrange(5))
      dense = float_frame_fill0_dense.reindex(lrange(5))

      result = df.fillna(0)
      expected = dense.fillna(0)
      tm.assert_sp_frame_equal(result, expected.to_sparse(fill_value=0),
                               exact_indices=False)
      tm.assert_frame_equal(result.to_dense(), expected)

      result = df.copy()
      result.fillna(0, inplace=True)
      expected = dense.fillna(0)

      tm.assert_sp_frame_equal(result, expected.to_sparse(fill_value=0),
                               exact_indices=False)
      tm.assert_frame_equal(result.to_dense(), expected)

      # The column is taken from ``df`` itself; the ``df.copy()`` that used
      # to precede this line was overwritten at once and has been removed.
      result = df['A']
      result.fillna(0, inplace=True)

      expected = dense['A'].fillna(0)
      # this changes internal SparseArray repr
      # tm.assert_sp_series_equal(result, expected.to_sparse(fill_value=0))
      tm.assert_series_equal(result.to_dense(), expected)
  def test_fillna_fill_value(self):
      """``fillna(-1)`` matches dense for default and zero fill values."""
      dense = pd.DataFrame({'A': [1, 0, 0], 'B': [np.nan, np.nan, 4]})
      expected = dense.fillna(-1)

      # First the default fill value, then an explicit fill value of 0.
      for ctor_kwargs in ({}, {'default_fill_value': 0}):
          sparse = pd.SparseDataFrame(dense, **ctor_kwargs)
          tm.assert_frame_equal(sparse.fillna(-1).to_dense(),
                                dense.fillna(-1), check_dtype=False)
  def test_sparse_frame_pad_backfill_limit(self):
      """``reindex(method=..., limit=5)`` fills at most five rows.

      The expected frame is an unlimited fill with the three rows beyond
      the limit reset to NaN.
      """
      index = np.arange(10)
      dense = DataFrame(np.random.randn(10, 4), index=index)
      sdf = dense.to_sparse()

      # (rows kept before reindexing, fill method, rows left unfilled)
      cases = [
          (slice(None, 2), 'pad', slice(-3, None)),
          (slice(-2, None), 'backfill', slice(None, 3)),
      ]
      for kept_rows, method, unfilled_rows in cases:
          result = sdf[kept_rows].reindex(index, method=method, limit=5)

          with tm.assert_produces_warning(PerformanceWarning):
              expected = sdf[kept_rows].reindex(index).fillna(
                  method=method)
          expected = expected.to_dense()
          expected.values[unfilled_rows] = np.nan
          expected = expected.to_sparse()
          tm.assert_frame_equal(result, expected)
  def test_sparse_frame_fillna_limit(self):
      """``fillna(method=..., limit=5)`` for 'pad' and 'backfill'.

      Each ``fillna`` call is expected to emit a PerformanceWarning. The
      expectation is an unlimited fill with the rows past the limit
      blanked to NaN on the dense side.
      """
      index = np.arange(10)
      df = DataFrame(np.random.randn(10, 4), index=index)
      sdf = df.to_sparse()

      # (rows kept before reindexing, fill method, rows blanked to NaN)
      scenarios = [
          (slice(None, 2), 'pad', slice(-3, None)),
          (slice(-2, None), 'backfill', slice(None, 3)),
      ]
      for kept, method, blanked in scenarios:
          result = sdf[kept].reindex(index)
          with tm.assert_produces_warning(PerformanceWarning):
              result = result.fillna(method=method, limit=5)

          with tm.assert_produces_warning(PerformanceWarning):
              unlimited = sdf[kept].reindex(index).fillna(method=method)
          expected = unlimited.to_dense()
          expected.values[blanked] = np.nan
          expected = expected.to_sparse()

          tm.assert_frame_equal(result, expected)
  def test_rename(self, float_frame):
      """Rename the index with ``str`` and the columns with a callable."""
      # index labels mapped through ``str``
      by_index = float_frame.rename(index=str)
      text_labels = float_frame.index.strftime("%Y-%m-%d %H:%M:%S")
      expected = SparseDataFrame(float_frame.values,
                                 index=text_labels,
                                 columns=list('ABCD'))
      tm.assert_sp_frame_equal(by_index, expected)

      # column labels mapped through a formatting lambda
      by_columns = float_frame.rename(columns=lambda x: '%s%d' % (x, 1))
      data = {
          'A1': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
          'B1': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
          'C1': np.arange(10, dtype=np.float64),
          'D1': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan],
      }
      expected = SparseDataFrame(data, index=float_frame.index)
      tm.assert_sp_frame_equal(by_columns, expected)
  def test_corr(self, float_frame):
      """``corr`` on the sparse frame equals the sparsified dense ``corr``."""
      sparse_corr = float_frame.corr()
      dense_corr = float_frame.to_dense().corr()
      # XXX: this stays sparse
      tm.assert_frame_equal(sparse_corr, dense_corr.to_sparse())
  def test_describe(self, float_frame):
      """Smoke test: the calls below run after adding an all-NaN column.

      Nothing is asserted about the returned values.
      """
      float_frame['foo'] = np.nan

      float_frame.get_dtype_counts()
      str(float_frame)
      float_frame.describe()
  def test_join(self, float_frame):
      """Join disjoint column subsets, then check two failing joins."""
      left = float_frame.loc[:, ['A', 'B']]
      disjoint = float_frame.loc[:, ['C', 'D']]
      tm.assert_sp_frame_equal(left.join(disjoint), float_frame,
                               exact_indices=False)

      # 'B' is present on both sides
      overlapping = float_frame.loc[:, ['B', 'D']]
      with pytest.raises(Exception):
          left.join(overlapping)

      with pytest.raises(ValueError, match='Other Series must have a name'):
          unnamed = Series(np.random.randn(len(float_frame)),
                           index=float_frame.index)
          float_frame.join(unnamed)
  def test_reindex(self, float_frame, float_frame_int_kind,
                   float_frame_fill0, float_frame_fill2):
      """Compare sparse ``reindex`` with dense ``reindex`` on four frames.

      Also checks that ``copy=False`` returns an object whose new column
      shows up in the original frame, while the default call does not.
      """

      def _check_frame(frame):
          # Row reindexing, fill-value propagation, zero-length frames
          # and column reindexing for one sparse frame.
          index = frame.index
          sidx = index[::2]

          sparse_result = frame.reindex(sidx)
          dense_result = frame.to_dense().reindex(sidx)
          tm.assert_frame_equal(sparse_result.to_dense(), dense_result)

          tm.assert_frame_equal(frame.reindex(list(sidx)).to_dense(),
                                dense_result)

          sparse_result2 = sparse_result.reindex(index)
          dense_result2 = dense_result.reindex(index)
          tm.assert_frame_equal(sparse_result2.to_dense(), dense_result2)

          # propagate CORRECT fill value
          tm.assert_almost_equal(sparse_result.default_fill_value,
                                 frame.default_fill_value)
          tm.assert_almost_equal(sparse_result['A'].fill_value,
                                 frame['A'].fill_value)

          # length zero
          length_zero = frame.reindex([])
          assert len(length_zero) == 0
          assert len(length_zero.columns) == len(frame.columns)
          assert len(length_zero['A']) == 0

          # frame being reindexed has length zero
          length_n = length_zero.reindex(index)
          assert len(length_n) == len(frame)
          assert len(length_n.columns) == len(frame.columns)
          assert len(length_n['A']) == len(frame)

          # reindex columns
          reindexed = frame.reindex(columns=['A', 'B', 'Z'])
          assert len(reindexed.columns) == 3
          tm.assert_almost_equal(reindexed['Z'].fill_value,
                                 frame.default_fill_value)
          assert np.isnan(reindexed['Z'].sp_values).all()

      _check_frame(float_frame)
      _check_frame(float_frame_int_kind)
      _check_frame(float_frame_fill0)
      _check_frame(float_frame_fill2)

      # with copy=False
      reindexed = float_frame.reindex(float_frame.index, copy=False)
      reindexed['F'] = reindexed['A']
      assert 'F' in float_frame

      reindexed = float_frame.reindex(float_frame.index)
      reindexed['G'] = reindexed['A']
      assert 'G' not in float_frame
  def test_reindex_fill_value(self, float_frame_fill0,
                              float_frame_fill0_dense):
      """``reindex(..., fill_value=0)`` agrees between sparse and dense."""
      rng = bdate_range('20110110', periods=20)

      sparse_reindexed = float_frame_fill0.reindex(rng, fill_value=0)

      dense_reindexed = float_frame_fill0_dense.reindex(rng, fill_value=0)
      expected = dense_reindexed.to_sparse(
          float_frame_fill0.default_fill_value)

      tm.assert_sp_frame_equal(sparse_reindexed, expected)
  def test_reindex_method(self):
      """``reindex`` with the default, 'bfill' and 'ffill' methods.

      Over the index all three methods are compared with literal
      expectations. Over the columns only the default is compared;
      'bfill' and 'ffill' must raise NotImplementedError.
      """
      row1 = [11., 12., 14.]
      row2 = [21., 22., 24.]
      row4 = [41., 42., 44.]
      blank = [nan, nan, nan]

      sparse = SparseDataFrame(data=[row1, row2, row4],
                               index=[1, 2, 4],
                               columns=[1, 2, 4],
                               dtype=float)

      def _rows_expected(rows):
          # expected frame after reindexing the rows onto range(6)
          return SparseDataFrame(data=rows,
                                 index=range(6),
                                 columns=[1, 2, 4],
                                 dtype=float)

      # Over indices

      # default method
      result = sparse.reindex(index=range(6))
      expected = _rows_expected([blank, row1, row2, blank, row4, blank])
      tm.assert_sp_frame_equal(result, expected)

      # method='bfill'
      result = sparse.reindex(index=range(6), method='bfill')
      expected = _rows_expected([row1, row1, row2, row4, row4, blank])
      tm.assert_sp_frame_equal(result, expected)

      # method='ffill'
      result = sparse.reindex(index=range(6), method='ffill')
      expected = _rows_expected([blank, row1, row2, row2, row4, row4])
      tm.assert_sp_frame_equal(result, expected)

      # Over columns

      # default method
      result = sparse.reindex(columns=range(6))
      expected = SparseDataFrame(
          data=[[nan, 11., 12., nan, 14., nan],
                [nan, 21., 22., nan, 24., nan],
                [nan, 41., 42., nan, 44., nan]],
          index=[1, 2, 4],
          columns=range(6),
          dtype=float)
      tm.assert_sp_frame_equal(result, expected)

      # method='bfill', then method='ffill'
      for method in ('bfill', 'ffill'):
          with pytest.raises(NotImplementedError):
              sparse.reindex(columns=range(6), method=method)
  def test_take(self, float_frame):
      """``take`` along axis 1 equals reindexing to columns B, A, C."""
      taken = float_frame.take([1, 0, 2], axis=1)
      reordered = float_frame.reindex(columns=['B', 'A', 'C'])
      tm.assert_sp_frame_equal(taken, reordered)
  def test_to_dense(self, float_frame, float_frame_int_kind,
                    float_frame_dense,
                    float_frame_fill0, float_frame_fill0_dense,
                    float_frame_fill2, float_frame_fill2_dense):
      """``to_dense`` matches both the sparse frame and its dense twin."""

      def _check(frame, orig):
          densified = frame.to_dense()
          # Sparse[float] != float
          tm.assert_frame_equal(frame, densified, check_dtype=False)
          tm.assert_frame_equal(densified, orig, check_dtype=False)

      pairs = [
          (float_frame, float_frame_dense),
          (float_frame_int_kind, float_frame_dense),
          (float_frame_fill0, float_frame_fill0_dense),
          (float_frame_fill2, float_frame_fill2_dense),
      ]
      for sparse, dense in pairs:
          _check(sparse, dense)
  @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
  def test_stack_sparse_frame(self, float_frame, float_frame_int_kind,
                              float_frame_fill0, float_frame_fill2):
      """``spf.stack_sparse_frame`` values equal the Panel ``to_frame`` values.

      The two fill-value frames are currently expected to raise.
      """

      def _check(frame):
          frame.to_dense()  # result is not used

          panel = Panel.from_dict({'foo': frame})
          via_panel = panel.to_frame()
          via_sparse = spf.stack_sparse_frame(frame)

          tm.assert_numpy_array_equal(via_panel.values, via_sparse.values)

      for frame in (float_frame, float_frame_int_kind):
          _check(frame)

      # for now
      for frame in (float_frame_fill0, float_frame_fill2):
          with pytest.raises(Exception):
              _check(frame)
  def test_transpose(self, float_frame, float_frame_int_kind,
                     float_frame_dense,
                     float_frame_fill0, float_frame_fill0_dense,
                     float_frame_fill2, float_frame_fill2_dense):
      """``.T`` round-trips and agrees with the dense transpose."""

      def _check(frame, orig):
          round_trip = frame.T.T
          tm.assert_sp_frame_equal(frame, round_trip)

          tm.assert_frame_equal(frame.T.to_dense(), orig.T)
          tm.assert_frame_equal(frame.T.T.to_dense(), orig.T.T)
          tm.assert_sp_frame_equal(frame, frame.T.T, exact_indices=False)

      pairs = [
          (float_frame, float_frame_dense),
          (float_frame_int_kind, float_frame_dense),
          (float_frame_fill0, float_frame_fill0_dense),
          (float_frame_fill2, float_frame_fill2_dense),
      ]
      for sparse, dense in pairs:
          _check(sparse, dense)
  def test_shift(self, float_frame, float_frame_int_kind, float_frame_dense,
                 float_frame_fill0, float_frame_fill0_dense,
                 float_frame_fill2, float_frame_fill2_dense):
      """``shift`` by periods and by frequency agrees with the dense frame."""

      def _check(frame, orig):
          # period shifts: compare after densifying the sparse result
          for periods in (0, 1, -2):
              shifted = frame.shift(periods)
              exp = orig.shift(periods)
              tm.assert_frame_equal(shifted.to_dense(), exp)

          # frequency shifts: compare against the re-sparsified dense result
          for freq in ('B', BDay()):
              shifted = frame.shift(2, freq=freq)
              exp = orig.shift(2, freq=freq)
              exp = exp.to_sparse(frame.default_fill_value,
                                  kind=frame.default_kind)
              tm.assert_frame_equal(shifted, exp)

      pairs = [
          (float_frame, float_frame_dense),
          (float_frame_int_kind, float_frame_dense),
          (float_frame_fill0, float_frame_fill0_dense),
          (float_frame_fill2, float_frame_fill2_dense),
      ]
      for sparse, dense in pairs:
          _check(sparse, dense)
  def test_count(self, float_frame):
      """``count`` agrees with the dense frame for each axis argument."""
      dense_result = float_frame.to_dense().count()

      # no axis, axis=None and axis=0 are all compared with the same result
      for count_kwargs in ({}, {'axis': None}, {'axis': 0}):
          result = float_frame.count(**count_kwargs)
          tm.assert_series_equal(result.to_dense(), dense_result)

      result = float_frame.count(axis=1)
      dense_result = float_frame.to_dense().count(axis=1)

      # win32 don't check dtype
      tm.assert_series_equal(result, dense_result, check_dtype=False)
  def test_numpy_transpose(self):
      """``np.transpose`` round-trips; passing ``axes`` raises ValueError."""
      sdf = SparseDataFrame([1, 2, 3], index=[1, 2, 3], columns=['a'])

      once = np.transpose(sdf)
      twice = np.transpose(once)
      tm.assert_sp_frame_equal(twice, sdf)

      msg = "the 'axes' parameter is not supported"
      with pytest.raises(ValueError, match=msg):
          np.transpose(sdf, axes=1)
  def test_combine_first(self, float_frame):
      """Sparse ``combine_first`` equals the sparsified dense result."""
      df = float_frame

      sparse_combined = df[::2].combine_first(df)

      dense_combined = df[::2].to_dense().combine_first(df.to_dense())
      expected = dense_combined.to_sparse(fill_value=df.default_fill_value)

      tm.assert_sp_frame_equal(sparse_combined, expected)
  @pytest.mark.xfail(reason="No longer supported.")
  def test_combine_first_with_dense(self, float_frame):
      """``combine_first`` of a sparse frame with a dense frame (xfail).

      The frame is taken from the ``float_frame`` fixture, as the sibling
      ``test_combine_first`` does. No code visible here assigns
      ``self.frame``, so reading it would let the xfail pass on an
      AttributeError instead of on the operation under test.
      """
      # We could support this if we allow
      # pd.core.dtypes.cast.find_common_type to special case SparseDtype
      # but I don't think that's worth it.
      df = float_frame

      result = df[::2].combine_first(df.to_dense())
      expected = df[::2].to_dense().combine_first(df.to_dense())
      expected = expected.to_sparse(fill_value=df.default_fill_value)

      tm.assert_sp_frame_equal(result, expected)
  def test_combine_add(self, float_frame):
      """``add(..., fill_value=0)`` agrees between sparse and dense frames."""
      lhs = float_frame.to_dense()
      rhs = lhs.copy()
      rhs['C'][:3] = np.nan
      lhs['A'][:3] = 5.7

      sparse_sum = lhs.to_sparse().add(rhs.to_sparse(), fill_value=0)
      dense_sum = lhs.add(rhs, fill_value=0)

      tm.assert_sp_frame_equal(sparse_sum, dense_sum.to_sparse())
  def test_isin(self):
      """Selecting with an ``isin`` mask matches selecting with ``==``."""
      dense = DataFrame({'flag': [1., 0., 1.]})
      sparse_df = dense.to_sparse(fill_value=0.)

      by_equality = sparse_df[sparse_df.flag == 1.]
      by_isin = sparse_df[sparse_df.flag.isin([1.])]

      tm.assert_frame_equal(by_equality, by_isin)
  def test_sparse_pow_issue(self):
      """``1 ** df``: column 'A' has the same number of ``sp_values``
      whether read directly or after ``take``."""
      # 2220
      df = SparseDataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]})

      # note : no error without nan
      df = SparseDataFrame({'A': [nan, 0, 1]})

      # note that 2 ** df works fine, also df ** 1
      powered = 1 ** df

      via_take = powered.take([0], 1)['A']
      direct = powered['A']

      assert len(direct.sp_values) == len(via_take.sp_values)
  def test_as_blocks(self):
      """``.blocks`` warns and returns one entry keyed by the sparse dtype."""
      df = SparseDataFrame({'A': [1.1, 3.3], 'B': [nan, -3.9]},
                           dtype='float64')

      # deprecated 0.21.0
      with tm.assert_produces_warning(FutureWarning,
                                      check_stacklevel=False):
          df_blocks = df.blocks

      block_key = 'Sparse[float64, nan]'
      assert list(df_blocks.keys()) == [block_key]
      tm.assert_frame_equal(df_blocks[block_key], df)
  @pytest.mark.xfail(reason='nan column names in _init_dict problematic '
                            '(GH#16894)')
  def test_nan_columnname(self):
      """A NaN column label is still NaN after ``to_sparse`` (xfail)."""
      # GH 8822
      dense = DataFrame(Series(1.0, index=[0]), columns=[nan])
      sparse = dense.to_sparse()
      first_label = sparse.columns[0]
      assert np.isnan(first_label)
  def test_isna(self):
      """``isna`` with the default fill value and with fill value 0."""
      # GH 8276
      nan_filled = pd.SparseDataFrame(
          {'A': [np.nan, np.nan, 1, 2, np.nan],
           'B': [0, np.nan, np.nan, 2, np.nan]})

      res = nan_filled.isna()
      exp = pd.SparseDataFrame(
          {'A': [True, True, False, False, True],
           'B': [False, True, True, False, True]},
          default_fill_value=True)
      exp._default_fill_value = np.nan
      tm.assert_sp_frame_equal(res, exp)

      # if fill_value is not nan, True can be included in sp_values
      zero_filled = pd.SparseDataFrame(
          {'A': [0, 0, 1, 2, np.nan],
           'B': [0, np.nan, 0, 2, np.nan]},
          default_fill_value=0.)

      res = zero_filled.isna()
      assert isinstance(res, pd.SparseDataFrame)
      exp = pd.DataFrame({'A': [False, False, False, False, True],
                          'B': [False, True, False, False, True]})
      tm.assert_frame_equal(res.to_dense(), exp)
  def test_notna(self):
      """``notna`` with the default fill value and with fill value 0."""
      # GH 8276
      nan_filled = pd.SparseDataFrame(
          {'A': [np.nan, np.nan, 1, 2, np.nan],
           'B': [0, np.nan, np.nan, 2, np.nan]})

      res = nan_filled.notna()
      exp = pd.SparseDataFrame(
          {'A': [False, False, True, True, False],
           'B': [True, False, False, True, False]},
          default_fill_value=False)
      exp._default_fill_value = np.nan
      tm.assert_sp_frame_equal(res, exp)

      # if fill_value is not nan, True can be included in sp_values
      zero_filled = pd.SparseDataFrame(
          {'A': [0, 0, 1, 2, np.nan],
           'B': [0, np.nan, 0, 2, np.nan]},
          default_fill_value=0.)

      res = zero_filled.notna()
      assert isinstance(res, pd.SparseDataFrame)
      exp = pd.DataFrame({'A': [True, True, True, True, False],
                          'B': [True, False, True, True, False]})
      tm.assert_frame_equal(res.to_dense(), exp)
  class TestSparseDataFrameArithmetic(object):
      """Scalar arithmetic and comparison on a sparse frame vs. dense."""

      @staticmethod
      def _dense_frame():
          # Fresh four-column dense frame shared by the tests below.
          return pd.DataFrame({'A': [nan, nan, 0, 1, ],
                               'B': [0, 1, 2, nan],
                               'C': [1., 2., 3., 4.],
                               'D': [nan, nan, nan, nan]})

      def test_numeric_op_scalar(self):
          """Adding a scalar gives the sparsified dense result."""
          df = self._dense_frame()
          sparse = df.to_sparse()

          sparse_sum = sparse + 1
          dense_sum = df + 1
          tm.assert_sp_frame_equal(sparse_sum, dense_sum.to_sparse())

      def test_comparison_op_scalar(self):
          """``>`` and ``!=`` against a scalar stay sparse and match dense."""
          # GH 13001
          df = self._dense_frame()
          sparse = df.to_sparse()

          # comparison changes internal repr, compare with dense
          res = sparse > 1
          assert isinstance(res, pd.SparseDataFrame)
          tm.assert_frame_equal(res.to_dense(), df > 1)

          res = sparse != 0
          assert isinstance(res, pd.SparseDataFrame)
          tm.assert_frame_equal(res.to_dense(), df != 0)
  1021. class TestSparseDataFrameAnalytics(object):
  def test_cumsum(self, float_frame):
      """``cumsum`` with no axis, ``axis=None`` and ``axis=0`` all match
      the dense cumulative sum."""
      expected = SparseDataFrame(float_frame.to_dense().cumsum())

      for cumsum_kwargs in ({}, {'axis': None}, {'axis': 0}):
          result = float_frame.cumsum(**cumsum_kwargs)
          tm.assert_sp_frame_equal(result, expected)
  def test_numpy_cumsum(self, float_frame):
      """``np.cumsum`` works; ``dtype`` and ``out`` raise ValueError."""
      result = np.cumsum(float_frame)
      dense_cumsum = float_frame.to_dense().cumsum()
      expected = SparseDataFrame(dense_cumsum)
      tm.assert_sp_frame_equal(result, expected)

      dtype_msg = "the 'dtype' parameter is not supported"
      with pytest.raises(ValueError, match=dtype_msg):
          np.cumsum(float_frame, dtype=np.int64)

      out_msg = "the 'out' parameter is not supported"
      with pytest.raises(ValueError, match=out_msg):
          np.cumsum(float_frame, out=result)
  def test_numpy_func_call(self, float_frame):
      """Smoke test: each listed NumPy reduction accepts the sparse frame."""
      # no exception should be raised even though
      # numpy passes in 'axis=None' or `axis=-1'
      func_names = ('sum', 'cumsum', 'var',
                    'mean', 'prod', 'cumprod',
                    'std', 'min', 'max')
      for name in func_names:
          np_func = getattr(np, name)
          np_func(float_frame)
  @pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH 17386)')
  def test_quantile(self):
      """Scalar ``quantile`` on a sparse frame vs. the dense result (xfail)."""
      # GH 17386
      data = [[1, 1], [2, 10], [3, 100], [nan, nan]]
      q = 0.1

      result = SparseDataFrame(data).quantile(q)

      dense_expected = DataFrame(data).quantile(q)
      sparse_expected = SparseSeries(dense_expected)

      tm.assert_series_equal(result, dense_expected)
      tm.assert_sp_series_equal(result, sparse_expected)
  @pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH 17386)')
  def test_quantile_multi(self):
      """List ``quantile`` on a sparse frame vs. the dense result (xfail)."""
      # GH 17386
      data = [[1, 1], [2, 10], [3, 100], [nan, nan]]
      q = [0.1, 0.5]

      result = SparseDataFrame(data).quantile(q)

      dense_expected = DataFrame(data).quantile(q)
      sparse_expected = SparseDataFrame(dense_expected)

      tm.assert_frame_equal(result, dense_expected)
      tm.assert_sp_frame_equal(result, sparse_expected)
  def test_assign_with_sparse_frame(self):
      """``assign`` on a sparse frame matches assign-then-sparsify, and
      every resulting column is exactly a SparseSeries."""
      # GH 19163
      df = pd.DataFrame({"a": [1, 2, 3]})

      res = df.to_sparse(fill_value=False).assign(newcol=False)
      exp = df.assign(newcol=False).to_sparse(fill_value=False)
      tm.assert_sp_frame_equal(res, exp)

      for label in res.columns:
          assert type(res[label]) is SparseSeries
  @pytest.mark.parametrize("inplace", [True, False])
  @pytest.mark.parametrize("how", ["all", "any"])
  def test_dropna(self, inplace, how):
      """``dropna(axis=1)`` removes the all-NaN column, in place or not."""
      # Tests regression #21172.
      expected = pd.SparseDataFrame({"F2": [0, 1]})

      input_df = pd.SparseDataFrame(
          {"F1": [float('nan'), float('nan')], "F2": [0, 1]}
      )
      returned = input_df.dropna(axis=1, inplace=inplace, how=how)

      # with inplace=True the frame to inspect is the one that was modified
      result_df = input_df if inplace else returned
      tm.assert_sp_frame_equal(expected, result_df)