test_combine_concat.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462
# pylint: disable=E1101,W0612
  2. import itertools
  3. import numpy as np
  4. import pytest
  5. from pandas.errors import PerformanceWarning
  6. import pandas as pd
  7. import pandas.util.testing as tm
  8. class TestSparseArrayConcat(object):
  9. @pytest.mark.parametrize('kind', ['integer', 'block'])
  10. def test_basic(self, kind):
  11. a = pd.SparseArray([1, 0, 0, 2], kind=kind)
  12. b = pd.SparseArray([1, 0, 2, 2], kind=kind)
  13. result = pd.SparseArray._concat_same_type([a, b])
  14. # Can't make any assertions about the sparse index itself
  15. # since we aren't don't merge sparse blocs across arrays
  16. # in to_concat
  17. expected = np.array([1, 2, 1, 2, 2], dtype='int64')
  18. tm.assert_numpy_array_equal(result.sp_values, expected)
  19. assert result.kind == kind
  20. @pytest.mark.parametrize('kind', ['integer', 'block'])
  21. def test_uses_first_kind(self, kind):
  22. other = 'integer' if kind == 'block' else 'block'
  23. a = pd.SparseArray([1, 0, 0, 2], kind=kind)
  24. b = pd.SparseArray([1, 0, 2, 2], kind=other)
  25. result = pd.SparseArray._concat_same_type([a, b])
  26. expected = np.array([1, 2, 1, 2, 2], dtype='int64')
  27. tm.assert_numpy_array_equal(result.sp_values, expected)
  28. assert result.kind == kind
  29. class TestSparseSeriesConcat(object):
  30. @pytest.mark.parametrize('kind', [
  31. 'integer',
  32. 'block',
  33. ])
  34. def test_concat(self, kind):
  35. val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
  36. val2 = np.array([3, np.nan, 4, 0, 0])
  37. sparse1 = pd.SparseSeries(val1, name='x', kind=kind)
  38. sparse2 = pd.SparseSeries(val2, name='y', kind=kind)
  39. res = pd.concat([sparse1, sparse2])
  40. exp = pd.concat([pd.Series(val1), pd.Series(val2)])
  41. exp = pd.SparseSeries(exp, kind=kind)
  42. tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True)
  43. sparse1 = pd.SparseSeries(val1, fill_value=0, name='x', kind=kind)
  44. sparse2 = pd.SparseSeries(val2, fill_value=0, name='y', kind=kind)
  45. res = pd.concat([sparse1, sparse2])
  46. exp = pd.concat([pd.Series(val1), pd.Series(val2)])
  47. exp = pd.SparseSeries(exp, fill_value=0, kind=kind)
  48. tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True)
  49. def test_concat_axis1(self):
  50. val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
  51. val2 = np.array([3, np.nan, 4, 0, 0])
  52. sparse1 = pd.SparseSeries(val1, name='x')
  53. sparse2 = pd.SparseSeries(val2, name='y')
  54. res = pd.concat([sparse1, sparse2], axis=1)
  55. exp = pd.concat([pd.Series(val1, name='x'),
  56. pd.Series(val2, name='y')], axis=1)
  57. exp = pd.SparseDataFrame(exp)
  58. tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
  59. def test_concat_different_fill(self):
  60. val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
  61. val2 = np.array([3, np.nan, 4, 0, 0])
  62. for kind in ['integer', 'block']:
  63. sparse1 = pd.SparseSeries(val1, name='x', kind=kind)
  64. sparse2 = pd.SparseSeries(val2, name='y', kind=kind, fill_value=0)
  65. with tm.assert_produces_warning(PerformanceWarning):
  66. res = pd.concat([sparse1, sparse2])
  67. exp = pd.concat([pd.Series(val1), pd.Series(val2)])
  68. exp = pd.SparseSeries(exp, kind=kind)
  69. tm.assert_sp_series_equal(res, exp)
  70. with tm.assert_produces_warning(PerformanceWarning):
  71. res = pd.concat([sparse2, sparse1])
  72. exp = pd.concat([pd.Series(val2), pd.Series(val1)])
  73. exp = pd.SparseSeries(exp, kind=kind, fill_value=0)
  74. tm.assert_sp_series_equal(res, exp)
  75. def test_concat_axis1_different_fill(self):
  76. val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
  77. val2 = np.array([3, np.nan, 4, 0, 0])
  78. sparse1 = pd.SparseSeries(val1, name='x')
  79. sparse2 = pd.SparseSeries(val2, name='y', fill_value=0)
  80. res = pd.concat([sparse1, sparse2], axis=1)
  81. exp = pd.concat([pd.Series(val1, name='x'),
  82. pd.Series(val2, name='y')], axis=1)
  83. assert isinstance(res, pd.SparseDataFrame)
  84. tm.assert_frame_equal(res.to_dense(), exp)
  85. def test_concat_different_kind(self):
  86. val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
  87. val2 = np.array([3, np.nan, 4, 0, 0])
  88. sparse1 = pd.SparseSeries(val1, name='x', kind='integer')
  89. sparse2 = pd.SparseSeries(val2, name='y', kind='block')
  90. res = pd.concat([sparse1, sparse2])
  91. exp = pd.concat([pd.Series(val1), pd.Series(val2)])
  92. exp = pd.SparseSeries(exp, kind=sparse1.kind)
  93. tm.assert_sp_series_equal(res, exp)
  94. res = pd.concat([sparse2, sparse1])
  95. exp = pd.concat([pd.Series(val2), pd.Series(val1)])
  96. exp = pd.SparseSeries(exp, kind=sparse2.kind)
  97. tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True)
  98. @pytest.mark.parametrize('kind', [
  99. 'integer',
  100. 'block',
  101. ])
  102. def test_concat_sparse_dense(self, kind):
  103. # use first input's fill_value
  104. val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
  105. val2 = np.array([3, np.nan, 4, 0, 0])
  106. sparse = pd.SparseSeries(val1, name='x', kind=kind)
  107. dense = pd.Series(val2, name='y')
  108. res = pd.concat([sparse, dense])
  109. exp = pd.SparseSeries(pd.concat([pd.Series(val1), dense]), kind=kind)
  110. tm.assert_sp_series_equal(res, exp)
  111. res = pd.concat([dense, sparse, dense])
  112. exp = pd.concat([dense, pd.Series(val1), dense])
  113. # XXX: changed from SparseSeries to Series[sparse]
  114. exp = pd.Series(
  115. pd.SparseArray(exp, kind=kind),
  116. index=exp.index,
  117. name=exp.name,
  118. )
  119. tm.assert_series_equal(res, exp)
  120. sparse = pd.SparseSeries(val1, name='x', kind=kind, fill_value=0)
  121. dense = pd.Series(val2, name='y')
  122. res = pd.concat([sparse, dense])
  123. # XXX: changed from SparseSeries to Series[sparse]
  124. exp = pd.concat([pd.Series(val1), dense])
  125. exp = pd.Series(
  126. pd.SparseArray(exp, kind=kind, fill_value=0),
  127. index=exp.index,
  128. name=exp.name,
  129. )
  130. tm.assert_series_equal(res, exp)
  131. res = pd.concat([dense, sparse, dense])
  132. exp = pd.concat([dense, pd.Series(val1), dense])
  133. # XXX: changed from SparseSeries to Series[sparse]
  134. exp = pd.Series(
  135. pd.SparseArray(exp, kind=kind, fill_value=0),
  136. index=exp.index,
  137. name=exp.name,
  138. )
  139. tm.assert_series_equal(res, exp)
  140. class TestSparseDataFrameConcat(object):
  141. def setup_method(self, method):
  142. self.dense1 = pd.DataFrame({'A': [0., 1., 2., np.nan],
  143. 'B': [0., 0., 0., 0.],
  144. 'C': [np.nan, np.nan, np.nan, np.nan],
  145. 'D': [1., 2., 3., 4.]})
  146. self.dense2 = pd.DataFrame({'A': [5., 6., 7., 8.],
  147. 'B': [np.nan, 0., 7., 8.],
  148. 'C': [5., 6., np.nan, np.nan],
  149. 'D': [np.nan, np.nan, np.nan, np.nan]})
  150. self.dense3 = pd.DataFrame({'E': [5., 6., 7., 8.],
  151. 'F': [np.nan, 0., 7., 8.],
  152. 'G': [5., 6., np.nan, np.nan],
  153. 'H': [np.nan, np.nan, np.nan, np.nan]})
  154. def test_concat(self):
  155. # fill_value = np.nan
  156. sparse = self.dense1.to_sparse()
  157. sparse2 = self.dense2.to_sparse()
  158. res = pd.concat([sparse, sparse])
  159. exp = pd.concat([self.dense1, self.dense1]).to_sparse()
  160. tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
  161. res = pd.concat([sparse2, sparse2])
  162. exp = pd.concat([self.dense2, self.dense2]).to_sparse()
  163. tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
  164. res = pd.concat([sparse, sparse2])
  165. exp = pd.concat([self.dense1, self.dense2]).to_sparse()
  166. tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
  167. res = pd.concat([sparse2, sparse])
  168. exp = pd.concat([self.dense2, self.dense1]).to_sparse()
  169. tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
  170. # fill_value = 0
  171. sparse = self.dense1.to_sparse(fill_value=0)
  172. sparse2 = self.dense2.to_sparse(fill_value=0)
  173. res = pd.concat([sparse, sparse])
  174. exp = pd.concat([self.dense1, self.dense1]).to_sparse(fill_value=0)
  175. exp._default_fill_value = np.nan
  176. tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
  177. res = pd.concat([sparse2, sparse2])
  178. exp = pd.concat([self.dense2, self.dense2]).to_sparse(fill_value=0)
  179. exp._default_fill_value = np.nan
  180. tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
  181. res = pd.concat([sparse, sparse2])
  182. exp = pd.concat([self.dense1, self.dense2]).to_sparse(fill_value=0)
  183. exp._default_fill_value = np.nan
  184. tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
  185. res = pd.concat([sparse2, sparse])
  186. exp = pd.concat([self.dense2, self.dense1]).to_sparse(fill_value=0)
  187. exp._default_fill_value = np.nan
  188. tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
  189. def test_concat_different_fill_value(self):
  190. # 1st fill_value will be used
  191. sparse = self.dense1.to_sparse()
  192. sparse2 = self.dense2.to_sparse(fill_value=0)
  193. with tm.assert_produces_warning(PerformanceWarning):
  194. res = pd.concat([sparse, sparse2])
  195. exp = pd.concat([self.dense1, self.dense2]).to_sparse()
  196. tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
  197. with tm.assert_produces_warning(PerformanceWarning):
  198. res = pd.concat([sparse2, sparse])
  199. exp = pd.concat([self.dense2, self.dense1]).to_sparse(fill_value=0)
  200. exp._default_fill_value = np.nan
  201. tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
  202. def test_concat_different_columns_sort_warns(self):
  203. sparse = self.dense1.to_sparse()
  204. sparse3 = self.dense3.to_sparse()
  205. with tm.assert_produces_warning(FutureWarning):
  206. res = pd.concat([sparse, sparse3])
  207. with tm.assert_produces_warning(FutureWarning):
  208. exp = pd.concat([self.dense1, self.dense3])
  209. exp = exp.to_sparse()
  210. tm.assert_sp_frame_equal(res, exp, check_kind=False)
  211. def test_concat_different_columns(self):
  212. # fill_value = np.nan
  213. sparse = self.dense1.to_sparse()
  214. sparse3 = self.dense3.to_sparse()
  215. res = pd.concat([sparse, sparse3], sort=True)
  216. exp = pd.concat([self.dense1, self.dense3], sort=True).to_sparse()
  217. tm.assert_sp_frame_equal(res, exp, check_kind=False)
  218. res = pd.concat([sparse3, sparse], sort=True)
  219. exp = pd.concat([self.dense3, self.dense1], sort=True).to_sparse()
  220. exp._default_fill_value = np.nan
  221. tm.assert_sp_frame_equal(res, exp, check_kind=False)
  222. def test_concat_bug(self):
  223. from pandas.core.sparse.api import SparseDtype
  224. x = pd.SparseDataFrame({"A": pd.SparseArray([np.nan, np.nan],
  225. fill_value=0)})
  226. y = pd.SparseDataFrame({"B": []})
  227. res = pd.concat([x, y], sort=False)[['A']]
  228. exp = pd.DataFrame({"A": pd.SparseArray([np.nan, np.nan],
  229. dtype=SparseDtype(float, 0))})
  230. tm.assert_frame_equal(res, exp)
  231. def test_concat_different_columns_buggy(self):
  232. sparse = self.dense1.to_sparse(fill_value=0)
  233. sparse3 = self.dense3.to_sparse(fill_value=0)
  234. res = pd.concat([sparse, sparse3], sort=True)
  235. exp = (pd.concat([self.dense1, self.dense3], sort=True)
  236. .to_sparse(fill_value=0))
  237. exp._default_fill_value = np.nan
  238. tm.assert_sp_frame_equal(res, exp, check_kind=False,
  239. consolidate_block_indices=True)
  240. res = pd.concat([sparse3, sparse], sort=True)
  241. exp = (pd.concat([self.dense3, self.dense1], sort=True)
  242. .to_sparse(fill_value=0))
  243. exp._default_fill_value = np.nan
  244. tm.assert_sp_frame_equal(res, exp, check_kind=False,
  245. consolidate_block_indices=True)
  246. # different fill values
  247. sparse = self.dense1.to_sparse()
  248. sparse3 = self.dense3.to_sparse(fill_value=0)
  249. # each columns keeps its fill_value, thus compare in dense
  250. res = pd.concat([sparse, sparse3], sort=True)
  251. exp = pd.concat([self.dense1, self.dense3], sort=True)
  252. assert isinstance(res, pd.SparseDataFrame)
  253. tm.assert_frame_equal(res.to_dense(), exp)
  254. res = pd.concat([sparse3, sparse], sort=True)
  255. exp = pd.concat([self.dense3, self.dense1], sort=True)
  256. assert isinstance(res, pd.SparseDataFrame)
  257. tm.assert_frame_equal(res.to_dense(), exp)
  258. def test_concat_series(self):
  259. # fill_value = np.nan
  260. sparse = self.dense1.to_sparse()
  261. sparse2 = self.dense2.to_sparse()
  262. for col in ['A', 'D']:
  263. res = pd.concat([sparse, sparse2[col]])
  264. exp = pd.concat([self.dense1, self.dense2[col]]).to_sparse()
  265. tm.assert_sp_frame_equal(res, exp, check_kind=False)
  266. res = pd.concat([sparse2[col], sparse])
  267. exp = pd.concat([self.dense2[col], self.dense1]).to_sparse()
  268. tm.assert_sp_frame_equal(res, exp, check_kind=False)
  269. # fill_value = 0
  270. sparse = self.dense1.to_sparse(fill_value=0)
  271. sparse2 = self.dense2.to_sparse(fill_value=0)
  272. for col in ['C', 'D']:
  273. res = pd.concat([sparse, sparse2[col]])
  274. exp = pd.concat([self.dense1,
  275. self.dense2[col]]).to_sparse(fill_value=0)
  276. exp._default_fill_value = np.nan
  277. tm.assert_sp_frame_equal(res, exp, check_kind=False,
  278. consolidate_block_indices=True)
  279. res = pd.concat([sparse2[col], sparse])
  280. exp = pd.concat([self.dense2[col],
  281. self.dense1]).to_sparse(fill_value=0)
  282. exp['C'] = res['C']
  283. exp._default_fill_value = np.nan
  284. tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True,
  285. check_kind=False)
  286. def test_concat_axis1(self):
  287. # fill_value = np.nan
  288. sparse = self.dense1.to_sparse()
  289. sparse3 = self.dense3.to_sparse()
  290. res = pd.concat([sparse, sparse3], axis=1)
  291. exp = pd.concat([self.dense1, self.dense3], axis=1).to_sparse()
  292. tm.assert_sp_frame_equal(res, exp)
  293. res = pd.concat([sparse3, sparse], axis=1)
  294. exp = pd.concat([self.dense3, self.dense1], axis=1).to_sparse()
  295. exp._default_fill_value = np.nan
  296. tm.assert_sp_frame_equal(res, exp)
  297. # fill_value = 0
  298. sparse = self.dense1.to_sparse(fill_value=0)
  299. sparse3 = self.dense3.to_sparse(fill_value=0)
  300. res = pd.concat([sparse, sparse3], axis=1)
  301. exp = pd.concat([self.dense1, self.dense3],
  302. axis=1).to_sparse(fill_value=0)
  303. exp._default_fill_value = np.nan
  304. tm.assert_sp_frame_equal(res, exp)
  305. res = pd.concat([sparse3, sparse], axis=1)
  306. exp = pd.concat([self.dense3, self.dense1],
  307. axis=1).to_sparse(fill_value=0)
  308. exp._default_fill_value = np.nan
  309. tm.assert_sp_frame_equal(res, exp)
  310. # different fill values
  311. sparse = self.dense1.to_sparse()
  312. sparse3 = self.dense3.to_sparse(fill_value=0)
  313. # each columns keeps its fill_value, thus compare in dense
  314. res = pd.concat([sparse, sparse3], axis=1)
  315. exp = pd.concat([self.dense1, self.dense3], axis=1)
  316. assert isinstance(res, pd.SparseDataFrame)
  317. tm.assert_frame_equal(res.to_dense(), exp)
  318. res = pd.concat([sparse3, sparse], axis=1)
  319. exp = pd.concat([self.dense3, self.dense1], axis=1)
  320. assert isinstance(res, pd.SparseDataFrame)
  321. tm.assert_frame_equal(res.to_dense(), exp)
  322. @pytest.mark.parametrize('fill_value,sparse_idx,dense_idx',
  323. itertools.product([None, 0, 1, np.nan],
  324. [0, 1],
  325. [1, 0]))
  326. def test_concat_sparse_dense_rows(self, fill_value, sparse_idx, dense_idx):
  327. frames = [self.dense1, self.dense2]
  328. sparse_frame = [frames[dense_idx],
  329. frames[sparse_idx].to_sparse(fill_value=fill_value)]
  330. dense_frame = [frames[dense_idx], frames[sparse_idx]]
  331. # This will try both directions sparse + dense and dense + sparse
  332. for _ in range(2):
  333. res = pd.concat(sparse_frame)
  334. exp = pd.concat(dense_frame)
  335. assert isinstance(res, pd.SparseDataFrame)
  336. tm.assert_frame_equal(res.to_dense(), exp)
  337. sparse_frame = sparse_frame[::-1]
  338. dense_frame = dense_frame[::-1]
  339. @pytest.mark.parametrize('fill_value,sparse_idx,dense_idx',
  340. itertools.product([None, 0, 1, np.nan],
  341. [0, 1],
  342. [1, 0]))
  343. @pytest.mark.xfail(reason="The iloc fails and I can't make expected",
  344. strict=False)
  345. def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx):
  346. # See GH16874, GH18914 and #18686 for why this should be a DataFrame
  347. from pandas.core.dtypes.common import is_sparse
  348. frames = [self.dense1, self.dense3]
  349. sparse_frame = [frames[dense_idx],
  350. frames[sparse_idx].to_sparse(fill_value=fill_value)]
  351. dense_frame = [frames[dense_idx], frames[sparse_idx]]
  352. # This will try both directions sparse + dense and dense + sparse
  353. for _ in range(2):
  354. res = pd.concat(sparse_frame, axis=1)
  355. exp = pd.concat(dense_frame, axis=1)
  356. cols = [i for (i, x) in enumerate(res.dtypes) if is_sparse(x)]
  357. for col in cols:
  358. exp.iloc[:, col] = exp.iloc[:, col].astype("Sparse")
  359. for column in frames[dense_idx].columns:
  360. if dense_idx == sparse_idx:
  361. tm.assert_frame_equal(res[column], exp[column])
  362. else:
  363. tm.assert_series_equal(res[column], exp[column])
  364. tm.assert_frame_equal(res, exp)
  365. sparse_frame = sparse_frame[::-1]
  366. dense_frame = dense_frame[::-1]