test_combine_concat.py 33 KB


  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function
  3. from datetime import datetime
  4. import numpy as np
  5. import pytest
  6. from pandas.compat import lrange
  7. import pandas as pd
  8. from pandas import DataFrame, Index, Series, Timestamp, date_range
  9. from pandas.tests.frame.common import TestData
  10. import pandas.util.testing as tm
  11. from pandas.util.testing import assert_frame_equal, assert_series_equal
  12. class TestDataFrameConcatCommon(TestData):
  13. def test_concat_multiple_frames_dtypes(self):
  14. # GH 2759
  15. A = DataFrame(data=np.ones((10, 2)), columns=[
  16. 'foo', 'bar'], dtype=np.float64)
  17. B = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
  18. results = pd.concat((A, B), axis=1).get_dtype_counts()
  19. expected = Series(dict(float64=2, float32=2))
  20. assert_series_equal(results, expected)
  21. @pytest.mark.parametrize('data', [
  22. pd.date_range('2000', periods=4),
  23. pd.date_range('2000', periods=4, tz="US/Central"),
  24. pd.period_range('2000', periods=4),
  25. pd.timedelta_range(0, periods=4),
  26. ])
  27. def test_combine_datetlike_udf(self, data):
  28. # https://github.com/pandas-dev/pandas/issues/23079
  29. df = pd.DataFrame({"A": data})
  30. other = df.copy()
  31. df.iloc[1, 0] = None
  32. def combiner(a, b):
  33. return b
  34. result = df.combine(other, combiner)
  35. tm.assert_frame_equal(result, other)
  36. def test_concat_multiple_tzs(self):
  37. # GH 12467
  38. # combining datetime tz-aware and naive DataFrames
  39. ts1 = Timestamp('2015-01-01', tz=None)
  40. ts2 = Timestamp('2015-01-01', tz='UTC')
  41. ts3 = Timestamp('2015-01-01', tz='EST')
  42. df1 = DataFrame(dict(time=[ts1]))
  43. df2 = DataFrame(dict(time=[ts2]))
  44. df3 = DataFrame(dict(time=[ts3]))
  45. results = pd.concat([df1, df2]).reset_index(drop=True)
  46. expected = DataFrame(dict(time=[ts1, ts2]), dtype=object)
  47. assert_frame_equal(results, expected)
  48. results = pd.concat([df1, df3]).reset_index(drop=True)
  49. expected = DataFrame(dict(time=[ts1, ts3]), dtype=object)
  50. assert_frame_equal(results, expected)
  51. results = pd.concat([df2, df3]).reset_index(drop=True)
  52. expected = DataFrame(dict(time=[ts2, ts3]))
  53. assert_frame_equal(results, expected)
  54. @pytest.mark.parametrize(
  55. 't1',
  56. [
  57. '2015-01-01',
  58. pytest.param(pd.NaT, marks=pytest.mark.xfail(
  59. reason='GH23037 incorrect dtype when concatenating'))])
  60. def test_concat_tz_NaT(self, t1):
  61. # GH 22796
  62. # Concating tz-aware multicolumn DataFrames
  63. ts1 = Timestamp(t1, tz='UTC')
  64. ts2 = Timestamp('2015-01-01', tz='UTC')
  65. ts3 = Timestamp('2015-01-01', tz='UTC')
  66. df1 = DataFrame([[ts1, ts2]])
  67. df2 = DataFrame([[ts3]])
  68. result = pd.concat([df1, df2])
  69. expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0])
  70. assert_frame_equal(result, expected)
  71. def test_concat_tz_not_aligned(self):
  72. # GH 22796
  73. ts = pd.to_datetime([1, 2]).tz_localize("UTC")
  74. a = pd.DataFrame({"A": ts})
  75. b = pd.DataFrame({"A": ts, "B": ts})
  76. result = pd.concat([a, b], sort=True, ignore_index=True)
  77. expected = pd.DataFrame({"A": list(ts) + list(ts),
  78. "B": [pd.NaT, pd.NaT] + list(ts)})
  79. assert_frame_equal(result, expected)
  80. def test_concat_tuple_keys(self):
  81. # GH 14438
  82. df1 = pd.DataFrame(np.ones((2, 2)), columns=list('AB'))
  83. df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list('AB'))
  84. results = pd.concat((df1, df2), keys=[('bee', 'bah'), ('bee', 'boo')])
  85. expected = pd.DataFrame(
  86. {'A': {('bee', 'bah', 0): 1.0,
  87. ('bee', 'bah', 1): 1.0,
  88. ('bee', 'boo', 0): 2.0,
  89. ('bee', 'boo', 1): 2.0,
  90. ('bee', 'boo', 2): 2.0},
  91. 'B': {('bee', 'bah', 0): 1.0,
  92. ('bee', 'bah', 1): 1.0,
  93. ('bee', 'boo', 0): 2.0,
  94. ('bee', 'boo', 1): 2.0,
  95. ('bee', 'boo', 2): 2.0}})
  96. assert_frame_equal(results, expected)
  97. def test_append_series_dict(self):
  98. df = DataFrame(np.random.randn(5, 4),
  99. columns=['foo', 'bar', 'baz', 'qux'])
  100. series = df.loc[4]
  101. msg = 'Indexes have overlapping values'
  102. with pytest.raises(ValueError, match=msg):
  103. df.append(series, verify_integrity=True)
  104. series.name = None
  105. msg = 'Can only append a Series if ignore_index=True'
  106. with pytest.raises(TypeError, match=msg):
  107. df.append(series, verify_integrity=True)
  108. result = df.append(series[::-1], ignore_index=True)
  109. expected = df.append(DataFrame({0: series[::-1]}, index=df.columns).T,
  110. ignore_index=True)
  111. assert_frame_equal(result, expected)
  112. # dict
  113. result = df.append(series.to_dict(), ignore_index=True)
  114. assert_frame_equal(result, expected)
  115. result = df.append(series[::-1][:3], ignore_index=True)
  116. expected = df.append(DataFrame({0: series[::-1][:3]}).T,
  117. ignore_index=True, sort=True)
  118. assert_frame_equal(result, expected.loc[:, result.columns])
  119. # can append when name set
  120. row = df.loc[4]
  121. row.name = 5
  122. result = df.append(row)
  123. expected = df.append(df[-1:], ignore_index=True)
  124. assert_frame_equal(result, expected)
  125. def test_append_list_of_series_dicts(self):
  126. df = DataFrame(np.random.randn(5, 4),
  127. columns=['foo', 'bar', 'baz', 'qux'])
  128. dicts = [x.to_dict() for idx, x in df.iterrows()]
  129. result = df.append(dicts, ignore_index=True)
  130. expected = df.append(df, ignore_index=True)
  131. assert_frame_equal(result, expected)
  132. # different columns
  133. dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
  134. {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
  135. result = df.append(dicts, ignore_index=True, sort=True)
  136. expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
  137. assert_frame_equal(result, expected)
  138. def test_append_empty_dataframe(self):
  139. # Empty df append empty df
  140. df1 = DataFrame([])
  141. df2 = DataFrame([])
  142. result = df1.append(df2)
  143. expected = df1.copy()
  144. assert_frame_equal(result, expected)
  145. # Non-empty df append empty df
  146. df1 = DataFrame(np.random.randn(5, 2))
  147. df2 = DataFrame()
  148. result = df1.append(df2)
  149. expected = df1.copy()
  150. assert_frame_equal(result, expected)
  151. # Empty df with columns append empty df
  152. df1 = DataFrame(columns=['bar', 'foo'])
  153. df2 = DataFrame()
  154. result = df1.append(df2)
  155. expected = df1.copy()
  156. assert_frame_equal(result, expected)
  157. # Non-Empty df with columns append empty df
  158. df1 = DataFrame(np.random.randn(5, 2), columns=['bar', 'foo'])
  159. df2 = DataFrame()
  160. result = df1.append(df2)
  161. expected = df1.copy()
  162. assert_frame_equal(result, expected)
  163. def test_append_dtypes(self):
  164. # GH 5754
  165. # row appends of different dtypes (so need to do by-item)
  166. # can sometimes infer the correct type
  167. df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(5))
  168. df2 = DataFrame()
  169. result = df1.append(df2)
  170. expected = df1.copy()
  171. assert_frame_equal(result, expected)
  172. df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
  173. df2 = DataFrame({'bar': 'foo'}, index=lrange(1, 2))
  174. result = df1.append(df2)
  175. expected = DataFrame({'bar': [Timestamp('20130101'), 'foo']})
  176. assert_frame_equal(result, expected)
  177. df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
  178. df2 = DataFrame({'bar': np.nan}, index=lrange(1, 2))
  179. result = df1.append(df2)
  180. expected = DataFrame(
  181. {'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')})
  182. assert_frame_equal(result, expected)
  183. df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
  184. df2 = DataFrame({'bar': np.nan}, index=lrange(1, 2), dtype=object)
  185. result = df1.append(df2)
  186. expected = DataFrame(
  187. {'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')})
  188. assert_frame_equal(result, expected)
  189. df1 = DataFrame({'bar': np.nan}, index=lrange(1))
  190. df2 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1, 2))
  191. result = df1.append(df2)
  192. expected = DataFrame(
  193. {'bar': Series([np.nan, Timestamp('20130101')], dtype='M8[ns]')})
  194. assert_frame_equal(result, expected)
  195. df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
  196. df2 = DataFrame({'bar': 1}, index=lrange(1, 2), dtype=object)
  197. result = df1.append(df2)
  198. expected = DataFrame({'bar': Series([Timestamp('20130101'), 1])})
  199. assert_frame_equal(result, expected)
  200. def test_update(self):
  201. df = DataFrame([[1.5, np.nan, 3.],
  202. [1.5, np.nan, 3.],
  203. [1.5, np.nan, 3],
  204. [1.5, np.nan, 3]])
  205. other = DataFrame([[3.6, 2., np.nan],
  206. [np.nan, np.nan, 7]], index=[1, 3])
  207. df.update(other)
  208. expected = DataFrame([[1.5, np.nan, 3],
  209. [3.6, 2, 3],
  210. [1.5, np.nan, 3],
  211. [1.5, np.nan, 7.]])
  212. assert_frame_equal(df, expected)
  213. def test_update_dtypes(self):
  214. # gh 3016
  215. df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
  216. columns=['A', 'B', 'bool1', 'bool2'])
  217. other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
  218. df.update(other)
  219. expected = DataFrame([[45., 45., False, True], [4., 5., True, False]],
  220. columns=['A', 'B', 'bool1', 'bool2'])
  221. assert_frame_equal(df, expected)
  222. def test_update_nooverwrite(self):
  223. df = DataFrame([[1.5, np.nan, 3.],
  224. [1.5, np.nan, 3.],
  225. [1.5, np.nan, 3],
  226. [1.5, np.nan, 3]])
  227. other = DataFrame([[3.6, 2., np.nan],
  228. [np.nan, np.nan, 7]], index=[1, 3])
  229. df.update(other, overwrite=False)
  230. expected = DataFrame([[1.5, np.nan, 3],
  231. [1.5, 2, 3],
  232. [1.5, np.nan, 3],
  233. [1.5, np.nan, 3.]])
  234. assert_frame_equal(df, expected)
  235. def test_update_filtered(self):
  236. df = DataFrame([[1.5, np.nan, 3.],
  237. [1.5, np.nan, 3.],
  238. [1.5, np.nan, 3],
  239. [1.5, np.nan, 3]])
  240. other = DataFrame([[3.6, 2., np.nan],
  241. [np.nan, np.nan, 7]], index=[1, 3])
  242. df.update(other, filter_func=lambda x: x > 2)
  243. expected = DataFrame([[1.5, np.nan, 3],
  244. [1.5, np.nan, 3],
  245. [1.5, np.nan, 3],
  246. [1.5, np.nan, 7.]])
  247. assert_frame_equal(df, expected)
  248. @pytest.mark.parametrize('bad_kwarg, exception, msg', [
  249. # errors must be 'ignore' or 'raise'
  250. ({'errors': 'something'}, ValueError, 'The parameter errors must.*'),
  251. ({'join': 'inner'}, NotImplementedError, 'Only left join is supported')
  252. ])
  253. def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
  254. df = DataFrame([[1.5, 1, 3.]])
  255. with pytest.raises(exception, match=msg):
  256. df.update(df, **bad_kwarg)
  257. def test_update_raise_on_overlap(self):
  258. df = DataFrame([[1.5, 1, 3.],
  259. [1.5, np.nan, 3.],
  260. [1.5, np.nan, 3],
  261. [1.5, np.nan, 3]])
  262. other = DataFrame([[2., np.nan],
  263. [np.nan, 7]], index=[1, 3], columns=[1, 2])
  264. with pytest.raises(ValueError, match="Data overlaps"):
  265. df.update(other, errors='raise')
  266. @pytest.mark.parametrize('raise_conflict', [True, False])
  267. def test_update_deprecation(self, raise_conflict):
  268. df = DataFrame([[1.5, 1, 3.]])
  269. other = DataFrame()
  270. with tm.assert_produces_warning(FutureWarning):
  271. df.update(other, raise_conflict=raise_conflict)
  272. def test_update_from_non_df(self):
  273. d = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])}
  274. df = DataFrame(d)
  275. d['a'] = Series([5, 6, 7, 8])
  276. df.update(d)
  277. expected = DataFrame(d)
  278. assert_frame_equal(df, expected)
  279. d = {'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}
  280. df = DataFrame(d)
  281. d['a'] = [5, 6, 7, 8]
  282. df.update(d)
  283. expected = DataFrame(d)
  284. assert_frame_equal(df, expected)
  285. def test_join_str_datetime(self):
  286. str_dates = ['20120209', '20120222']
  287. dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]
  288. A = DataFrame(str_dates, index=lrange(2), columns=['aa'])
  289. C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)
  290. tst = A.join(C, on='aa')
  291. assert len(tst.columns) == 3
  292. def test_join_multiindex_leftright(self):
  293. # GH 10741
  294. df1 = (pd.DataFrame([['a', 'x', 0.471780], ['a', 'y', 0.774908],
  295. ['a', 'z', 0.563634], ['b', 'x', -0.353756],
  296. ['b', 'y', 0.368062], ['b', 'z', -1.721840],
  297. ['c', 'x', 1], ['c', 'y', 2], ['c', 'z', 3]],
  298. columns=['first', 'second', 'value1'])
  299. .set_index(['first', 'second']))
  300. df2 = (pd.DataFrame([['a', 10], ['b', 20]],
  301. columns=['first', 'value2'])
  302. .set_index(['first']))
  303. exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
  304. [-0.353756, 20], [0.368062, 20],
  305. [-1.721840, 20],
  306. [1.000000, np.nan], [2.000000, np.nan],
  307. [3.000000, np.nan]],
  308. index=df1.index, columns=['value1', 'value2'])
  309. # these must be the same results (but columns are flipped)
  310. assert_frame_equal(df1.join(df2, how='left'), exp)
  311. assert_frame_equal(df2.join(df1, how='right'),
  312. exp[['value2', 'value1']])
  313. exp_idx = pd.MultiIndex.from_product([['a', 'b'], ['x', 'y', 'z']],
  314. names=['first', 'second'])
  315. exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
  316. [-0.353756, 20], [0.368062, 20], [-1.721840, 20]],
  317. index=exp_idx, columns=['value1', 'value2'])
  318. assert_frame_equal(df1.join(df2, how='right'), exp)
  319. assert_frame_equal(df2.join(df1, how='left'),
  320. exp[['value2', 'value1']])
  321. def test_concat_named_keys(self):
  322. # GH 14252
  323. df = pd.DataFrame({'foo': [1, 2], 'bar': [0.1, 0.2]})
  324. index = Index(['a', 'b'], name='baz')
  325. concatted_named_from_keys = pd.concat([df, df], keys=index)
  326. expected_named = pd.DataFrame(
  327. {'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
  328. index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
  329. names=['baz', None]))
  330. assert_frame_equal(concatted_named_from_keys, expected_named)
  331. index_no_name = Index(['a', 'b'], name=None)
  332. concatted_named_from_names = pd.concat(
  333. [df, df], keys=index_no_name, names=['baz'])
  334. assert_frame_equal(concatted_named_from_names, expected_named)
  335. concatted_unnamed = pd.concat([df, df], keys=index_no_name)
  336. expected_unnamed = pd.DataFrame(
  337. {'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
  338. index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
  339. names=[None, None]))
  340. assert_frame_equal(concatted_unnamed, expected_unnamed)
  341. def test_concat_axis_parameter(self):
  342. # GH 14369
  343. df1 = pd.DataFrame({'A': [0.1, 0.2]}, index=range(2))
  344. df2 = pd.DataFrame({'A': [0.3, 0.4]}, index=range(2))
  345. # Index/row/0 DataFrame
  346. expected_index = pd.DataFrame(
  347. {'A': [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])
  348. concatted_index = pd.concat([df1, df2], axis='index')
  349. assert_frame_equal(concatted_index, expected_index)
  350. concatted_row = pd.concat([df1, df2], axis='rows')
  351. assert_frame_equal(concatted_row, expected_index)
  352. concatted_0 = pd.concat([df1, df2], axis=0)
  353. assert_frame_equal(concatted_0, expected_index)
  354. # Columns/1 DataFrame
  355. expected_columns = pd.DataFrame(
  356. [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=['A', 'A'])
  357. concatted_columns = pd.concat([df1, df2], axis='columns')
  358. assert_frame_equal(concatted_columns, expected_columns)
  359. concatted_1 = pd.concat([df1, df2], axis=1)
  360. assert_frame_equal(concatted_1, expected_columns)
  361. series1 = pd.Series([0.1, 0.2])
  362. series2 = pd.Series([0.3, 0.4])
  363. # Index/row/0 Series
  364. expected_index_series = pd.Series(
  365. [0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])
  366. concatted_index_series = pd.concat([series1, series2], axis='index')
  367. assert_series_equal(concatted_index_series, expected_index_series)
  368. concatted_row_series = pd.concat([series1, series2], axis='rows')
  369. assert_series_equal(concatted_row_series, expected_index_series)
  370. concatted_0_series = pd.concat([series1, series2], axis=0)
  371. assert_series_equal(concatted_0_series, expected_index_series)
  372. # Columns/1 Series
  373. expected_columns_series = pd.DataFrame(
  374. [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1])
  375. concatted_columns_series = pd.concat(
  376. [series1, series2], axis='columns')
  377. assert_frame_equal(concatted_columns_series, expected_columns_series)
  378. concatted_1_series = pd.concat([series1, series2], axis=1)
  379. assert_frame_equal(concatted_1_series, expected_columns_series)
  380. # Testing ValueError
  381. with pytest.raises(ValueError, match='No axis named'):
  382. pd.concat([series1, series2], axis='something')
  383. def test_concat_numerical_names(self):
  384. # #15262 # #12223
  385. df = pd.DataFrame({'col': range(9)},
  386. dtype='int32',
  387. index=(pd.MultiIndex
  388. .from_product([['A0', 'A1', 'A2'],
  389. ['B0', 'B1', 'B2']],
  390. names=[1, 2])))
  391. result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :]))
  392. expected = pd.DataFrame({'col': [0, 1, 7, 8]},
  393. dtype='int32',
  394. index=pd.MultiIndex.from_tuples([('A0', 'B0'),
  395. ('A0', 'B1'),
  396. ('A2', 'B1'),
  397. ('A2', 'B2')],
  398. names=[1, 2]))
  399. tm.assert_frame_equal(result, expected)
  400. class TestDataFrameCombineFirst(TestData):
  401. def test_combine_first_mixed(self):
  402. a = Series(['a', 'b'], index=lrange(2))
  403. b = Series(lrange(2), index=lrange(2))
  404. f = DataFrame({'A': a, 'B': b})
  405. a = Series(['a', 'b'], index=lrange(5, 7))
  406. b = Series(lrange(2), index=lrange(5, 7))
  407. g = DataFrame({'A': a, 'B': b})
  408. exp = pd.DataFrame({'A': list('abab'), 'B': [0., 1., 0., 1.]},
  409. index=[0, 1, 5, 6])
  410. combined = f.combine_first(g)
  411. tm.assert_frame_equal(combined, exp)
  def test_combine_first(self):
      """``combine_first`` on disjoint, identical and overlapping frames.

      Uses the ``frame`` and ``empty`` fixtures of the enclosing test class.
      """
      # disjoint
      top, bottom = self.frame[:5], self.frame[5:]

      merged = top.combine_first(bottom)
      realigned = self.frame.reindex(merged.index)
      assert_frame_equal(merged, realigned)
      assert tm.equalContents(merged.columns, self.frame.columns)
      assert_series_equal(merged['A'], realigned['A'])

      # same index
      without_c = self.frame.copy()
      without_c['A'] = 1
      del without_c['C']

      without_d = self.frame.copy()
      without_d['B'] = 0
      del without_d['D']

      merged = without_c.combine_first(without_d)

      assert (merged['A'] == 1).all()
      assert_series_equal(merged['B'], without_c['B'])
      assert_series_equal(merged['C'], without_d['C'])
      assert_series_equal(merged['D'], without_c['D'])

      # overlap
      first_ten = realigned[:10].copy()
      first_ten['A'] = 1
      merged = first_ten.combine_first(realigned)
      assert (merged['A'][:10] == 1).all()

      # reverse overlap
      realigned['A'][:10] = 0
      merged = realigned.combine_first(first_ten)
      assert (merged['A'][:10] == 0).all()

      # no overlap
      leading = self.frame[:10]
      trailing = self.frame[10:]
      merged = leading.combine_first(trailing)
      assert_series_equal(merged['A'].reindex(leading.index), leading['A'])
      assert_series_equal(merged['A'].reindex(trailing.index), trailing['A'])

      # corner cases
      comb = self.frame.combine_first(self.empty)
      assert_frame_equal(comb, self.frame)

      comb = self.empty.combine_first(self.frame)
      assert_frame_equal(comb, self.frame)

      comb = self.frame.combine_first(DataFrame(index=["faz", "boo"]))
      assert "faz" in comb.index

      # #2525
      dated = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)])
      only_column = DataFrame({}, columns=['b'])
      result = dated.combine_first(only_column)
      assert 'b' in result
  459. def test_combine_first_mixed_bug(self):
  460. idx = Index(['a', 'b', 'c', 'e'])
  461. ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
  462. ser2 = Series(['a', 'b', 'c', 'e'], index=idx)
  463. ser3 = Series([12, 4, 5, 97], index=idx)
  464. frame1 = DataFrame({"col0": ser1,
  465. "col2": ser2,
  466. "col3": ser3})
  467. idx = Index(['a', 'b', 'c', 'f'])
  468. ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
  469. ser2 = Series(['a', 'b', 'c', 'f'], index=idx)
  470. ser3 = Series([12, 4, 5, 97], index=idx)
  471. frame2 = DataFrame({"col1": ser1,
  472. "col2": ser2,
  473. "col5": ser3})
  474. combined = frame1.combine_first(frame2)
  475. assert len(combined.columns) == 5
  476. # gh 3016 (same as in update)
  477. df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
  478. columns=['A', 'B', 'bool1', 'bool2'])
  479. other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
  480. result = df.combine_first(other)
  481. assert_frame_equal(result, df)
  482. df.loc[0, 'A'] = np.nan
  483. result = df.combine_first(other)
  484. df.loc[0, 'A'] = 45
  485. assert_frame_equal(result, df)
  486. # doc example
  487. df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan],
  488. 'B': [np.nan, 2., 3., np.nan, 6.]})
  489. df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
  490. 'B': [np.nan, np.nan, 3., 4., 6., 8.]})
  491. result = df1.combine_first(df2)
  492. expected = DataFrame(
  493. {'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]})
  494. assert_frame_equal(result, expected)
  495. # GH3552, return object dtype with bools
  496. df1 = DataFrame(
  497. [[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]])
  498. df2 = DataFrame(
  499. [[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2])
  500. result = df1.combine_first(df2)[2]
  501. expected = Series([True, True, False], name=2)
  502. assert_series_equal(result, expected)
  503. # GH 3593, converting datetime64[ns] incorrecly
  504. df0 = DataFrame({"a": [datetime(2000, 1, 1),
  505. datetime(2000, 1, 2),
  506. datetime(2000, 1, 3)]})
  507. df1 = DataFrame({"a": [None, None, None]})
  508. df2 = df1.combine_first(df0)
  509. assert_frame_equal(df2, df0)
  510. df2 = df0.combine_first(df1)
  511. assert_frame_equal(df2, df0)
  512. df0 = DataFrame({"a": [datetime(2000, 1, 1),
  513. datetime(2000, 1, 2),
  514. datetime(2000, 1, 3)]})
  515. df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
  516. df2 = df1.combine_first(df0)
  517. result = df0.copy()
  518. result.iloc[0, :] = df1.iloc[0, :]
  519. assert_frame_equal(df2, result)
  520. df2 = df0.combine_first(df1)
  521. assert_frame_equal(df2, df0)
  522. def test_combine_first_align_nan(self):
  523. # GH 7509 (not fixed)
  524. dfa = pd.DataFrame([[pd.Timestamp('2011-01-01'), 2]],
  525. columns=['a', 'b'])
  526. dfb = pd.DataFrame([[4], [5]], columns=['b'])
  527. assert dfa['a'].dtype == 'datetime64[ns]'
  528. assert dfa['b'].dtype == 'int64'
  529. res = dfa.combine_first(dfb)
  530. exp = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), pd.NaT],
  531. 'b': [2., 5.]}, columns=['a', 'b'])
  532. tm.assert_frame_equal(res, exp)
  533. assert res['a'].dtype == 'datetime64[ns]'
  534. # ToDo: this must be int64
  535. assert res['b'].dtype == 'float64'
  536. res = dfa.iloc[:0].combine_first(dfb)
  537. exp = pd.DataFrame({'a': [np.nan, np.nan],
  538. 'b': [4, 5]}, columns=['a', 'b'])
  539. tm.assert_frame_equal(res, exp)
  540. # ToDo: this must be datetime64
  541. assert res['a'].dtype == 'float64'
  542. # ToDo: this must be int64
  543. assert res['b'].dtype == 'int64'
  544. def test_combine_first_timezone(self):
  545. # see gh-7630
  546. data1 = pd.to_datetime('20100101 01:01').tz_localize('UTC')
  547. df1 = pd.DataFrame(columns=['UTCdatetime', 'abc'],
  548. data=data1,
  549. index=pd.date_range('20140627', periods=1))
  550. data2 = pd.to_datetime('20121212 12:12').tz_localize('UTC')
  551. df2 = pd.DataFrame(columns=['UTCdatetime', 'xyz'],
  552. data=data2,
  553. index=pd.date_range('20140628', periods=1))
  554. res = df2[['UTCdatetime']].combine_first(df1)
  555. exp = pd.DataFrame({'UTCdatetime': [pd.Timestamp('2010-01-01 01:01',
  556. tz='UTC'),
  557. pd.Timestamp('2012-12-12 12:12',
  558. tz='UTC')],
  559. 'abc': [pd.Timestamp('2010-01-01 01:01:00',
  560. tz='UTC'), pd.NaT]},
  561. columns=['UTCdatetime', 'abc'],
  562. index=pd.date_range('20140627', periods=2,
  563. freq='D'))
  564. tm.assert_frame_equal(res, exp)
  565. assert res['UTCdatetime'].dtype == 'datetime64[ns, UTC]'
  566. assert res['abc'].dtype == 'datetime64[ns, UTC]'
  567. # see gh-10567
  568. dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='UTC')
  569. df1 = pd.DataFrame({'DATE': dts1})
  570. dts2 = pd.date_range('2015-01-03', '2015-01-05', tz='UTC')
  571. df2 = pd.DataFrame({'DATE': dts2})
  572. res = df1.combine_first(df2)
  573. tm.assert_frame_equal(res, df1)
  574. assert res['DATE'].dtype == 'datetime64[ns, UTC]'
  575. dts1 = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03',
  576. '2011-01-04'], tz='US/Eastern')
  577. df1 = pd.DataFrame({'DATE': dts1}, index=[1, 3, 5, 7])
  578. dts2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02',
  579. '2012-01-03'], tz='US/Eastern')
  580. df2 = pd.DataFrame({'DATE': dts2}, index=[2, 4, 5])
  581. res = df1.combine_first(df2)
  582. exp_dts = pd.DatetimeIndex(['2011-01-01', '2012-01-01', 'NaT',
  583. '2012-01-02', '2011-01-03', '2011-01-04'],
  584. tz='US/Eastern')
  585. exp = pd.DataFrame({'DATE': exp_dts}, index=[1, 2, 3, 4, 5, 7])
  586. tm.assert_frame_equal(res, exp)
  587. # different tz
  588. dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='US/Eastern')
  589. df1 = pd.DataFrame({'DATE': dts1})
  590. dts2 = pd.date_range('2015-01-03', '2015-01-05')
  591. df2 = pd.DataFrame({'DATE': dts2})
  592. # if df1 doesn't have NaN, keep its dtype
  593. res = df1.combine_first(df2)
  594. tm.assert_frame_equal(res, df1)
  595. assert res['DATE'].dtype == 'datetime64[ns, US/Eastern]'
  596. dts1 = pd.date_range('2015-01-01', '2015-01-02', tz='US/Eastern')
  597. df1 = pd.DataFrame({'DATE': dts1})
  598. dts2 = pd.date_range('2015-01-01', '2015-01-03')
  599. df2 = pd.DataFrame({'DATE': dts2})
  600. res = df1.combine_first(df2)
  601. exp_dts = [pd.Timestamp('2015-01-01', tz='US/Eastern'),
  602. pd.Timestamp('2015-01-02', tz='US/Eastern'),
  603. pd.Timestamp('2015-01-03')]
  604. exp = pd.DataFrame({'DATE': exp_dts})
  605. tm.assert_frame_equal(res, exp)
  606. assert res['DATE'].dtype == 'object'
  607. def test_combine_first_timedelta(self):
  608. data1 = pd.TimedeltaIndex(['1 day', 'NaT', '3 day', '4day'])
  609. df1 = pd.DataFrame({'TD': data1}, index=[1, 3, 5, 7])
  610. data2 = pd.TimedeltaIndex(['10 day', '11 day', '12 day'])
  611. df2 = pd.DataFrame({'TD': data2}, index=[2, 4, 5])
  612. res = df1.combine_first(df2)
  613. exp_dts = pd.TimedeltaIndex(['1 day', '10 day', 'NaT',
  614. '11 day', '3 day', '4 day'])
  615. exp = pd.DataFrame({'TD': exp_dts}, index=[1, 2, 3, 4, 5, 7])
  616. tm.assert_frame_equal(res, exp)
  617. assert res['TD'].dtype == 'timedelta64[ns]'
  618. def test_combine_first_period(self):
  619. data1 = pd.PeriodIndex(['2011-01', 'NaT', '2011-03',
  620. '2011-04'], freq='M')
  621. df1 = pd.DataFrame({'P': data1}, index=[1, 3, 5, 7])
  622. data2 = pd.PeriodIndex(['2012-01-01', '2012-02',
  623. '2012-03'], freq='M')
  624. df2 = pd.DataFrame({'P': data2}, index=[2, 4, 5])
  625. res = df1.combine_first(df2)
  626. exp_dts = pd.PeriodIndex(['2011-01', '2012-01', 'NaT',
  627. '2012-02', '2011-03', '2011-04'],
  628. freq='M')
  629. exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
  630. tm.assert_frame_equal(res, exp)
  631. assert res['P'].dtype == data1.dtype
  632. # different freq
  633. dts2 = pd.PeriodIndex(['2012-01-01', '2012-01-02',
  634. '2012-01-03'], freq='D')
  635. df2 = pd.DataFrame({'P': dts2}, index=[2, 4, 5])
  636. res = df1.combine_first(df2)
  637. exp_dts = [pd.Period('2011-01', freq='M'),
  638. pd.Period('2012-01-01', freq='D'),
  639. pd.NaT,
  640. pd.Period('2012-01-02', freq='D'),
  641. pd.Period('2011-03', freq='M'),
  642. pd.Period('2011-04', freq='M')]
  643. exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
  644. tm.assert_frame_equal(res, exp)
  645. assert res['P'].dtype == 'object'
  646. def test_combine_first_int(self):
  647. # GH14687 - integer series that do no align exactly
  648. df1 = pd.DataFrame({'a': [0, 1, 3, 5]}, dtype='int64')
  649. df2 = pd.DataFrame({'a': [1, 4]}, dtype='int64')
  650. res = df1.combine_first(df2)
  651. tm.assert_frame_equal(res, df1)
  652. assert res['a'].dtype == 'int64'
  653. @pytest.mark.parametrize("val", [1, 1.0])
  654. def test_combine_first_with_asymmetric_other(self, val):
  655. # see gh-20699
  656. df1 = pd.DataFrame({'isNum': [val]})
  657. df2 = pd.DataFrame({'isBool': [True]})
  658. res = df1.combine_first(df2)
  659. exp = pd.DataFrame({'isBool': [True], 'isNum': [val]})
  660. tm.assert_frame_equal(res, exp)
  661. def test_concat_datetime_datetime64_frame(self):
  662. # #2624
  663. rows = []
  664. rows.append([datetime(2010, 1, 1), 1])
  665. rows.append([datetime(2010, 1, 2), 'hi'])
  666. df2_obj = DataFrame.from_records(rows, columns=['date', 'test'])
  667. ind = date_range(start="2000/1/1", freq="D", periods=10)
  668. df1 = DataFrame({'date': ind, 'test': lrange(10)})
  669. # it works!
  670. pd.concat([df1, df2_obj])
  671. class TestDataFrameUpdate(TestData):
  672. def test_update_nan(self):
  673. # #15593 #15617
  674. # test 1
  675. df1 = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)})
  676. df2 = DataFrame({'A': [None, 2, 3]})
  677. expected = df1.copy()
  678. df1.update(df2, overwrite=False)
  679. tm.assert_frame_equal(df1, expected)
  680. # test 2
  681. df1 = DataFrame({'A': [1.0, None, 3],
  682. 'B': date_range('2000', periods=3)})
  683. df2 = DataFrame({'A': [None, 2, 3]})
  684. expected = DataFrame({'A': [1.0, 2, 3],
  685. 'B': date_range('2000', periods=3)})
  686. df1.update(df2, overwrite=False)
  687. tm.assert_frame_equal(df1, expected)