# test_combine_concat.py (33 KB)
  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function
  3. from datetime import datetime
  4. import numpy as np
  5. import pytest
  6. from pandas.compat import lrange
  7. import pandas as pd
  8. from pandas import DataFrame, Index, Series, Timestamp, date_range
  9. from pandas.tests.frame.common import TestData
  10. import pandas.util.testing as tm
  11. from pandas.util.testing import assert_frame_equal, assert_series_equal
  12. class TestDataFrameConcatCommon(TestData):
  13. def test_concat_multiple_frames_dtypes(self):
  14. # GH 2759
  15. A = DataFrame(data=np.ones((10, 2)), columns=[
  16. 'foo', 'bar'], dtype=np.float64)
  17. B = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
  18. results = pd.concat((A, B), axis=1).get_dtype_counts()
  19. expected = Series(dict(float64=2, float32=2))
  20. assert_series_equal(results, expected)
  21. @pytest.mark.parametrize('data', [
  22. pd.date_range('2000', periods=4),
  23. pd.date_range('2000', periods=4, tz="US/Central"),
  24. pd.period_range('2000', periods=4),
  25. pd.timedelta_range(0, periods=4),
  26. ])
  27. def test_combine_datetlike_udf(self, data):
  28. # https://github.com/pandas-dev/pandas/issues/23079
  29. df = pd.DataFrame({"A": data})
  30. other = df.copy()
  31. df.iloc[1, 0] = None
  32. def combiner(a, b):
  33. return b
  34. result = df.combine(other, combiner)
  35. tm.assert_frame_equal(result, other)
  36. def test_concat_multiple_tzs(self):
  37. # GH 12467
  38. # combining datetime tz-aware and naive DataFrames
  39. ts1 = Timestamp('2015-01-01', tz=None)
  40. ts2 = Timestamp('2015-01-01', tz='UTC')
  41. ts3 = Timestamp('2015-01-01', tz='EST')
  42. df1 = DataFrame(dict(time=[ts1]))
  43. df2 = DataFrame(dict(time=[ts2]))
  44. df3 = DataFrame(dict(time=[ts3]))
  45. results = pd.concat([df1, df2]).reset_index(drop=True)
  46. expected = DataFrame(dict(time=[ts1, ts2]), dtype=object)
  47. assert_frame_equal(results, expected)
  48. results = pd.concat([df1, df3]).reset_index(drop=True)
  49. expected = DataFrame(dict(time=[ts1, ts3]), dtype=object)
  50. assert_frame_equal(results, expected)
  51. results = pd.concat([df2, df3]).reset_index(drop=True)
  52. expected = DataFrame(dict(time=[ts2, ts3]))
  53. assert_frame_equal(results, expected)
  54. @pytest.mark.parametrize(
  55. 't1',
  56. [
  57. '2015-01-01',
  58. pytest.param(pd.NaT, marks=pytest.mark.xfail(
  59. reason='GH23037 incorrect dtype when concatenating'))])
  60. def test_concat_tz_NaT(self, t1):
  61. # GH 22796
  62. # Concating tz-aware multicolumn DataFrames
  63. ts1 = Timestamp(t1, tz='UTC')
  64. ts2 = Timestamp('2015-01-01', tz='UTC')
  65. ts3 = Timestamp('2015-01-01', tz='UTC')
  66. df1 = DataFrame([[ts1, ts2]])
  67. df2 = DataFrame([[ts3]])
  68. result = pd.concat([df1, df2])
  69. expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0])
  70. assert_frame_equal(result, expected)
  71. def test_concat_tz_not_aligned(self):
  72. # GH 22796
  73. ts = pd.to_datetime([1, 2]).tz_localize("UTC")
  74. a = pd.DataFrame({"A": ts})
  75. b = pd.DataFrame({"A": ts, "B": ts})
  76. result = pd.concat([a, b], sort=True, ignore_index=True)
  77. expected = pd.DataFrame({"A": list(ts) + list(ts),
  78. "B": [pd.NaT, pd.NaT] + list(ts)})
  79. assert_frame_equal(result, expected)
  80. def test_concat_tuple_keys(self):
  81. # GH 14438
  82. df1 = pd.DataFrame(np.ones((2, 2)), columns=list('AB'))
  83. df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list('AB'))
  84. results = pd.concat((df1, df2), keys=[('bee', 'bah'), ('bee', 'boo')])
  85. expected = pd.DataFrame(
  86. {'A': {('bee', 'bah', 0): 1.0,
  87. ('bee', 'bah', 1): 1.0,
  88. ('bee', 'boo', 0): 2.0,
  89. ('bee', 'boo', 1): 2.0,
  90. ('bee', 'boo', 2): 2.0},
  91. 'B': {('bee', 'bah', 0): 1.0,
  92. ('bee', 'bah', 1): 1.0,
  93. ('bee', 'boo', 0): 2.0,
  94. ('bee', 'boo', 1): 2.0,
  95. ('bee', 'boo', 2): 2.0}})
  96. assert_frame_equal(results, expected)
  97. def test_append_series_dict(self):
  98. df = DataFrame(np.random.randn(5, 4),
  99. columns=['foo', 'bar', 'baz', 'qux'])
  100. series = df.loc[4]
  101. msg = 'Indexes have overlapping values'
  102. with pytest.raises(ValueError, match=msg):
  103. df.append(series, verify_integrity=True)
  104. series.name = None
  105. msg = 'Can only append a Series if ignore_index=True'
  106. with pytest.raises(TypeError, match=msg):
  107. df.append(series, verify_integrity=True)
  108. result = df.append(series[::-1], ignore_index=True)
  109. expected = df.append(DataFrame({0: series[::-1]}, index=df.columns).T,
  110. ignore_index=True)
  111. assert_frame_equal(result, expected)
  112. # dict
  113. result = df.append(series.to_dict(), ignore_index=True)
  114. assert_frame_equal(result, expected)
  115. result = df.append(series[::-1][:3], ignore_index=True)
  116. expected = df.append(DataFrame({0: series[::-1][:3]}).T,
  117. ignore_index=True, sort=True)
  118. assert_frame_equal(result, expected.loc[:, result.columns])
  119. # can append when name set
  120. row = df.loc[4]
  121. row.name = 5
  122. result = df.append(row)
  123. expected = df.append(df[-1:], ignore_index=True)
  124. assert_frame_equal(result, expected)
  125. def test_append_list_of_series_dicts(self):
  126. df = DataFrame(np.random.randn(5, 4),
  127. columns=['foo', 'bar', 'baz', 'qux'])
  128. dicts = [x.to_dict() for idx, x in df.iterrows()]
  129. result = df.append(dicts, ignore_index=True)
  130. expected = df.append(df, ignore_index=True)
  131. assert_frame_equal(result, expected)
  132. # different columns
  133. dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
  134. {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
  135. result = df.append(dicts, ignore_index=True, sort=True)
  136. expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
  137. assert_frame_equal(result, expected)
  138. def test_append_empty_dataframe(self):
  139. # Empty df append empty df
  140. df1 = DataFrame([])
  141. df2 = DataFrame([])
  142. result = df1.append(df2)
  143. expected = df1.copy()
  144. assert_frame_equal(result, expected)
  145. # Non-empty df append empty df
  146. df1 = DataFrame(np.random.randn(5, 2))
  147. df2 = DataFrame()
  148. result = df1.append(df2)
  149. expected = df1.copy()
  150. assert_frame_equal(result, expected)
  151. # Empty df with columns append empty df
  152. df1 = DataFrame(columns=['bar', 'foo'])
  153. df2 = DataFrame()
  154. result = df1.append(df2)
  155. expected = df1.copy()
  156. assert_frame_equal(result, expected)
  157. # Non-Empty df with columns append empty df
  158. df1 = DataFrame(np.random.randn(5, 2), columns=['bar', 'foo'])
  159. df2 = DataFrame()
  160. result = df1.append(df2)
  161. expected = df1.copy()
  162. assert_frame_equal(result, expected)
  163. def test_append_dtypes(self):
  164. # GH 5754
  165. # row appends of different dtypes (so need to do by-item)
  166. # can sometimes infer the correct type
  167. df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(5))
  168. df2 = DataFrame()
  169. result = df1.append(df2)
  170. expected = df1.copy()
  171. assert_frame_equal(result, expected)
  172. df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
  173. df2 = DataFrame({'bar': 'foo'}, index=lrange(1, 2))
  174. result = df1.append(df2)
  175. expected = DataFrame({'bar': [Timestamp('20130101'), 'foo']})
  176. assert_frame_equal(result, expected)
  177. df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
  178. df2 = DataFrame({'bar': np.nan}, index=lrange(1, 2))
  179. result = df1.append(df2)
  180. expected = DataFrame(
  181. {'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')})
  182. assert_frame_equal(result, expected)
  183. df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
  184. df2 = DataFrame({'bar': np.nan}, index=lrange(1, 2), dtype=object)
  185. result = df1.append(df2)
  186. expected = DataFrame(
  187. {'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')})
  188. assert_frame_equal(result, expected)
  189. df1 = DataFrame({'bar': np.nan}, index=lrange(1))
  190. df2 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1, 2))
  191. result = df1.append(df2)
  192. expected = DataFrame(
  193. {'bar': Series([np.nan, Timestamp('20130101')], dtype='M8[ns]')})
  194. assert_frame_equal(result, expected)
  195. df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
  196. df2 = DataFrame({'bar': 1}, index=lrange(1, 2), dtype=object)
  197. result = df1.append(df2)
  198. expected = DataFrame({'bar': Series([Timestamp('20130101'), 1])})
  199. assert_frame_equal(result, expected)
  200. def test_update(self):
  201. df = DataFrame([[1.5, np.nan, 3.],
  202. [1.5, np.nan, 3.],
  203. [1.5, np.nan, 3],
  204. [1.5, np.nan, 3]])
  205. other = DataFrame([[3.6, 2., np.nan],
  206. [np.nan, np.nan, 7]], index=[1, 3])
  207. df.update(other)
  208. expected = DataFrame([[1.5, np.nan, 3],
  209. [3.6, 2, 3],
  210. [1.5, np.nan, 3],
  211. [1.5, np.nan, 7.]])
  212. assert_frame_equal(df, expected)
  213. def test_update_dtypes(self):
  214. # gh 3016
  215. df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
  216. columns=['A', 'B', 'bool1', 'bool2'])
  217. other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
  218. df.update(other)
  219. expected = DataFrame([[45., 45., False, True], [4., 5., True, False]],
  220. columns=['A', 'B', 'bool1', 'bool2'])
  221. assert_frame_equal(df, expected)
  222. def test_update_nooverwrite(self):
  223. df = DataFrame([[1.5, np.nan, 3.],
  224. [1.5, np.nan, 3.],
  225. [1.5, np.nan, 3],
  226. [1.5, np.nan, 3]])
  227. other = DataFrame([[3.6, 2., np.nan],
  228. [np.nan, np.nan, 7]], index=[1, 3])
  229. df.update(other, overwrite=False)
  230. expected = DataFrame([[1.5, np.nan, 3],
  231. [1.5, 2, 3],
  232. [1.5, np.nan, 3],
  233. [1.5, np.nan, 3.]])
  234. assert_frame_equal(df, expected)
  235. def test_update_filtered(self):
  236. df = DataFrame([[1.5, np.nan, 3.],
  237. [1.5, np.nan, 3.],
  238. [1.5, np.nan, 3],
  239. [1.5, np.nan, 3]])
  240. other = DataFrame([[3.6, 2., np.nan],
  241. [np.nan, np.nan, 7]], index=[1, 3])
  242. df.update(other, filter_func=lambda x: x > 2)
  243. expected = DataFrame([[1.5, np.nan, 3],
  244. [1.5, np.nan, 3],
  245. [1.5, np.nan, 3],
  246. [1.5, np.nan, 7.]])
  247. assert_frame_equal(df, expected)
  248. @pytest.mark.parametrize('bad_kwarg, exception, msg', [
  249. # errors must be 'ignore' or 'raise'
  250. ({'errors': 'something'}, ValueError, 'The parameter errors must.*'),
  251. ({'join': 'inner'}, NotImplementedError, 'Only left join is supported')
  252. ])
  253. def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
  254. df = DataFrame([[1.5, 1, 3.]])
  255. with pytest.raises(exception, match=msg):
  256. df.update(df, **bad_kwarg)
  257. def test_update_raise_on_overlap(self):
  258. df = DataFrame([[1.5, 1, 3.],
  259. [1.5, np.nan, 3.],
  260. [1.5, np.nan, 3],
  261. [1.5, np.nan, 3]])
  262. other = DataFrame([[2., np.nan],
  263. [np.nan, 7]], index=[1, 3], columns=[1, 2])
  264. with pytest.raises(ValueError, match="Data overlaps"):
  265. df.update(other, errors='raise')
  266. @pytest.mark.parametrize('raise_conflict', [True, False])
  267. def test_update_deprecation(self, raise_conflict):
  268. df = DataFrame([[1.5, 1, 3.]])
  269. other = DataFrame()
  270. with tm.assert_produces_warning(FutureWarning):
  271. df.update(other, raise_conflict=raise_conflict)
  272. def test_update_from_non_df(self):
  273. d = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])}
  274. df = DataFrame(d)
  275. d['a'] = Series([5, 6, 7, 8])
  276. df.update(d)
  277. expected = DataFrame(d)
  278. assert_frame_equal(df, expected)
  279. d = {'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}
  280. df = DataFrame(d)
  281. d['a'] = [5, 6, 7, 8]
  282. df.update(d)
  283. expected = DataFrame(d)
  284. assert_frame_equal(df, expected)
  285. def test_join_str_datetime(self):
  286. str_dates = ['20120209', '20120222']
  287. dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]
  288. A = DataFrame(str_dates, index=lrange(2), columns=['aa'])
  289. C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)
  290. tst = A.join(C, on='aa')
  291. assert len(tst.columns) == 3
  292. def test_join_multiindex_leftright(self):
  293. # GH 10741
  294. df1 = (pd.DataFrame([['a', 'x', 0.471780], ['a', 'y', 0.774908],
  295. ['a', 'z', 0.563634], ['b', 'x', -0.353756],
  296. ['b', 'y', 0.368062], ['b', 'z', -1.721840],
  297. ['c', 'x', 1], ['c', 'y', 2], ['c', 'z', 3]],
  298. columns=['first', 'second', 'value1'])
  299. .set_index(['first', 'second']))
  300. df2 = (pd.DataFrame([['a', 10], ['b', 20]],
  301. columns=['first', 'value2'])
  302. .set_index(['first']))
  303. exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
  304. [-0.353756, 20], [0.368062, 20],
  305. [-1.721840, 20],
  306. [1.000000, np.nan], [2.000000, np.nan],
  307. [3.000000, np.nan]],
  308. index=df1.index, columns=['value1', 'value2'])
  309. # these must be the same results (but columns are flipped)
  310. assert_frame_equal(df1.join(df2, how='left'), exp)
  311. assert_frame_equal(df2.join(df1, how='right'),
  312. exp[['value2', 'value1']])
  313. exp_idx = pd.MultiIndex.from_product([['a', 'b'], ['x', 'y', 'z']],
  314. names=['first', 'second'])
  315. exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
  316. [-0.353756, 20], [0.368062, 20], [-1.721840, 20]],
  317. index=exp_idx, columns=['value1', 'value2'])
  318. assert_frame_equal(df1.join(df2, how='right'), exp)
  319. assert_frame_equal(df2.join(df1, how='left'),
  320. exp[['value2', 'value1']])
  321. def test_concat_named_keys(self):
  322. # GH 14252
  323. df = pd.DataFrame({'foo': [1, 2], 'bar': [0.1, 0.2]})
  324. index = Index(['a', 'b'], name='baz')
  325. concatted_named_from_keys = pd.concat([df, df], keys=index)
  326. expected_named = pd.DataFrame(
  327. {'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
  328. index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
  329. names=['baz', None]))
  330. assert_frame_equal(concatted_named_from_keys, expected_named)
  331. index_no_name = Index(['a', 'b'], name=None)
  332. concatted_named_from_names = pd.concat(
  333. [df, df], keys=index_no_name, names=['baz'])
  334. assert_frame_equal(concatted_named_from_names, expected_named)
  335. concatted_unnamed = pd.concat([df, df], keys=index_no_name)
  336. expected_unnamed = pd.DataFrame(
  337. {'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
  338. index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
  339. names=[None, None]))
  340. assert_frame_equal(concatted_unnamed, expected_unnamed)
  341. def test_concat_axis_parameter(self):
  342. # GH 14369
  343. df1 = pd.DataFrame({'A': [0.1, 0.2]}, index=range(2))
  344. df2 = pd.DataFrame({'A': [0.3, 0.4]}, index=range(2))
  345. # Index/row/0 DataFrame
  346. expected_index = pd.DataFrame(
  347. {'A': [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])
  348. concatted_index = pd.concat([df1, df2], axis='index')
  349. assert_frame_equal(concatted_index, expected_index)
  350. concatted_row = pd.concat([df1, df2], axis='rows')
  351. assert_frame_equal(concatted_row, expected_index)
  352. concatted_0 = pd.concat([df1, df2], axis=0)
  353. assert_frame_equal(concatted_0, expected_index)
  354. # Columns/1 DataFrame
  355. expected_columns = pd.DataFrame(
  356. [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=['A', 'A'])
  357. concatted_columns = pd.concat([df1, df2], axis='columns')
  358. assert_frame_equal(concatted_columns, expected_columns)
  359. concatted_1 = pd.concat([df1, df2], axis=1)
  360. assert_frame_equal(concatted_1, expected_columns)
  361. series1 = pd.Series([0.1, 0.2])
  362. series2 = pd.Series([0.3, 0.4])
  363. # Index/row/0 Series
  364. expected_index_series = pd.Series(
  365. [0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])
  366. concatted_index_series = pd.concat([series1, series2], axis='index')
  367. assert_series_equal(concatted_index_series, expected_index_series)
  368. concatted_row_series = pd.concat([series1, series2], axis='rows')
  369. assert_series_equal(concatted_row_series, expected_index_series)
  370. concatted_0_series = pd.concat([series1, series2], axis=0)
  371. assert_series_equal(concatted_0_series, expected_index_series)
  372. # Columns/1 Series
  373. expected_columns_series = pd.DataFrame(
  374. [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1])
  375. concatted_columns_series = pd.concat(
  376. [series1, series2], axis='columns')
  377. assert_frame_equal(concatted_columns_series, expected_columns_series)
  378. concatted_1_series = pd.concat([series1, series2], axis=1)
  379. assert_frame_equal(concatted_1_series, expected_columns_series)
  380. # Testing ValueError
  381. with pytest.raises(ValueError, match='No axis named'):
  382. pd.concat([series1, series2], axis='something')
  383. def test_concat_numerical_names(self):
  384. # #15262 # #12223
  385. df = pd.DataFrame({'col': range(9)},
  386. dtype='int32',
  387. index=(pd.MultiIndex
  388. .from_product([['A0', 'A1', 'A2'],
  389. ['B0', 'B1', 'B2']],
  390. names=[1, 2])))
  391. result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :]))
  392. expected = pd.DataFrame({'col': [0, 1, 7, 8]},
  393. dtype='int32',
  394. index=pd.MultiIndex.from_tuples([('A0', 'B0'),
  395. ('A0', 'B1'),
  396. ('A2', 'B1'),
  397. ('A2', 'B2')],
  398. names=[1, 2]))
  399. tm.assert_frame_equal(result, expected)
  400. class TestDataFrameCombineFirst(TestData):
  401. def test_combine_first_mixed(self):
  402. a = Series(['a', 'b'], index=lrange(2))
  403. b = Series(lrange(2), index=lrange(2))
  404. f = DataFrame({'A': a, 'B': b})
  405. a = Series(['a', 'b'], index=lrange(5, 7))
  406. b = Series(lrange(2), index=lrange(5, 7))
  407. g = DataFrame({'A': a, 'B': b})
  408. exp = pd.DataFrame({'A': list('abab'), 'B': [0., 1., 0., 1.]},
  409. index=[0, 1, 5, 6])
  410. combined = f.combine_first(g)
  411. tm.assert_frame_equal(combined, exp)
  412. def test_combine_first(self):
  413. # disjoint
  414. head, tail = self.frame[:5], self.frame[5:]
  415. combined = head.combine_first(tail)
  416. reordered_frame = self.frame.reindex(combined.index)
  417. assert_frame_equal(combined, reordered_frame)
  418. assert tm.equalContents(combined.columns, self.frame.columns)
  419. assert_series_equal(combined['A'], reordered_frame['A'])
  420. # same index
  421. fcopy = self.frame.copy()
  422. fcopy['A'] = 1
  423. del fcopy['C']
  424. fcopy2 = self.frame.copy()
  425. fcopy2['B'] = 0
  426. del fcopy2['D']
  427. combined = fcopy.combine_first(fcopy2)
  428. assert (combined['A'] == 1).all()
  429. assert_series_equal(combined['B'], fcopy['B'])
  430. assert_series_equal(combined['C'], fcopy2['C'])
  431. assert_series_equal(combined['D'], fcopy['D'])
  432. # overlap
  433. head, tail = reordered_frame[:10].copy(), reordered_frame
  434. head['A'] = 1
  435. combined = head.combine_first(tail)
  436. assert (combined['A'][:10] == 1).all()
  437. # reverse overlap
  438. tail['A'][:10] = 0
  439. combined = tail.combine_first(head)
  440. assert (combined['A'][:10] == 0).all()
  441. # no overlap
  442. f = self.frame[:10]
  443. g = self.frame[10:]
  444. combined = f.combine_first(g)
  445. assert_series_equal(combined['A'].reindex(f.index), f['A'])
  446. assert_series_equal(combined['A'].reindex(g.index), g['A'])
  447. # corner cases
  448. comb = self.frame.combine_first(self.empty)
  449. assert_frame_equal(comb, self.frame)
  450. comb = self.empty.combine_first(self.frame)
  451. assert_frame_equal(comb, self.frame)
  452. comb = self.frame.combine_first(DataFrame(index=["faz", "boo"]))
  453. assert "faz" in comb.index
  454. # #2525
  455. df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)])
  456. df2 = DataFrame({}, columns=['b'])
  457. result = df.combine_first(df2)
  458. assert 'b' in result
  459. def test_combine_first_mixed_bug(self):
  460. idx = Index(['a', 'b', 'c', 'e'])
  461. ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
  462. ser2 = Series(['a', 'b', 'c', 'e'], index=idx)
  463. ser3 = Series([12, 4, 5, 97], index=idx)
  464. frame1 = DataFrame({"col0": ser1,
  465. "col2": ser2,
  466. "col3": ser3})
  467. idx = Index(['a', 'b', 'c', 'f'])
  468. ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
  469. ser2 = Series(['a', 'b', 'c', 'f'], index=idx)
  470. ser3 = Series([12, 4, 5, 97], index=idx)
  471. frame2 = DataFrame({"col1": ser1,
  472. "col2": ser2,
  473. "col5": ser3})
  474. combined = frame1.combine_first(frame2)
  475. assert len(combined.columns) == 5
  476. # gh 3016 (same as in update)
  477. df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
  478. columns=['A', 'B', 'bool1', 'bool2'])
  479. other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
  480. result = df.combine_first(other)
  481. assert_frame_equal(result, df)
  482. df.loc[0, 'A'] = np.nan
  483. result = df.combine_first(other)
  484. df.loc[0, 'A'] = 45
  485. assert_frame_equal(result, df)
  486. # doc example
  487. df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan],
  488. 'B': [np.nan, 2., 3., np.nan, 6.]})
  489. df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
  490. 'B': [np.nan, np.nan, 3., 4., 6., 8.]})
  491. result = df1.combine_first(df2)
  492. expected = DataFrame(
  493. {'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]})
  494. assert_frame_equal(result, expected)
  495. # GH3552, return object dtype with bools
  496. df1 = DataFrame(
  497. [[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]])
  498. df2 = DataFrame(
  499. [[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2])
  500. result = df1.combine_first(df2)[2]
  501. expected = Series([True, True, False], name=2)
  502. assert_series_equal(result, expected)
  503. # GH 3593, converting datetime64[ns] incorrecly
  504. df0 = DataFrame({"a": [datetime(2000, 1, 1),
  505. datetime(2000, 1, 2),
  506. datetime(2000, 1, 3)]})
  507. df1 = DataFrame({"a": [None, None, None]})
  508. df2 = df1.combine_first(df0)
  509. assert_frame_equal(df2, df0)
  510. df2 = df0.combine_first(df1)
  511. assert_frame_equal(df2, df0)
  512. df0 = DataFrame({"a": [datetime(2000, 1, 1),
  513. datetime(2000, 1, 2),
  514. datetime(2000, 1, 3)]})
  515. df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
  516. df2 = df1.combine_first(df0)
  517. result = df0.copy()
  518. result.iloc[0, :] = df1.iloc[0, :]
  519. assert_frame_equal(df2, result)
  520. df2 = df0.combine_first(df1)
  521. assert_frame_equal(df2, df0)
  522. def test_combine_first_align_nan(self):
  523. # GH 7509 (not fixed)
  524. dfa = pd.DataFrame([[pd.Timestamp('2011-01-01'), 2]],
  525. columns=['a', 'b'])
  526. dfb = pd.DataFrame([[4], [5]], columns=['b'])
  527. assert dfa['a'].dtype == 'datetime64[ns]'
  528. assert dfa['b'].dtype == 'int64'
  529. res = dfa.combine_first(dfb)
  530. exp = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), pd.NaT],
  531. 'b': [2., 5.]}, columns=['a', 'b'])
  532. tm.assert_frame_equal(res, exp)
  533. assert res['a'].dtype == 'datetime64[ns]'
  534. # ToDo: this must be int64
  535. assert res['b'].dtype == 'float64'
  536. res = dfa.iloc[:0].combine_first(dfb)
  537. exp = pd.DataFrame({'a': [np.nan, np.nan],
  538. 'b': [4, 5]}, columns=['a', 'b'])
  539. tm.assert_frame_equal(res, exp)
  540. # ToDo: this must be datetime64
  541. assert res['a'].dtype == 'float64'
  542. # ToDo: this must be int64
  543. assert res['b'].dtype == 'int64'
  544. def test_combine_first_timezone(self):
  545. # see gh-7630
  546. data1 = pd.to_datetime('20100101 01:01').tz_localize('UTC')
  547. df1 = pd.DataFrame(columns=['UTCdatetime', 'abc'],
  548. data=data1,
  549. index=pd.date_range('20140627', periods=1))
  550. data2 = pd.to_datetime('20121212 12:12').tz_localize('UTC')
  551. df2 = pd.DataFrame(columns=['UTCdatetime', 'xyz'],
  552. data=data2,
  553. index=pd.date_range('20140628', periods=1))
  554. res = df2[['UTCdatetime']].combine_first(df1)
  555. exp = pd.DataFrame({'UTCdatetime': [pd.Timestamp('2010-01-01 01:01',
  556. tz='UTC'),
  557. pd.Timestamp('2012-12-12 12:12',
  558. tz='UTC')],
  559. 'abc': [pd.Timestamp('2010-01-01 01:01:00',
  560. tz='UTC'), pd.NaT]},
  561. columns=['UTCdatetime', 'abc'],
  562. index=pd.date_range('20140627', periods=2,
  563. freq='D'))
  564. tm.assert_frame_equal(res, exp)
  565. assert res['UTCdatetime'].dtype == 'datetime64[ns, UTC]'
  566. assert res['abc'].dtype == 'datetime64[ns, UTC]'
  567. # see gh-10567
  568. dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='UTC')
  569. df1 = pd.DataFrame({'DATE': dts1})
  570. dts2 = pd.date_range('2015-01-03', '2015-01-05', tz='UTC')
  571. df2 = pd.DataFrame({'DATE': dts2})
  572. res = df1.combine_first(df2)
  573. tm.assert_frame_equal(res, df1)
  574. assert res['DATE'].dtype == 'datetime64[ns, UTC]'
  575. dts1 = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03',
  576. '2011-01-04'], tz='US/Eastern')
  577. df1 = pd.DataFrame({'DATE': dts1}, index=[1, 3, 5, 7])
  578. dts2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02',
  579. '2012-01-03'], tz='US/Eastern')
  580. df2 = pd.DataFrame({'DATE': dts2}, index=[2, 4, 5])
  581. res = df1.combine_first(df2)
  582. exp_dts = pd.DatetimeIndex(['2011-01-01', '2012-01-01', 'NaT',
  583. '2012-01-02', '2011-01-03', '2011-01-04'],
  584. tz='US/Eastern')
  585. exp = pd.DataFrame({'DATE': exp_dts}, index=[1, 2, 3, 4, 5, 7])
  586. tm.assert_frame_equal(res, exp)
  587. # different tz
  588. dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='US/Eastern')
  589. df1 = pd.DataFrame({'DATE': dts1})
  590. dts2 = pd.date_range('2015-01-03', '2015-01-05')
  591. df2 = pd.DataFrame({'DATE': dts2})
  592. # if df1 doesn't have NaN, keep its dtype
  593. res = df1.combine_first(df2)
  594. tm.assert_frame_equal(res, df1)
  595. assert res['DATE'].dtype == 'datetime64[ns, US/Eastern]'
  596. dts1 = pd.date_range('2015-01-01', '2015-01-02', tz='US/Eastern')
  597. df1 = pd.DataFrame({'DATE': dts1})
  598. dts2 = pd.date_range('2015-01-01', '2015-01-03')
  599. df2 = pd.DataFrame({'DATE': dts2})
  600. res = df1.combine_first(df2)
  601. exp_dts = [pd.Timestamp('2015-01-01', tz='US/Eastern'),
  602. pd.Timestamp('2015-01-02', tz='US/Eastern'),
  603. pd.Timestamp('2015-01-03')]
  604. exp = pd.DataFrame({'DATE': exp_dts})
  605. tm.assert_frame_equal(res, exp)
  606. assert res['DATE'].dtype == 'object'
  607. def test_combine_first_timedelta(self):
  608. data1 = pd.TimedeltaIndex(['1 day', 'NaT', '3 day', '4day'])
  609. df1 = pd.DataFrame({'TD': data1}, index=[1, 3, 5, 7])
  610. data2 = pd.TimedeltaIndex(['10 day', '11 day', '12 day'])
  611. df2 = pd.DataFrame({'TD': data2}, index=[2, 4, 5])
  612. res = df1.combine_first(df2)
  613. exp_dts = pd.TimedeltaIndex(['1 day', '10 day', 'NaT',
  614. '11 day', '3 day', '4 day'])
  615. exp = pd.DataFrame({'TD': exp_dts}, index=[1, 2, 3, 4, 5, 7])
  616. tm.assert_frame_equal(res, exp)
  617. assert res['TD'].dtype == 'timedelta64[ns]'
  618. def test_combine_first_period(self):
  619. data1 = pd.PeriodIndex(['2011-01', 'NaT', '2011-03',
  620. '2011-04'], freq='M')
  621. df1 = pd.DataFrame({'P': data1}, index=[1, 3, 5, 7])
  622. data2 = pd.PeriodIndex(['2012-01-01', '2012-02',
  623. '2012-03'], freq='M')
  624. df2 = pd.DataFrame({'P': data2}, index=[2, 4, 5])
  625. res = df1.combine_first(df2)
  626. exp_dts = pd.PeriodIndex(['2011-01', '2012-01', 'NaT',
  627. '2012-02', '2011-03', '2011-04'],
  628. freq='M')
  629. exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
  630. tm.assert_frame_equal(res, exp)
  631. assert res['P'].dtype == data1.dtype
  632. # different freq
  633. dts2 = pd.PeriodIndex(['2012-01-01', '2012-01-02',
  634. '2012-01-03'], freq='D')
  635. df2 = pd.DataFrame({'P': dts2}, index=[2, 4, 5])
  636. res = df1.combine_first(df2)
  637. exp_dts = [pd.Period('2011-01', freq='M'),
  638. pd.Period('2012-01-01', freq='D'),
  639. pd.NaT,
  640. pd.Period('2012-01-02', freq='D'),
  641. pd.Period('2011-03', freq='M'),
  642. pd.Period('2011-04', freq='M')]
  643. exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
  644. tm.assert_frame_equal(res, exp)
  645. assert res['P'].dtype == 'object'
  646. def test_combine_first_int(self):
  647. # GH14687 - integer series that do no align exactly
  648. df1 = pd.DataFrame({'a': [0, 1, 3, 5]}, dtype='int64')
  649. df2 = pd.DataFrame({'a': [1, 4]}, dtype='int64')
  650. res = df1.combine_first(df2)
  651. tm.assert_frame_equal(res, df1)
  652. assert res['a'].dtype == 'int64'
  653. @pytest.mark.parametrize("val", [1, 1.0])
  654. def test_combine_first_with_asymmetric_other(self, val):
  655. # see gh-20699
  656. df1 = pd.DataFrame({'isNum': [val]})
  657. df2 = pd.DataFrame({'isBool': [True]})
  658. res = df1.combine_first(df2)
  659. exp = pd.DataFrame({'isBool': [True], 'isNum': [val]})
  660. tm.assert_frame_equal(res, exp)
  661. def test_concat_datetime_datetime64_frame(self):
  662. # #2624
  663. rows = []
  664. rows.append([datetime(2010, 1, 1), 1])
  665. rows.append([datetime(2010, 1, 2), 'hi'])
  666. df2_obj = DataFrame.from_records(rows, columns=['date', 'test'])
  667. ind = date_range(start="2000/1/1", freq="D", periods=10)
  668. df1 = DataFrame({'date': ind, 'test': lrange(10)})
  669. # it works!
  670. pd.concat([df1, df2_obj])
  671. class TestDataFrameUpdate(TestData):
  672. def test_update_nan(self):
  673. # #15593 #15617
  674. # test 1
  675. df1 = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)})
  676. df2 = DataFrame({'A': [None, 2, 3]})
  677. expected = df1.copy()
  678. df1.update(df2, overwrite=False)
  679. tm.assert_frame_equal(df1, expected)
  680. # test 2
  681. df1 = DataFrame({'A': [1.0, None, 3],
  682. 'B': date_range('2000', periods=3)})
  683. df2 = DataFrame({'A': [None, 2, 3]})
  684. expected = DataFrame({'A': [1.0, 2, 3],
  685. 'B': date_range('2000', periods=3)})
  686. df1.update(df2, overwrite=False)
  687. tm.assert_frame_equal(df1, expected)