test_join.py

# pylint: disable=E1103
from warnings import catch_warnings

import numpy as np
from numpy.random import randn
import pytest

from pandas._libs import join as libjoin
import pandas.compat as compat
from pandas.compat import lrange

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, concat, merge
from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal

a_ = np.array


@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
class TestJoin(object):

    def setup_method(self, method):
        # aggregate multiple columns
        self.df = DataFrame({'key1': get_test_data(),
                             'key2': get_test_data(),
                             'data1': np.random.randn(N),
                             'data2': np.random.randn(N)})

        # exclude a couple keys for fun
        self.df = self.df[self.df['key2'] > 1]

        self.df2 = DataFrame({'key1': get_test_data(n=N // 5),
                              'key2': get_test_data(ngroups=NGROUPS // 2,
                                                    n=N // 5),
                              'value': np.random.randn(N // 5)})

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']},
                                index=data['C'])

    def test_cython_left_outer_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.left_outer_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                     6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                     4, 5, 4, 5, 4, 5, -1, -1])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_right_outer_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        rs, ls = libjoin.left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        #            0        1        1        1
        exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5,
                     #            2        2        4
                     6, 7, 8, 6, 7, 8, -1])
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3,
                     4, 4, 4, 5, 5, 5, 6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_inner_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.inner_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                     6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                     4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_left_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='left')

        joined_both = merge(self.df, self.df2)
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='left')

    def test_right_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='right')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='right')

        joined_both = merge(self.df, self.df2, how='right')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='right')

    def test_full_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='outer')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer')

        joined_both = merge(self.df, self.df2, how='outer')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='outer')

    def test_inner_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='inner')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner')

        joined_both = merge(self.df, self.df2, how='inner')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='inner')

    def test_handle_overlap(self):
        joined = merge(self.df, self.df2, on='key2',
                       suffixes=['.foo', '.bar'])

        assert 'key1.foo' in joined
        assert 'key1.bar' in joined

    def test_handle_overlap_arbitrary_key(self):
        joined = merge(self.df, self.df2,
                       left_on='key2', right_on='key1',
                       suffixes=['.foo', '.bar'])

        assert 'key1.foo' in joined
        assert 'key2.bar' in joined

    def test_join_on(self):
        target = self.target
        source = self.source

        merged = target.join(source, on='C')
        tm.assert_series_equal(merged['MergedA'], target['A'],
                               check_names=False)
        tm.assert_series_equal(merged['MergedD'], target['D'],
                               check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'],
                              'value': [0, 0, 1, 1, 2]})
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
                         columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2],
                         columns=['three'])
        joined = df_a.join(df_b, on='one')
        joined = joined.join(df_c, on='one')
        assert np.isnan(joined['two']['c'])
        assert np.isnan(joined['three']['c'])

        # merge column not present
        with pytest.raises(KeyError, match="^'E'$"):
            target.join(source, on='E')

        # overlap
        source_copy = source.copy()
        source_copy['A'] = 0
        msg = ("You are trying to merge on float64 and object columns. If"
               " you wish to proceed you should use pd.concat")
        with pytest.raises(ValueError, match=msg):
            target.join(source_copy, on='A')

    def test_join_on_fails_with_different_right_index(self):
        df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                        'b': np.random.randn(3)})
        df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                         'b': np.random.randn(10)},
                        index=tm.makeCustomIndex(10, 2))
        msg = (r'len\(left_on\) must equal the number of levels in the index'
               ' of "right"')
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, left_on='a', right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                        'b': np.random.randn(3)},
                       index=tm.makeCustomIndex(3, 2))
        df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                         'b': np.random.randn(10)})
        msg = (r'len\(right_on\) must equal the number of levels in the index'
               ' of "left"')
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on='b', left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                        'b': np.random.randn(3)})
        df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                         'b': np.random.randn(10)},
                        index=tm.makeCustomIndex(10, 2))
        msg = r"len\(right_on\) must equal len\(left_on\)"
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on='a', left_on=['a', 'b'])

    @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        # GH12081 - original issue
        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters
        df = DataFrame({'a': [1, 1]})

        msg = ("Can only merge Series or DataFrame objects, a {} was passed"
               .format(str(type(wrong_type))))
        with pytest.raises(TypeError, match=msg):
            merge(wrong_type, df, left_on='a', right_on='a')
        with pytest.raises(TypeError, match=msg):
            merge(df, wrong_type, left_on='a', right_on='a')

    def test_join_on_pass_vector(self):
        expected = self.target.join(self.source, on='C')
        del expected['C']

        join_col = self.target.pop('C')
        result = self.target.join(self.source, on=join_col)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on='C')
        for col in self.source:
            assert col in merged
            assert merged[col].isna().all()

        merged2 = self.target.join(self.source.reindex([]), on='C',
                                   how='inner')
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notna()]
        tm.assert_series_equal(joined['key'], expected['key'],
                               check_dtype=False)
        tm.assert_series_equal(joined['value'], expected['value'],
                               check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')

        assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        result = self.target.join(self.source['MergedA'], on='C')
        expected = self.target.join(self.source[['MergedA']], on='C')
        assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({'a': [1, 1]})
        ds = Series([2], index=[1], name='b')
        result = df.join(ds, on='a')
        expected = DataFrame({'a': [1, 1],
                              'b': [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'

        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.

        joined = df1.join(df2, how=join_type)
        expected = _join_by_hand(df1, df2, how=join_type)
        assert_frame_equal(joined, expected)

        joined = df2.join(df1, how=join_type)
        expected = _join_by_hand(df2, df1, how=join_type)
        assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(10),
                        columns=['A', 'B', 'C', 'D'])
        assert df1['B'].dtype == np.int64
        assert df1['D'].dtype == np.bool_

        df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(0, 10, 2),
                        columns=['A', 'B', 'C', 'D'])

        # overlap
        joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
        expected_columns = ['A_one', 'B_one', 'C_one', 'D_one',
                            'A_two', 'B_two', 'C_two', 'D_two']
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(randn(30, 2), columns=['a', 'b'])
        c = Series(randn(30))
        a['c'] = c
        d = DataFrame(randn(30, 1), columns=['q'])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6), index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6), index=index2,
                        columns=['var Y'])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how='outer')
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how='outer').sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self):
        key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
                'qux', 'snap']
        key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
                'three', 'one']

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2,
                          'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                  [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3), index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data, to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'], how='inner',
                         sort=False)

        expected2 = merge(to_join, data,
                          right_on=['key1', 'key2'], left_index=True,
                          how='inner', sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(to_join, data, right_on=['key1', 'key2'],
                          left_index=True, how='inner', sort=False)

        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic
        assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        # GH 2024
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
        other_df = DataFrame(
            [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
        other_df.set_index('a', inplace=True)
        # GH 9455, 12219
        with tm.assert_produces_warning(UserWarning):
            result = merge(new_df, other_df, left_index=True,
                           right_index=True)
        assert ('b', 'mean') in result
        assert 'b' in result

    def test_join_float64_float32(self):
        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
        b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes['a'] == 'float64'
        assert joined.dtypes['b'] == 'float64'
        assert joined.dtypes['c'] == 'float32'

        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c})
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
        assert rs.dtypes['a'] == 'int64'
        assert rs.dtypes['b'] == 'float64'
        assert rs.dtypes['c'] == 'float32'
        assert rs.dtypes['md'] == 'float32'

        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

        result = result.reset_index()
        expected = expected[result.columns]
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame(
            {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'three',
                              'two', 'two', 'one', 'three'],
                        'C': np.random.randn(8),
                        'D': np.random.randn(8)})
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2), name='TEST')
        inner = df.join(s, how='inner')
        outer = df.join(s, how='outer')
        left = df.join(s, how='left')
        right = df.join(s, how='right')
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)

    def test_join_sort(self):
        left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                          'value': [1, 2, 3, 4]})
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        joined = left.join(right, on='key', sort=True)
        expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'],
                              'value': [2, 3, 1, 4],
                              'value2': ['a', 'b', 'c', 'c']},
                             index=[1, 2, 0, 3])
        assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on='key', sort=False)
        tm.assert_index_equal(joined.index, pd.Index(lrange(4)))

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
        df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame({'a': [1, 2, 3, 3, 4],
                              'b': [5, np.nan, 6, 7, np.nan]},
                             index=[1, 2, 3, 3, 'a'])
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
        df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]},
                             index=[1, 2, 2, 'a'])
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        # GH #16871
        index = pd.period_range('2016-01-01', periods=16, freq='M')
        df = DataFrame([i for i in range(len(index))],
                       index=index, columns=['pnum'])
        df2 = concat([df, df])
        result = df.join(df2, how='inner', rsuffix='_df2')
        expected = DataFrame(
            np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
            columns=['pnum', 'pnum_df2'], index=df2.sort_index().index)
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=['a', 'b', 'c', 'd', 'e', 'f'])
        df.insert(0, 'id', 0)
        df.insert(5, 'dt', 'foo')

        grouped = df.groupby('id')
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix='_right')

    def test_join_many(self):
        df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
        df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [df[['a', 'b']][:-2],
                   df[['c', 'd']][2:], df[['e', 'f']][1:9]]

        def _check_diff_index(df_list, result, exp_index):
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how='outer')
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how='inner')
        _check_diff_index(df_list, joined, df.index[2:8])

        msg = "Joining multiple DataFrames only supported for joining on index"
        with pytest.raises(ValueError, match=msg):
            df_list[0].join(df_list[1:], on='a')

    def test_join_many_mixed(self):
        df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
        df['key'] = ['foo', 'bar'] * 4
        df1 = df.loc[:, ['A', 'B']]
        df2 = df.loc[:, ['C', 'D']]
        df3 = df.loc[:, ['key']]

        result = df1.join([df2, df3])
        assert_frame_equal(result, df)

    def test_join_dups(self):
        # joining dups
        df = concat([DataFrame(np.random.randn(10, 4),
                               columns=['A', 'A', 'B', 'B']),
                     DataFrame(np.random.randint(0, 10, size=20)
                               .reshape(10, 2),
                               columns=['A', 'C'])],
                    axis=1)

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix='_2')
        result.columns = expected.columns
        assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True, right_index=True).merge(
            z, left_index=True, right_index=True, how="outer")
        dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = ['x_x', 'y_x', 'x_y',
                            'y_y', 'x_x', 'y_x', 'x_y', 'y_y']
        assert_frame_equal(dta, expected)

    def test_panel_join(self):
        with catch_warnings(record=True):
            panel = tm.makePanel()
            tm.add_nans(panel)

            p1 = panel.iloc[:2, :10, :3]
            p2 = panel.iloc[2:, 5:, 2:]

            # left join
            result = p1.join(p2)
            expected = p1.copy()
            expected['ItemC'] = p2['ItemC']
            tm.assert_panel_equal(result, expected)

            # right join
            result = p1.join(p2, how='right')
            expected = p2.copy()
            expected['ItemA'] = p1['ItemA']
            expected['ItemB'] = p1['ItemB']
            expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
            tm.assert_panel_equal(result, expected)

            # inner join
            result = p1.join(p2, how='inner')
            expected = panel.iloc[:, 5:10, 2:3]
            tm.assert_panel_equal(result, expected)

            # outer join
            result = p1.join(p2, how='outer')
            expected = p1.reindex(major=panel.major_axis,
                                  minor=panel.minor_axis)
            expected = expected.join(p2.reindex(major=panel.major_axis,
                                                minor=panel.minor_axis))
            tm.assert_panel_equal(result, expected)

    def test_panel_join_overlap(self):
        with catch_warnings(record=True):
            panel = tm.makePanel()
            tm.add_nans(panel)

            p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']]
            p2 = panel.loc[['ItemB', 'ItemC']]

            # Expected index is
            #
            # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
            joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')
            p1_suf = p1.loc[['ItemB', 'ItemC']].add_suffix('_p1')
            p2_suf = p2.loc[['ItemB', 'ItemC']].add_suffix('_p2')
            no_overlap = panel.loc[['ItemA']]
            expected = no_overlap.join(p1_suf.join(p2_suf))
            tm.assert_panel_equal(joined, expected)

    def test_panel_join_many(self):
        with catch_warnings(record=True):
            tm.K = 10
            panel = tm.makePanel()
            tm.K = 4

            panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]]

            joined = panels[0].join(panels[1:])
            tm.assert_panel_equal(joined, panel)

            panels = [panel.iloc[:2, :-5],
                      panel.iloc[2:6, 2:],
                      panel.iloc[6:, 5:-7]]

            data_dict = {}
            for p in panels:
                data_dict.update(p.iteritems())

            joined = panels[0].join(panels[1:], how='inner')
            expected = pd.Panel.from_dict(data_dict, intersect=True)
            tm.assert_panel_equal(joined, expected)

            joined = panels[0].join(panels[1:], how='outer')
            expected = pd.Panel.from_dict(data_dict, intersect=False)
            tm.assert_panel_equal(joined, expected)

            # edge cases
            msg = "Suffixes not supported when passing multiple panels"
            with pytest.raises(ValueError, match=msg):
                panels[0].join(panels[1:], how='outer', lsuffix='foo',
                               rsuffix='bar')
            msg = "Right join not supported with multiple panels"
            with pytest.raises(ValueError, match=msg):
                panels[0].join(panels[1:], how='right')

    def test_join_multi_to_multi(self, join_type):
        # GH 20475
        leftindex = MultiIndex.from_product([list('abc'), list('xy'), [1, 2]],
                                            names=['abc', 'xy', 'num'])
        left = DataFrame({'v1': range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product([list('abc'), list('xy')],
                                             names=['abc', 'xy'])
        right = DataFrame({'v2': [100 * i for i in range(1, 7)]},
                          index=rightindex)

        result = left.join(right, on=['abc', 'xy'], how=join_type)
        expected = (left.reset_index()
                        .merge(right.reset_index(),
                               on=['abc', 'xy'], how=join_type)
                        .set_index(['abc', 'xy', 'num'])
                    )
        assert_frame_equal(expected, result)

        msg = (r'len\(left_on\) must equal the number of levels in the index'
               ' of "right"')
        with pytest.raises(ValueError, match=msg):
            left.join(right, on='xy', how=join_type)

        with pytest.raises(ValueError, match=msg):
            right.join(left, on=['abc', 'xy'], how=join_type)

    def test_join_on_tz_aware_datetimeindex(self):
        # GH 23931
        df1 = pd.DataFrame(
            {
                'date': pd.date_range(start='2018-01-01', periods=5,
                                      tz='America/Chicago'),
                'vals': list('abcde')
            }
        )

        df2 = pd.DataFrame(
            {
                'date': pd.date_range(start='2018-01-03', periods=5,
                                      tz='America/Chicago'),
                'vals_2': list('tuvwx')
            }
        )
        result = df1.join(df2.set_index('date'), on='date')
        expected = df1.copy()
        expected['vals_2'] = pd.Series([np.nan] * len(expected), dtype=object)
        assert_frame_equal(result, expected)


def _check_join(left, right, result, join_col, how='left',
                lsuffix='_x', rsuffix='_y'):

    # some smoke tests
    for c in join_col:
        assert(result[c].notna().all())

    left_grouped = left.groupby(join_col)
    right_grouped = right.groupby(join_col)

    for group_key, group in result.groupby(join_col):
        l_joined = _restrict_to_columns(group, left.columns, lsuffix)
        r_joined = _restrict_to_columns(group, right.columns, rsuffix)

        try:
            lgroup = left_grouped.get_group(group_key)
        except KeyError:
            if how in ('left', 'inner'):
                raise AssertionError('key %s should not have been in the join'
                                     % str(group_key))

            _assert_all_na(l_joined, left.columns, join_col)
        else:
            _assert_same_contents(l_joined, lgroup)

        try:
            rgroup = right_grouped.get_group(group_key)
        except KeyError:
            if how in ('right', 'inner'):
                raise AssertionError('key %s should not have been in the join'
                                     % str(group_key))

            _assert_all_na(r_joined, right.columns, join_col)
        else:
            _assert_same_contents(r_joined, rgroup)


def _restrict_to_columns(group, columns, suffix):
    found = [c for c in group.columns
             if c in columns or c.replace(suffix, '') in columns]

    # filter
    group = group.loc[:, found]

    # get rid of suffixes, if any
    group = group.rename(columns=lambda x: x.replace(suffix, ''))

    # put in the right order...
    group = group.loc[:, columns]

    return group


def _assert_same_contents(join_chunk, source):
    NA_SENTINEL = -1234567  # drop_duplicates not so NA-friendly...

    jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
    svalues = source.fillna(NA_SENTINEL).drop_duplicates().values

    rows = {tuple(row) for row in jvalues}
    assert(len(rows) == len(source))
    assert(all(tuple(row) in rows for row in svalues))


def _assert_all_na(join_chunk, source_columns, join_col):
    for c in source_columns:
        if c in join_col:
            continue
        assert(join_chunk[c].isna().all())


def _join_by_hand(a, b, how='left'):
    join_index = a.index.join(b.index, how=how)

    a_re = a.reindex(join_index)
    b_re = b.reindex(join_index)

    result_columns = a.columns.append(b.columns)
    for col, s in compat.iteritems(b_re):
        a_re[col] = s

    return a_re.reindex(columns=result_columns)