test_multi.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668
  1. # pylint: disable=E1103
  2. from collections import OrderedDict
  3. import numpy as np
  4. from numpy import nan
  5. from numpy.random import randn
  6. import pytest
  7. import pandas as pd
  8. from pandas import DataFrame, Index, MultiIndex, Series
  9. from pandas.core.reshape.concat import concat
  10. from pandas.core.reshape.merge import merge
  11. import pandas.util.testing as tm
  12. @pytest.fixture
  13. def left():
  14. """left dataframe (not multi-indexed) for multi-index join tests"""
  15. # a little relevant example with NAs
  16. key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
  17. 'qux', 'snap']
  18. key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
  19. 'three', 'one']
  20. data = np.random.randn(len(key1))
  21. return DataFrame({'key1': key1, 'key2': key2, 'data': data})
  22. @pytest.fixture
  23. def right():
  24. """right dataframe (multi-indexed) for multi-index join tests"""
  25. index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
  26. ['one', 'two', 'three']],
  27. codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
  28. [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
  29. names=['key1', 'key2'])
  30. return DataFrame(np.random.randn(10, 3), index=index,
  31. columns=['j_one', 'j_two', 'j_three'])
  32. @pytest.fixture
  33. def left_multi():
  34. return (
  35. DataFrame(
  36. dict(Origin=['A', 'A', 'B', 'B', 'C'],
  37. Destination=['A', 'B', 'A', 'C', 'A'],
  38. Period=['AM', 'AM', 'IP', 'AM', 'OP'],
  39. TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
  40. Trips=[1987, 3647, 2470, 4296, 4444]),
  41. columns=['Origin', 'Destination', 'Period',
  42. 'TripPurp', 'Trips'])
  43. .set_index(['Origin', 'Destination', 'Period', 'TripPurp']))
  44. @pytest.fixture
  45. def right_multi():
  46. return (
  47. DataFrame(
  48. dict(Origin=['A', 'A', 'B', 'B', 'C', 'C', 'E'],
  49. Destination=['A', 'B', 'A', 'B', 'A', 'B', 'F'],
  50. Period=['AM', 'AM', 'IP', 'AM', 'OP', 'IP', 'AM'],
  51. LinkType=['a', 'b', 'c', 'b', 'a', 'b', 'a'],
  52. Distance=[100, 80, 90, 80, 75, 35, 55]),
  53. columns=['Origin', 'Destination', 'Period',
  54. 'LinkType', 'Distance'])
  55. .set_index(['Origin', 'Destination', 'Period', 'LinkType']))
  56. @pytest.fixture
  57. def on_cols_multi():
  58. return ['Origin', 'Destination', 'Period']
  59. @pytest.fixture
  60. def idx_cols_multi():
  61. return ['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType']
  62. class TestMergeMulti(object):
  63. def setup_method(self):
  64. self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
  65. ['one', 'two', 'three']],
  66. codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
  67. [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
  68. names=['first', 'second'])
  69. self.to_join = DataFrame(np.random.randn(10, 3), index=self.index,
  70. columns=['j_one', 'j_two', 'j_three'])
  71. # a little relevant example with NAs
  72. key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
  73. 'qux', 'snap']
  74. key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
  75. 'three', 'one']
  76. data = np.random.randn(len(key1))
  77. self.data = DataFrame({'key1': key1, 'key2': key2,
  78. 'data': data})
  79. def test_merge_on_multikey(self, left, right, join_type):
  80. on_cols = ['key1', 'key2']
  81. result = (left.join(right, on=on_cols, how=join_type)
  82. .reset_index(drop=True))
  83. expected = pd.merge(left, right.reset_index(),
  84. on=on_cols, how=join_type)
  85. tm.assert_frame_equal(result, expected)
  86. result = (left.join(right, on=on_cols, how=join_type, sort=True)
  87. .reset_index(drop=True))
  88. expected = pd.merge(left, right.reset_index(),
  89. on=on_cols, how=join_type, sort=True)
  90. tm.assert_frame_equal(result, expected)
  91. @pytest.mark.parametrize("sort", [False, True])
  92. def test_left_join_multi_index(self, left, right, sort):
  93. icols = ['1st', '2nd', '3rd']
  94. def bind_cols(df):
  95. iord = lambda a: 0 if a != a else ord(a)
  96. f = lambda ts: ts.map(iord) - ord('a')
  97. return (f(df['1st']) + f(df['3rd']) * 1e2 +
  98. df['2nd'].fillna(0) * 1e4)
  99. def run_asserts(left, right, sort):
  100. res = left.join(right, on=icols, how='left', sort=sort)
  101. assert len(left) < len(res) + 1
  102. assert not res['4th'].isna().any()
  103. assert not res['5th'].isna().any()
  104. tm.assert_series_equal(
  105. res['4th'], - res['5th'], check_names=False)
  106. result = bind_cols(res.iloc[:, :-2])
  107. tm.assert_series_equal(res['4th'], result, check_names=False)
  108. assert result.name is None
  109. if sort:
  110. tm.assert_frame_equal(
  111. res, res.sort_values(icols, kind='mergesort'))
  112. out = merge(left, right.reset_index(), on=icols,
  113. sort=sort, how='left')
  114. res.index = np.arange(len(res))
  115. tm.assert_frame_equal(out, res)
  116. lc = list(map(chr, np.arange(ord('a'), ord('z') + 1)))
  117. left = DataFrame(np.random.choice(lc, (5000, 2)),
  118. columns=['1st', '3rd'])
  119. left.insert(1, '2nd', np.random.randint(0, 1000, len(left)))
  120. i = np.random.permutation(len(left))
  121. right = left.iloc[i].copy()
  122. left['4th'] = bind_cols(left)
  123. right['5th'] = - bind_cols(right)
  124. right.set_index(icols, inplace=True)
  125. run_asserts(left, right, sort)
  126. # inject some nulls
  127. left.loc[1::23, '1st'] = np.nan
  128. left.loc[2::37, '2nd'] = np.nan
  129. left.loc[3::43, '3rd'] = np.nan
  130. left['4th'] = bind_cols(left)
  131. i = np.random.permutation(len(left))
  132. right = left.iloc[i, :-1]
  133. right['5th'] = - bind_cols(right)
  134. right.set_index(icols, inplace=True)
  135. run_asserts(left, right, sort)
  136. @pytest.mark.parametrize("sort", [False, True])
  137. def test_merge_right_vs_left(self, left, right, sort):
  138. # compare left vs right merge with multikey
  139. on_cols = ['key1', 'key2']
  140. merged_left_right = left.merge(right,
  141. left_on=on_cols, right_index=True,
  142. how='left', sort=sort)
  143. merge_right_left = right.merge(left,
  144. right_on=on_cols, left_index=True,
  145. how='right', sort=sort)
  146. # Reorder columns
  147. merge_right_left = merge_right_left[merged_left_right.columns]
  148. tm.assert_frame_equal(merged_left_right, merge_right_left)
  149. def test_compress_group_combinations(self):
  150. # ~ 40000000 possible unique groups
  151. key1 = tm.rands_array(10, 10000)
  152. key1 = np.tile(key1, 2)
  153. key2 = key1[::-1]
  154. df = DataFrame({'key1': key1, 'key2': key2,
  155. 'value1': np.random.randn(20000)})
  156. df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2],
  157. 'value2': np.random.randn(10000)})
  158. # just to hit the label compression code path
  159. merge(df, df2, how='outer')
  160. def test_left_join_index_preserve_order(self):
  161. on_cols = ['k1', 'k2']
  162. left = DataFrame({'k1': [0, 1, 2] * 8,
  163. 'k2': ['foo', 'bar'] * 12,
  164. 'v': np.array(np.arange(24), dtype=np.int64)})
  165. index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
  166. right = DataFrame({'v2': [5, 7]}, index=index)
  167. result = left.join(right, on=on_cols)
  168. expected = left.copy()
  169. expected['v2'] = np.nan
  170. expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
  171. expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7
  172. tm.assert_frame_equal(result, expected)
  173. result.sort_values(on_cols, kind='mergesort', inplace=True)
  174. expected = left.join(right, on=on_cols, sort=True)
  175. tm.assert_frame_equal(result, expected)
  176. # test join with multi dtypes blocks
  177. left = DataFrame({'k1': [0, 1, 2] * 8,
  178. 'k2': ['foo', 'bar'] * 12,
  179. 'k3': np.array([0, 1, 2] * 8, dtype=np.float32),
  180. 'v': np.array(np.arange(24), dtype=np.int32)})
  181. index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
  182. right = DataFrame({'v2': [5, 7]}, index=index)
  183. result = left.join(right, on=on_cols)
  184. expected = left.copy()
  185. expected['v2'] = np.nan
  186. expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
  187. expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7
  188. tm.assert_frame_equal(result, expected)
  189. result = result.sort_values(on_cols, kind='mergesort')
  190. expected = left.join(right, on=on_cols, sort=True)
  191. tm.assert_frame_equal(result, expected)
  192. def test_left_join_index_multi_match_multiindex(self):
  193. left = DataFrame([
  194. ['X', 'Y', 'C', 'a'],
  195. ['W', 'Y', 'C', 'e'],
  196. ['V', 'Q', 'A', 'h'],
  197. ['V', 'R', 'D', 'i'],
  198. ['X', 'Y', 'D', 'b'],
  199. ['X', 'Y', 'A', 'c'],
  200. ['W', 'Q', 'B', 'f'],
  201. ['W', 'R', 'C', 'g'],
  202. ['V', 'Y', 'C', 'j'],
  203. ['X', 'Y', 'B', 'd']],
  204. columns=['cola', 'colb', 'colc', 'tag'],
  205. index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8])
  206. right = (DataFrame([
  207. ['W', 'R', 'C', 0],
  208. ['W', 'Q', 'B', 3],
  209. ['W', 'Q', 'B', 8],
  210. ['X', 'Y', 'A', 1],
  211. ['X', 'Y', 'A', 4],
  212. ['X', 'Y', 'B', 5],
  213. ['X', 'Y', 'C', 6],
  214. ['X', 'Y', 'C', 9],
  215. ['X', 'Q', 'C', -6],
  216. ['X', 'R', 'C', -9],
  217. ['V', 'Y', 'C', 7],
  218. ['V', 'R', 'D', 2],
  219. ['V', 'R', 'D', -1],
  220. ['V', 'Q', 'A', -3]],
  221. columns=['col1', 'col2', 'col3', 'val'])
  222. .set_index(['col1', 'col2', 'col3']))
  223. result = left.join(right, on=['cola', 'colb', 'colc'], how='left')
  224. expected = DataFrame([
  225. ['X', 'Y', 'C', 'a', 6],
  226. ['X', 'Y', 'C', 'a', 9],
  227. ['W', 'Y', 'C', 'e', nan],
  228. ['V', 'Q', 'A', 'h', -3],
  229. ['V', 'R', 'D', 'i', 2],
  230. ['V', 'R', 'D', 'i', -1],
  231. ['X', 'Y', 'D', 'b', nan],
  232. ['X', 'Y', 'A', 'c', 1],
  233. ['X', 'Y', 'A', 'c', 4],
  234. ['W', 'Q', 'B', 'f', 3],
  235. ['W', 'Q', 'B', 'f', 8],
  236. ['W', 'R', 'C', 'g', 0],
  237. ['V', 'Y', 'C', 'j', 7],
  238. ['X', 'Y', 'B', 'd', 5]],
  239. columns=['cola', 'colb', 'colc', 'tag', 'val'],
  240. index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8])
  241. tm.assert_frame_equal(result, expected)
  242. result = left.join(right, on=['cola', 'colb', 'colc'],
  243. how='left', sort=True)
  244. expected = expected.sort_values(['cola', 'colb', 'colc'],
  245. kind='mergesort')
  246. tm.assert_frame_equal(result, expected)
  247. def test_left_join_index_multi_match(self):
  248. left = DataFrame([
  249. ['c', 0],
  250. ['b', 1],
  251. ['a', 2],
  252. ['b', 3]],
  253. columns=['tag', 'val'],
  254. index=[2, 0, 1, 3])
  255. right = (DataFrame([
  256. ['a', 'v'],
  257. ['c', 'w'],
  258. ['c', 'x'],
  259. ['d', 'y'],
  260. ['a', 'z'],
  261. ['c', 'r'],
  262. ['e', 'q'],
  263. ['c', 's']],
  264. columns=['tag', 'char'])
  265. .set_index('tag'))
  266. result = left.join(right, on='tag', how='left')
  267. expected = DataFrame([
  268. ['c', 0, 'w'],
  269. ['c', 0, 'x'],
  270. ['c', 0, 'r'],
  271. ['c', 0, 's'],
  272. ['b', 1, nan],
  273. ['a', 2, 'v'],
  274. ['a', 2, 'z'],
  275. ['b', 3, nan]],
  276. columns=['tag', 'val', 'char'],
  277. index=[2, 2, 2, 2, 0, 1, 1, 3])
  278. tm.assert_frame_equal(result, expected)
  279. result = left.join(right, on='tag', how='left', sort=True)
  280. expected2 = expected.sort_values('tag', kind='mergesort')
  281. tm.assert_frame_equal(result, expected2)
  282. # GH7331 - maintain left frame order in left merge
  283. result = merge(left, right.reset_index(), how='left', on='tag')
  284. expected.index = np.arange(len(expected))
  285. tm.assert_frame_equal(result, expected)
  286. def test_left_merge_na_buglet(self):
  287. left = DataFrame({'id': list('abcde'), 'v1': randn(5),
  288. 'v2': randn(5), 'dummy': list('abcde'),
  289. 'v3': randn(5)},
  290. columns=['id', 'v1', 'v2', 'dummy', 'v3'])
  291. right = DataFrame({'id': ['a', 'b', np.nan, np.nan, np.nan],
  292. 'sv3': [1.234, 5.678, np.nan, np.nan, np.nan]})
  293. result = merge(left, right, on='id', how='left')
  294. rdf = right.drop(['id'], axis=1)
  295. expected = left.join(rdf)
  296. tm.assert_frame_equal(result, expected)
  297. def test_merge_na_keys(self):
  298. data = [[1950, "A", 1.5],
  299. [1950, "B", 1.5],
  300. [1955, "B", 1.5],
  301. [1960, "B", np.nan],
  302. [1970, "B", 4.],
  303. [1950, "C", 4.],
  304. [1960, "C", np.nan],
  305. [1965, "C", 3.],
  306. [1970, "C", 4.]]
  307. frame = DataFrame(data, columns=["year", "panel", "data"])
  308. other_data = [[1960, 'A', np.nan],
  309. [1970, 'A', np.nan],
  310. [1955, 'A', np.nan],
  311. [1965, 'A', np.nan],
  312. [1965, 'B', np.nan],
  313. [1955, 'C', np.nan]]
  314. other = DataFrame(other_data, columns=['year', 'panel', 'data'])
  315. result = frame.merge(other, how='outer')
  316. expected = frame.fillna(-999).merge(other.fillna(-999), how='outer')
  317. expected = expected.replace(-999, np.nan)
  318. tm.assert_frame_equal(result, expected)
  319. @pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
  320. def test_merge_datetime_index(self, klass):
  321. # see gh-19038
  322. df = DataFrame([1, 2, 3],
  323. ["2016-01-01", "2017-01-01", "2018-01-01"],
  324. columns=["a"])
  325. df.index = pd.to_datetime(df.index)
  326. on_vector = df.index.year
  327. if klass is not None:
  328. on_vector = klass(on_vector)
  329. expected = DataFrame(
  330. OrderedDict([
  331. ("a", [1, 2, 3]),
  332. ("key_1", [2016, 2017, 2018]),
  333. ])
  334. )
  335. result = df.merge(df, on=["a", on_vector], how="inner")
  336. tm.assert_frame_equal(result, expected)
  337. expected = DataFrame(
  338. OrderedDict([
  339. ("key_0", [2016, 2017, 2018]),
  340. ("a_x", [1, 2, 3]),
  341. ("a_y", [1, 2, 3]),
  342. ])
  343. )
  344. result = df.merge(df, on=[df.index.year], how="inner")
  345. tm.assert_frame_equal(result, expected)
  346. def test_join_multi_levels(self):
  347. # GH 3662
  348. # merge multi-levels
  349. household = (
  350. DataFrame(
  351. dict(household_id=[1, 2, 3],
  352. male=[0, 1, 0],
  353. wealth=[196087.3, 316478.7, 294750]),
  354. columns=['household_id', 'male', 'wealth'])
  355. .set_index('household_id'))
  356. portfolio = (
  357. DataFrame(
  358. dict(household_id=[1, 2, 2, 3, 3, 3, 4],
  359. asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
  360. "gb00b03mlx29", "lu0197800237", "nl0000289965",
  361. np.nan],
  362. name=["ABN Amro", "Robeco", "Royal Dutch Shell",
  363. "Royal Dutch Shell",
  364. "AAB Eastern Europe Equity Fund",
  365. "Postbank BioTech Fonds", np.nan],
  366. share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]),
  367. columns=['household_id', 'asset_id', 'name', 'share'])
  368. .set_index(['household_id', 'asset_id']))
  369. result = household.join(portfolio, how='inner')
  370. expected = (
  371. DataFrame(
  372. dict(male=[0, 1, 1, 0, 0, 0],
  373. wealth=[196087.3, 316478.7, 316478.7,
  374. 294750.0, 294750.0, 294750.0],
  375. name=['ABN Amro', 'Robeco', 'Royal Dutch Shell',
  376. 'Royal Dutch Shell',
  377. 'AAB Eastern Europe Equity Fund',
  378. 'Postbank BioTech Fonds'],
  379. share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
  380. household_id=[1, 2, 2, 3, 3, 3],
  381. asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29',
  382. 'gb00b03mlx29', 'lu0197800237',
  383. 'nl0000289965']))
  384. .set_index(['household_id', 'asset_id'])
  385. .reindex(columns=['male', 'wealth', 'name', 'share']))
  386. tm.assert_frame_equal(result, expected)
  387. # equivalency
  388. result = (merge(household.reset_index(), portfolio.reset_index(),
  389. on=['household_id'], how='inner')
  390. .set_index(['household_id', 'asset_id']))
  391. tm.assert_frame_equal(result, expected)
  392. result = household.join(portfolio, how='outer')
  393. expected = (concat([
  394. expected,
  395. (DataFrame(
  396. dict(share=[1.00]),
  397. index=MultiIndex.from_tuples(
  398. [(4, np.nan)],
  399. names=['household_id', 'asset_id'])))
  400. ], axis=0, sort=True).reindex(columns=expected.columns))
  401. tm.assert_frame_equal(result, expected)
  402. # invalid cases
  403. household.index.name = 'foo'
  404. with pytest.raises(ValueError):
  405. household.join(portfolio, how='inner')
  406. portfolio2 = portfolio.copy()
  407. portfolio2.index.set_names(['household_id', 'foo'])
  408. with pytest.raises(ValueError):
  409. portfolio2.join(portfolio, how='inner')
  410. def test_join_multi_levels2(self):
  411. # some more advanced merges
  412. # GH6360
  413. household = (
  414. DataFrame(
  415. dict(household_id=[1, 2, 2, 3, 3, 3, 4],
  416. asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
  417. "gb00b03mlx29", "lu0197800237", "nl0000289965",
  418. np.nan],
  419. share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]),
  420. columns=['household_id', 'asset_id', 'share'])
  421. .set_index(['household_id', 'asset_id']))
  422. log_return = DataFrame(dict(
  423. asset_id=["gb00b03mlx29", "gb00b03mlx29",
  424. "gb00b03mlx29", "lu0197800237", "lu0197800237"],
  425. t=[233, 234, 235, 180, 181],
  426. log_return=[.09604978, -.06524096, .03532373, .03025441, .036997]
  427. )).set_index(["asset_id", "t"])
  428. expected = (
  429. DataFrame(dict(
  430. household_id=[2, 2, 2, 3, 3, 3, 3, 3],
  431. asset_id=["gb00b03mlx29", "gb00b03mlx29",
  432. "gb00b03mlx29", "gb00b03mlx29",
  433. "gb00b03mlx29", "gb00b03mlx29",
  434. "lu0197800237", "lu0197800237"],
  435. t=[233, 234, 235, 233, 234, 235, 180, 181],
  436. share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
  437. log_return=[.09604978, -.06524096, .03532373,
  438. .09604978, -.06524096, .03532373,
  439. .03025441, .036997]
  440. ))
  441. .set_index(["household_id", "asset_id", "t"])
  442. .reindex(columns=['share', 'log_return']))
  443. # this is the equivalency
  444. result = (merge(household.reset_index(), log_return.reset_index(),
  445. on=['asset_id'], how='inner')
  446. .set_index(['household_id', 'asset_id', 't']))
  447. tm.assert_frame_equal(result, expected)
  448. expected = (
  449. DataFrame(dict(
  450. household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
  451. asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
  452. "gb00b03mlx29", "gb00b03mlx29",
  453. "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
  454. "lu0197800237", "lu0197800237",
  455. "nl0000289965", None],
  456. t=[None, None, 233, 234, 235, 233, 234,
  457. 235, 180, 181, None, None],
  458. share=[1.0, 0.4, 0.6, 0.6, 0.6, 0.15,
  459. 0.15, 0.15, 0.6, 0.6, 0.25, 1.0],
  460. log_return=[None, None, .09604978, -.06524096, .03532373,
  461. .09604978, -.06524096, .03532373,
  462. .03025441, .036997, None, None]
  463. ))
  464. .set_index(["household_id", "asset_id", "t"])
  465. .reindex(columns=['share', 'log_return']))
  466. result = (merge(household.reset_index(), log_return.reset_index(),
  467. on=['asset_id'], how='outer')
  468. .set_index(['household_id', 'asset_id', 't']))
  469. tm.assert_frame_equal(result, expected)
  470. class TestJoinMultiMulti(object):
  471. def test_join_multi_multi(self, left_multi, right_multi, join_type,
  472. on_cols_multi, idx_cols_multi):
  473. # Multi-index join tests
  474. expected = (pd.merge(left_multi.reset_index(),
  475. right_multi.reset_index(),
  476. how=join_type, on=on_cols_multi).
  477. set_index(idx_cols_multi).sort_index())
  478. result = left_multi.join(right_multi, how=join_type).sort_index()
  479. tm.assert_frame_equal(result, expected)
  480. def test_join_multi_empty_frames(self, left_multi, right_multi, join_type,
  481. on_cols_multi, idx_cols_multi):
  482. left_multi = left_multi.drop(columns=left_multi.columns)
  483. right_multi = right_multi.drop(columns=right_multi.columns)
  484. expected = (pd.merge(left_multi.reset_index(),
  485. right_multi.reset_index(),
  486. how=join_type, on=on_cols_multi)
  487. .set_index(idx_cols_multi).sort_index())
  488. result = left_multi.join(right_multi, how=join_type).sort_index()
  489. tm.assert_frame_equal(result, expected)
  490. @pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
  491. def test_merge_datetime_index(self, box):
  492. # see gh-19038
  493. df = DataFrame([1, 2, 3],
  494. ["2016-01-01", "2017-01-01", "2018-01-01"],
  495. columns=["a"])
  496. df.index = pd.to_datetime(df.index)
  497. on_vector = df.index.year
  498. if box is not None:
  499. on_vector = box(on_vector)
  500. expected = DataFrame(
  501. OrderedDict([
  502. ("a", [1, 2, 3]),
  503. ("key_1", [2016, 2017, 2018]),
  504. ])
  505. )
  506. result = df.merge(df, on=["a", on_vector], how="inner")
  507. tm.assert_frame_equal(result, expected)
  508. expected = DataFrame(
  509. OrderedDict([
  510. ("key_0", [2016, 2017, 2018]),
  511. ("a_x", [1, 2, 3]),
  512. ("a_y", [1, 2, 3]),
  513. ])
  514. )
  515. result = df.merge(df, on=[df.index.year], how="inner")
  516. tm.assert_frame_equal(result, expected)
  517. def test_single_common_level(self):
  518. index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'),
  519. ('K1', 'X2')],
  520. names=['key', 'X'])
  521. left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
  522. 'B': ['B0', 'B1', 'B2']},
  523. index=index_left)
  524. index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'),
  525. ('K2', 'Y2'), ('K2', 'Y3')],
  526. names=['key', 'Y'])
  527. right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],
  528. 'D': ['D0', 'D1', 'D2', 'D3']},
  529. index=index_right)
  530. result = left.join(right)
  531. expected = (pd.merge(left.reset_index(), right.reset_index(),
  532. on=['key'], how='inner')
  533. .set_index(['key', 'X', 'Y']))
  534. tm.assert_frame_equal(result, expected)