test_merge.py 65 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609
  1. # pylint: disable=E1103
  2. from collections import OrderedDict
  3. from datetime import date, datetime
  4. import random
  5. import re
  6. import numpy as np
  7. from numpy import nan
  8. import pytest
  9. from pandas.compat import lrange
  10. from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype
  11. from pandas.core.dtypes.dtypes import CategoricalDtype
  12. import pandas as pd
  13. from pandas import (
  14. Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index,
  15. Int64Index, MultiIndex, RangeIndex, Series, UInt64Index)
  16. from pandas.api.types import CategoricalDtype as CDT
  17. from pandas.core.reshape.concat import concat
  18. from pandas.core.reshape.merge import MergeError, merge
  19. import pandas.util.testing as tm
  20. from pandas.util.testing import assert_frame_equal, assert_series_equal
  21. N = 50
  22. NGROUPS = 8
  23. def get_test_data(ngroups=NGROUPS, n=N):
  24. unique_groups = lrange(ngroups)
  25. arr = np.asarray(np.tile(unique_groups, n // ngroups))
  26. if len(arr) < n:
  27. arr = np.asarray(list(arr) + unique_groups[:n - len(arr)])
  28. random.shuffle(arr)
  29. return arr
  30. def get_series():
  31. return [
  32. pd.Series([1], dtype='int64'),
  33. pd.Series([1], dtype='Int64'),
  34. pd.Series([1.23]),
  35. pd.Series(['foo']),
  36. pd.Series([True]),
  37. pd.Series([pd.Timestamp('2018-01-01')]),
  38. pd.Series([pd.Timestamp('2018-01-01', tz='US/Eastern')]),
  39. ]
  40. def get_series_na():
  41. return [
  42. pd.Series([np.nan], dtype='Int64'),
  43. pd.Series([np.nan], dtype='float'),
  44. pd.Series([np.nan], dtype='object'),
  45. pd.Series([pd.NaT]),
  46. ]
  47. @pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name)
  48. def series_of_dtype(request):
  49. """
  50. A parametrized fixture returning a variety of Series of different
  51. dtypes
  52. """
  53. return request.param
  54. @pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name)
  55. def series_of_dtype2(request):
  56. """
  57. A duplicate of the series_of_dtype fixture, so that it can be used
  58. twice by a single function
  59. """
  60. return request.param
  61. @pytest.fixture(params=get_series_na(), ids=lambda x: x.dtype.name)
  62. def series_of_dtype_all_na(request):
  63. """
  64. A parametrized fixture returning a variety of Series with all NA
  65. values
  66. """
  67. return request.param
  68. class TestMerge(object):
  69. def setup_method(self, method):
  70. # aggregate multiple columns
  71. self.df = DataFrame({'key1': get_test_data(),
  72. 'key2': get_test_data(),
  73. 'data1': np.random.randn(N),
  74. 'data2': np.random.randn(N)})
  75. # exclude a couple keys for fun
  76. self.df = self.df[self.df['key2'] > 1]
  77. self.df2 = DataFrame({'key1': get_test_data(n=N // 5),
  78. 'key2': get_test_data(ngroups=NGROUPS // 2,
  79. n=N // 5),
  80. 'value': np.random.randn(N // 5)})
  81. self.left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
  82. 'v1': np.random.randn(7)})
  83. self.right = DataFrame({'v2': np.random.randn(4)},
  84. index=['d', 'b', 'c', 'a'])
  85. def test_merge_inner_join_empty(self):
  86. # GH 15328
  87. df_empty = pd.DataFrame()
  88. df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64')
  89. result = pd.merge(df_empty, df_a, left_index=True, right_index=True)
  90. expected = pd.DataFrame({'a': []}, index=[], dtype='int64')
  91. assert_frame_equal(result, expected)
  92. def test_merge_common(self):
  93. joined = merge(self.df, self.df2)
  94. exp = merge(self.df, self.df2, on=['key1', 'key2'])
  95. tm.assert_frame_equal(joined, exp)
  96. def test_merge_index_as_on_arg(self):
  97. # GH14355
  98. left = self.df.set_index('key1')
  99. right = self.df2.set_index('key1')
  100. result = merge(left, right, on='key1')
  101. expected = merge(self.df, self.df2, on='key1').set_index('key1')
  102. assert_frame_equal(result, expected)
  103. def test_merge_index_singlekey_right_vs_left(self):
  104. left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
  105. 'v1': np.random.randn(7)})
  106. right = DataFrame({'v2': np.random.randn(4)},
  107. index=['d', 'b', 'c', 'a'])
  108. merged1 = merge(left, right, left_on='key',
  109. right_index=True, how='left', sort=False)
  110. merged2 = merge(right, left, right_on='key',
  111. left_index=True, how='right', sort=False)
  112. assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
  113. merged1 = merge(left, right, left_on='key',
  114. right_index=True, how='left', sort=True)
  115. merged2 = merge(right, left, right_on='key',
  116. left_index=True, how='right', sort=True)
  117. assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
  118. def test_merge_index_singlekey_inner(self):
  119. left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
  120. 'v1': np.random.randn(7)})
  121. right = DataFrame({'v2': np.random.randn(4)},
  122. index=['d', 'b', 'c', 'a'])
  123. # inner join
  124. result = merge(left, right, left_on='key', right_index=True,
  125. how='inner')
  126. expected = left.join(right, on='key').loc[result.index]
  127. assert_frame_equal(result, expected)
  128. result = merge(right, left, right_on='key', left_index=True,
  129. how='inner')
  130. expected = left.join(right, on='key').loc[result.index]
  131. assert_frame_equal(result, expected.loc[:, result.columns])
  132. def test_merge_misspecified(self):
  133. msg = "Must pass right_on or right_index=True"
  134. with pytest.raises(pd.errors.MergeError, match=msg):
  135. merge(self.left, self.right, left_index=True)
  136. msg = "Must pass left_on or left_index=True"
  137. with pytest.raises(pd.errors.MergeError, match=msg):
  138. merge(self.left, self.right, right_index=True)
  139. msg = ('Can only pass argument "on" OR "left_on" and "right_on", not'
  140. ' a combination of both')
  141. with pytest.raises(pd.errors.MergeError, match=msg):
  142. merge(self.left, self.left, left_on='key', on='key')
  143. msg = r"len\(right_on\) must equal len\(left_on\)"
  144. with pytest.raises(ValueError, match=msg):
  145. merge(self.df, self.df2, left_on=['key1'],
  146. right_on=['key1', 'key2'])
  147. def test_index_and_on_parameters_confusion(self):
  148. msg = ("right_index parameter must be of type bool, not"
  149. r" <(class|type) 'list'>")
  150. with pytest.raises(ValueError, match=msg):
  151. merge(self.df, self.df2, how='left',
  152. left_index=False, right_index=['key1', 'key2'])
  153. msg = ("left_index parameter must be of type bool, not "
  154. r"<(class|type) 'list'>")
  155. with pytest.raises(ValueError, match=msg):
  156. merge(self.df, self.df2, how='left',
  157. left_index=['key1', 'key2'], right_index=False)
  158. with pytest.raises(ValueError, match=msg):
  159. merge(self.df, self.df2, how='left',
  160. left_index=['key1', 'key2'], right_index=['key1', 'key2'])
  161. def test_merge_overlap(self):
  162. merged = merge(self.left, self.left, on='key')
  163. exp_len = (self.left['key'].value_counts() ** 2).sum()
  164. assert len(merged) == exp_len
  165. assert 'v1_x' in merged
  166. assert 'v1_y' in merged
  167. def test_merge_different_column_key_names(self):
  168. left = DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
  169. 'value': [1, 2, 3, 4]})
  170. right = DataFrame({'rkey': ['foo', 'bar', 'qux', 'foo'],
  171. 'value': [5, 6, 7, 8]})
  172. merged = left.merge(right, left_on='lkey', right_on='rkey',
  173. how='outer', sort=True)
  174. exp = pd.Series(['bar', 'baz', 'foo', 'foo', 'foo', 'foo', np.nan],
  175. name='lkey')
  176. tm.assert_series_equal(merged['lkey'], exp)
  177. exp = pd.Series(['bar', np.nan, 'foo', 'foo', 'foo', 'foo', 'qux'],
  178. name='rkey')
  179. tm.assert_series_equal(merged['rkey'], exp)
  180. exp = pd.Series([2, 3, 1, 1, 4, 4, np.nan], name='value_x')
  181. tm.assert_series_equal(merged['value_x'], exp)
  182. exp = pd.Series([6, np.nan, 5, 8, 5, 8, 7], name='value_y')
  183. tm.assert_series_equal(merged['value_y'], exp)
  184. def test_merge_copy(self):
  185. left = DataFrame({'a': 0, 'b': 1}, index=lrange(10))
  186. right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10))
  187. merged = merge(left, right, left_index=True,
  188. right_index=True, copy=True)
  189. merged['a'] = 6
  190. assert (left['a'] == 0).all()
  191. merged['d'] = 'peekaboo'
  192. assert (right['d'] == 'bar').all()
  193. def test_merge_nocopy(self):
  194. left = DataFrame({'a': 0, 'b': 1}, index=lrange(10))
  195. right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10))
  196. merged = merge(left, right, left_index=True,
  197. right_index=True, copy=False)
  198. merged['a'] = 6
  199. assert (left['a'] == 6).all()
  200. merged['d'] = 'peekaboo'
  201. assert (right['d'] == 'peekaboo').all()
  202. def test_intelligently_handle_join_key(self):
  203. # #733, be a bit more 1337 about not returning unconsolidated DataFrame
  204. left = DataFrame({'key': [1, 1, 2, 2, 3],
  205. 'value': lrange(5)}, columns=['value', 'key'])
  206. right = DataFrame({'key': [1, 1, 2, 3, 4, 5],
  207. 'rvalue': lrange(6)})
  208. joined = merge(left, right, on='key', how='outer')
  209. expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5],
  210. 'value': np.array([0, 0, 1, 1, 2, 3, 4,
  211. np.nan, np.nan]),
  212. 'rvalue': [0, 1, 0, 1, 2, 2, 3, 4, 5]},
  213. columns=['value', 'key', 'rvalue'])
  214. assert_frame_equal(joined, expected)
  215. def test_merge_join_key_dtype_cast(self):
  216. # #8596
  217. df1 = DataFrame({'key': [1], 'v1': [10]})
  218. df2 = DataFrame({'key': [2], 'v1': [20]})
  219. df = merge(df1, df2, how='outer')
  220. assert df['key'].dtype == 'int64'
  221. df1 = DataFrame({'key': [True], 'v1': [1]})
  222. df2 = DataFrame({'key': [False], 'v1': [0]})
  223. df = merge(df1, df2, how='outer')
  224. # GH13169
  225. # this really should be bool
  226. assert df['key'].dtype == 'object'
  227. df1 = DataFrame({'val': [1]})
  228. df2 = DataFrame({'val': [2]})
  229. lkey = np.array([1])
  230. rkey = np.array([2])
  231. df = merge(df1, df2, left_on=lkey, right_on=rkey, how='outer')
  232. assert df['key_0'].dtype == 'int64'
  233. def test_handle_join_key_pass_array(self):
  234. left = DataFrame({'key': [1, 1, 2, 2, 3],
  235. 'value': lrange(5)}, columns=['value', 'key'])
  236. right = DataFrame({'rvalue': lrange(6)})
  237. key = np.array([1, 1, 2, 3, 4, 5])
  238. merged = merge(left, right, left_on='key', right_on=key, how='outer')
  239. merged2 = merge(right, left, left_on=key, right_on='key', how='outer')
  240. assert_series_equal(merged['key'], merged2['key'])
  241. assert merged['key'].notna().all()
  242. assert merged2['key'].notna().all()
  243. left = DataFrame({'value': lrange(5)}, columns=['value'])
  244. right = DataFrame({'rvalue': lrange(6)})
  245. lkey = np.array([1, 1, 2, 2, 3])
  246. rkey = np.array([1, 1, 2, 3, 4, 5])
  247. merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer')
  248. tm.assert_series_equal(merged['key_0'], Series([1, 1, 1, 1, 2,
  249. 2, 3, 4, 5],
  250. name='key_0'))
  251. left = DataFrame({'value': lrange(3)})
  252. right = DataFrame({'rvalue': lrange(6)})
  253. key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64)
  254. merged = merge(left, right, left_index=True, right_on=key, how='outer')
  255. tm.assert_series_equal(merged['key_0'], Series(key, name='key_0'))
  256. def test_no_overlap_more_informative_error(self):
  257. dt = datetime.now()
  258. df1 = DataFrame({'x': ['a']}, index=[dt])
  259. df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt])
  260. msg = ('No common columns to perform merge on. '
  261. 'Merge options: left_on={lon}, right_on={ron}, '
  262. 'left_index={lidx}, right_index={ridx}'
  263. .format(lon=None, ron=None, lidx=False, ridx=False))
  264. with pytest.raises(MergeError, match=msg):
  265. merge(df1, df2)
  266. def test_merge_non_unique_indexes(self):
  267. dt = datetime(2012, 5, 1)
  268. dt2 = datetime(2012, 5, 2)
  269. dt3 = datetime(2012, 5, 3)
  270. dt4 = datetime(2012, 5, 4)
  271. df1 = DataFrame({'x': ['a']}, index=[dt])
  272. df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt])
  273. _check_merge(df1, df2)
  274. # Not monotonic
  275. df1 = DataFrame({'x': ['a', 'b', 'q']}, index=[dt2, dt, dt4])
  276. df2 = DataFrame({'y': ['c', 'd', 'e', 'f', 'g', 'h']},
  277. index=[dt3, dt3, dt2, dt2, dt, dt])
  278. _check_merge(df1, df2)
  279. df1 = DataFrame({'x': ['a', 'b']}, index=[dt, dt])
  280. df2 = DataFrame({'y': ['c', 'd']}, index=[dt, dt])
  281. _check_merge(df1, df2)
  282. def test_merge_non_unique_index_many_to_many(self):
  283. dt = datetime(2012, 5, 1)
  284. dt2 = datetime(2012, 5, 2)
  285. dt3 = datetime(2012, 5, 3)
  286. df1 = DataFrame({'x': ['a', 'b', 'c', 'd']},
  287. index=[dt2, dt2, dt, dt])
  288. df2 = DataFrame({'y': ['e', 'f', 'g', ' h', 'i']},
  289. index=[dt2, dt2, dt3, dt, dt])
  290. _check_merge(df1, df2)
  291. def test_left_merge_empty_dataframe(self):
  292. left = DataFrame({'key': [1], 'value': [2]})
  293. right = DataFrame({'key': []})
  294. result = merge(left, right, on='key', how='left')
  295. assert_frame_equal(result, left)
  296. result = merge(right, left, on='key', how='right')
  297. assert_frame_equal(result, left)
  298. @pytest.mark.parametrize('kwarg',
  299. [dict(left_index=True, right_index=True),
  300. dict(left_index=True, right_on='x'),
  301. dict(left_on='a', right_index=True),
  302. dict(left_on='a', right_on='x')])
  303. def test_merge_left_empty_right_empty(self, join_type, kwarg):
  304. # GH 10824
  305. left = pd.DataFrame([], columns=['a', 'b', 'c'])
  306. right = pd.DataFrame([], columns=['x', 'y', 'z'])
  307. exp_in = pd.DataFrame([], columns=['a', 'b', 'c', 'x', 'y', 'z'],
  308. index=pd.Index([], dtype=object),
  309. dtype=object)
  310. result = pd.merge(left, right, how=join_type, **kwarg)
  311. tm.assert_frame_equal(result, exp_in)
  312. def test_merge_left_empty_right_notempty(self):
  313. # GH 10824
  314. left = pd.DataFrame([], columns=['a', 'b', 'c'])
  315. right = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
  316. columns=['x', 'y', 'z'])
  317. exp_out = pd.DataFrame({'a': np.array([np.nan] * 3, dtype=object),
  318. 'b': np.array([np.nan] * 3, dtype=object),
  319. 'c': np.array([np.nan] * 3, dtype=object),
  320. 'x': [1, 4, 7],
  321. 'y': [2, 5, 8],
  322. 'z': [3, 6, 9]},
  323. columns=['a', 'b', 'c', 'x', 'y', 'z'])
  324. exp_in = exp_out[0:0] # make empty DataFrame keeping dtype
  325. # result will have object dtype
  326. exp_in.index = exp_in.index.astype(object)
  327. def check1(exp, kwarg):
  328. result = pd.merge(left, right, how='inner', **kwarg)
  329. tm.assert_frame_equal(result, exp)
  330. result = pd.merge(left, right, how='left', **kwarg)
  331. tm.assert_frame_equal(result, exp)
  332. def check2(exp, kwarg):
  333. result = pd.merge(left, right, how='right', **kwarg)
  334. tm.assert_frame_equal(result, exp)
  335. result = pd.merge(left, right, how='outer', **kwarg)
  336. tm.assert_frame_equal(result, exp)
  337. for kwarg in [dict(left_index=True, right_index=True),
  338. dict(left_index=True, right_on='x')]:
  339. check1(exp_in, kwarg)
  340. check2(exp_out, kwarg)
  341. kwarg = dict(left_on='a', right_index=True)
  342. check1(exp_in, kwarg)
  343. exp_out['a'] = [0, 1, 2]
  344. check2(exp_out, kwarg)
  345. kwarg = dict(left_on='a', right_on='x')
  346. check1(exp_in, kwarg)
  347. exp_out['a'] = np.array([np.nan] * 3, dtype=object)
  348. check2(exp_out, kwarg)
  349. def test_merge_left_notempty_right_empty(self):
  350. # GH 10824
  351. left = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
  352. columns=['a', 'b', 'c'])
  353. right = pd.DataFrame([], columns=['x', 'y', 'z'])
  354. exp_out = pd.DataFrame({'a': [1, 4, 7],
  355. 'b': [2, 5, 8],
  356. 'c': [3, 6, 9],
  357. 'x': np.array([np.nan] * 3, dtype=object),
  358. 'y': np.array([np.nan] * 3, dtype=object),
  359. 'z': np.array([np.nan] * 3, dtype=object)},
  360. columns=['a', 'b', 'c', 'x', 'y', 'z'])
  361. exp_in = exp_out[0:0] # make empty DataFrame keeping dtype
  362. # result will have object dtype
  363. exp_in.index = exp_in.index.astype(object)
  364. def check1(exp, kwarg):
  365. result = pd.merge(left, right, how='inner', **kwarg)
  366. tm.assert_frame_equal(result, exp)
  367. result = pd.merge(left, right, how='right', **kwarg)
  368. tm.assert_frame_equal(result, exp)
  369. def check2(exp, kwarg):
  370. result = pd.merge(left, right, how='left', **kwarg)
  371. tm.assert_frame_equal(result, exp)
  372. result = pd.merge(left, right, how='outer', **kwarg)
  373. tm.assert_frame_equal(result, exp)
  374. for kwarg in [dict(left_index=True, right_index=True),
  375. dict(left_index=True, right_on='x'),
  376. dict(left_on='a', right_index=True),
  377. dict(left_on='a', right_on='x')]:
  378. check1(exp_in, kwarg)
  379. check2(exp_out, kwarg)
  380. def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2):
  381. # GH 25183
  382. df = pd.DataFrame({'key': series_of_dtype, 'value': series_of_dtype2},
  383. columns=['key', 'value'])
  384. df_empty = df[:0]
  385. expected = pd.DataFrame({
  386. 'value_x': pd.Series(dtype=df.dtypes['value']),
  387. 'key': pd.Series(dtype=df.dtypes['key']),
  388. 'value_y': pd.Series(dtype=df.dtypes['value']),
  389. }, columns=['value_x', 'key', 'value_y'])
  390. actual = df_empty.merge(df, on='key')
  391. assert_frame_equal(actual, expected)
  392. def test_merge_all_na_column(self, series_of_dtype,
  393. series_of_dtype_all_na):
  394. # GH 25183
  395. df_left = pd.DataFrame(
  396. {'key': series_of_dtype, 'value': series_of_dtype_all_na},
  397. columns=['key', 'value'])
  398. df_right = pd.DataFrame(
  399. {'key': series_of_dtype, 'value': series_of_dtype_all_na},
  400. columns=['key', 'value'])
  401. expected = pd.DataFrame({
  402. 'key': series_of_dtype,
  403. 'value_x': series_of_dtype_all_na,
  404. 'value_y': series_of_dtype_all_na,
  405. }, columns=['key', 'value_x', 'value_y'])
  406. actual = df_left.merge(df_right, on='key')
  407. assert_frame_equal(actual, expected)
  408. def test_merge_nosort(self):
  409. # #2098, anything to do?
  410. from datetime import datetime
  411. d = {"var1": np.random.randint(0, 10, size=10),
  412. "var2": np.random.randint(0, 10, size=10),
  413. "var3": [datetime(2012, 1, 12),
  414. datetime(2011, 2, 4),
  415. datetime(2010, 2, 3),
  416. datetime(2012, 1, 12),
  417. datetime(2011, 2, 4),
  418. datetime(2012, 4, 3),
  419. datetime(2012, 3, 4),
  420. datetime(2008, 5, 1),
  421. datetime(2010, 2, 3),
  422. datetime(2012, 2, 3)]}
  423. df = DataFrame.from_dict(d)
  424. var3 = df.var3.unique()
  425. var3.sort()
  426. new = DataFrame.from_dict({"var3": var3,
  427. "var8": np.random.random(7)})
  428. result = df.merge(new, on="var3", sort=False)
  429. exp = merge(df, new, on='var3', sort=False)
  430. assert_frame_equal(result, exp)
  431. assert (df.var3.unique() == result.var3.unique()).all()
  432. def test_merge_nan_right(self):
  433. df1 = DataFrame({"i1": [0, 1], "i2": [0, 1]})
  434. df2 = DataFrame({"i1": [0], "i3": [0]})
  435. result = df1.join(df2, on="i1", rsuffix="_")
  436. expected = (DataFrame({'i1': {0: 0.0, 1: 1}, 'i2': {0: 0, 1: 1},
  437. 'i1_': {0: 0, 1: np.nan},
  438. 'i3': {0: 0.0, 1: np.nan},
  439. None: {0: 0, 1: 0}})
  440. .set_index(None)
  441. .reset_index()[['i1', 'i2', 'i1_', 'i3']])
  442. assert_frame_equal(result, expected, check_dtype=False)
  443. df1 = DataFrame({"i1": [0, 1], "i2": [0.5, 1.5]})
  444. df2 = DataFrame({"i1": [0], "i3": [0.7]})
  445. result = df1.join(df2, rsuffix="_", on='i1')
  446. expected = (DataFrame({'i1': {0: 0, 1: 1}, 'i1_': {0: 0.0, 1: nan},
  447. 'i2': {0: 0.5, 1: 1.5},
  448. 'i3': {0: 0.69999999999999996,
  449. 1: nan}})
  450. [['i1', 'i2', 'i1_', 'i3']])
  451. assert_frame_equal(result, expected)
  452. def test_merge_type(self):
  453. class NotADataFrame(DataFrame):
  454. @property
  455. def _constructor(self):
  456. return NotADataFrame
  457. nad = NotADataFrame(self.df)
  458. result = nad.merge(self.df2, on='key1')
  459. assert isinstance(result, NotADataFrame)
  460. def test_join_append_timedeltas(self):
  461. import datetime as dt
  462. from pandas import NaT
  463. # timedelta64 issues with join/merge
  464. # GH 5695
  465. d = {'d': dt.datetime(2013, 11, 5, 5, 56), 't': dt.timedelta(0, 22500)}
  466. df = DataFrame(columns=list('dt'))
  467. df = df.append(d, ignore_index=True)
  468. result = df.append(d, ignore_index=True)
  469. expected = DataFrame({'d': [dt.datetime(2013, 11, 5, 5, 56),
  470. dt.datetime(2013, 11, 5, 5, 56)],
  471. 't': [dt.timedelta(0, 22500),
  472. dt.timedelta(0, 22500)]})
  473. assert_frame_equal(result, expected)
  474. td = np.timedelta64(300000000)
  475. lhs = DataFrame(Series([td, td], index=["A", "B"]))
  476. rhs = DataFrame(Series([td], index=["A"]))
  477. result = lhs.join(rhs, rsuffix='r', how="left")
  478. expected = DataFrame({'0': Series([td, td], index=list('AB')),
  479. '0r': Series([td, NaT], index=list('AB'))})
  480. assert_frame_equal(result, expected)
  481. def test_other_datetime_unit(self):
  482. # GH 13389
  483. df1 = pd.DataFrame({'entity_id': [101, 102]})
  484. s = pd.Series([None, None], index=[101, 102], name='days')
  485. for dtype in ['datetime64[D]', 'datetime64[h]', 'datetime64[m]',
  486. 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]',
  487. 'datetime64[ns]']:
  488. df2 = s.astype(dtype).to_frame('days')
  489. # coerces to datetime64[ns], thus sholuld not be affected
  490. assert df2['days'].dtype == 'datetime64[ns]'
  491. result = df1.merge(df2, left_on='entity_id', right_index=True)
  492. exp = pd.DataFrame({'entity_id': [101, 102],
  493. 'days': np.array(['nat', 'nat'],
  494. dtype='datetime64[ns]')},
  495. columns=['entity_id', 'days'])
  496. tm.assert_frame_equal(result, exp)
  497. @pytest.mark.parametrize("unit", ['D', 'h', 'm', 's', 'ms', 'us', 'ns'])
  498. def test_other_timedelta_unit(self, unit):
  499. # GH 13389
  500. df1 = pd.DataFrame({'entity_id': [101, 102]})
  501. s = pd.Series([None, None], index=[101, 102], name='days')
  502. dtype = "m8[{}]".format(unit)
  503. df2 = s.astype(dtype).to_frame('days')
  504. assert df2['days'].dtype == 'm8[ns]'
  505. result = df1.merge(df2, left_on='entity_id', right_index=True)
  506. exp = pd.DataFrame({'entity_id': [101, 102],
  507. 'days': np.array(['nat', 'nat'],
  508. dtype=dtype)},
  509. columns=['entity_id', 'days'])
  510. tm.assert_frame_equal(result, exp)
  511. def test_overlapping_columns_error_message(self):
  512. df = DataFrame({'key': [1, 2, 3],
  513. 'v1': [4, 5, 6],
  514. 'v2': [7, 8, 9]})
  515. df2 = DataFrame({'key': [1, 2, 3],
  516. 'v1': [4, 5, 6],
  517. 'v2': [7, 8, 9]})
  518. df.columns = ['key', 'foo', 'foo']
  519. df2.columns = ['key', 'bar', 'bar']
  520. expected = DataFrame({'key': [1, 2, 3],
  521. 'v1': [4, 5, 6],
  522. 'v2': [7, 8, 9],
  523. 'v3': [4, 5, 6],
  524. 'v4': [7, 8, 9]})
  525. expected.columns = ['key', 'foo', 'foo', 'bar', 'bar']
  526. assert_frame_equal(merge(df, df2), expected)
  527. # #2649, #10639
  528. df2.columns = ['key1', 'foo', 'foo']
  529. msg = (r"Data columns not unique: Index\(\[u?'foo', u?'foo'\],"
  530. r" dtype='object'\)")
  531. with pytest.raises(MergeError, match=msg):
  532. merge(df, df2)
  533. def test_merge_on_datetime64tz(self):
  534. # GH11405
  535. left = pd.DataFrame({'key': pd.date_range('20151010', periods=2,
  536. tz='US/Eastern'),
  537. 'value': [1, 2]})
  538. right = pd.DataFrame({'key': pd.date_range('20151011', periods=3,
  539. tz='US/Eastern'),
  540. 'value': [1, 2, 3]})
  541. expected = DataFrame({'key': pd.date_range('20151010', periods=4,
  542. tz='US/Eastern'),
  543. 'value_x': [1, 2, np.nan, np.nan],
  544. 'value_y': [np.nan, 1, 2, 3]})
  545. result = pd.merge(left, right, on='key', how='outer')
  546. assert_frame_equal(result, expected)
  547. left = pd.DataFrame({'key': [1, 2],
  548. 'value': pd.date_range('20151010', periods=2,
  549. tz='US/Eastern')})
  550. right = pd.DataFrame({'key': [2, 3],
  551. 'value': pd.date_range('20151011', periods=2,
  552. tz='US/Eastern')})
  553. expected = DataFrame({
  554. 'key': [1, 2, 3],
  555. 'value_x': list(pd.date_range('20151010', periods=2,
  556. tz='US/Eastern')) + [pd.NaT],
  557. 'value_y': [pd.NaT] + list(pd.date_range('20151011', periods=2,
  558. tz='US/Eastern'))})
  559. result = pd.merge(left, right, on='key', how='outer')
  560. assert_frame_equal(result, expected)
  561. assert result['value_x'].dtype == 'datetime64[ns, US/Eastern]'
  562. assert result['value_y'].dtype == 'datetime64[ns, US/Eastern]'
  563. def test_merge_on_datetime64tz_empty(self):
  564. # https://github.com/pandas-dev/pandas/issues/25014
  565. dtz = pd.DatetimeTZDtype(tz='UTC')
  566. right = pd.DataFrame({'date': [pd.Timestamp('2018', tz=dtz.tz)],
  567. 'value': [4.0],
  568. 'date2': [pd.Timestamp('2019', tz=dtz.tz)]},
  569. columns=['date', 'value', 'date2'])
  570. left = right[:0]
  571. result = left.merge(right, on='date')
  572. expected = pd.DataFrame({
  573. 'value_x': pd.Series(dtype=float),
  574. 'date2_x': pd.Series(dtype=dtz),
  575. 'date': pd.Series(dtype=dtz),
  576. 'value_y': pd.Series(dtype=float),
  577. 'date2_y': pd.Series(dtype=dtz),
  578. }, columns=['value_x', 'date2_x', 'date', 'value_y', 'date2_y'])
  579. tm.assert_frame_equal(result, expected)
  580. def test_merge_datetime64tz_with_dst_transition(self):
  581. # GH 18885
  582. df1 = pd.DataFrame(pd.date_range(
  583. '2017-10-29 01:00', periods=4, freq='H', tz='Europe/Madrid'),
  584. columns=['date'])
  585. df1['value'] = 1
  586. df2 = pd.DataFrame({
  587. 'date': pd.to_datetime([
  588. '2017-10-29 03:00:00', '2017-10-29 04:00:00',
  589. '2017-10-29 05:00:00'
  590. ]),
  591. 'value': 2
  592. })
  593. df2['date'] = df2['date'].dt.tz_localize('UTC').dt.tz_convert(
  594. 'Europe/Madrid')
  595. result = pd.merge(df1, df2, how='outer', on='date')
  596. expected = pd.DataFrame({
  597. 'date': pd.date_range(
  598. '2017-10-29 01:00', periods=7, freq='H', tz='Europe/Madrid'),
  599. 'value_x': [1] * 4 + [np.nan] * 3,
  600. 'value_y': [np.nan] * 4 + [2] * 3
  601. })
  602. assert_frame_equal(result, expected)
  603. def test_merge_non_unique_period_index(self):
  604. # GH #16871
  605. index = pd.period_range('2016-01-01', periods=16, freq='M')
  606. df = DataFrame([i for i in range(len(index))],
  607. index=index, columns=['pnum'])
  608. df2 = concat([df, df])
  609. result = df.merge(df2, left_index=True, right_index=True, how='inner')
  610. expected = DataFrame(
  611. np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
  612. columns=['pnum_x', 'pnum_y'], index=df2.sort_index().index)
  613. tm.assert_frame_equal(result, expected)
  614. def test_merge_on_periods(self):
  615. left = pd.DataFrame({'key': pd.period_range('20151010', periods=2,
  616. freq='D'),
  617. 'value': [1, 2]})
  618. right = pd.DataFrame({'key': pd.period_range('20151011', periods=3,
  619. freq='D'),
  620. 'value': [1, 2, 3]})
  621. expected = DataFrame({'key': pd.period_range('20151010', periods=4,
  622. freq='D'),
  623. 'value_x': [1, 2, np.nan, np.nan],
  624. 'value_y': [np.nan, 1, 2, 3]})
  625. result = pd.merge(left, right, on='key', how='outer')
  626. assert_frame_equal(result, expected)
  627. left = pd.DataFrame({'key': [1, 2],
  628. 'value': pd.period_range('20151010', periods=2,
  629. freq='D')})
  630. right = pd.DataFrame({'key': [2, 3],
  631. 'value': pd.period_range('20151011', periods=2,
  632. freq='D')})
  633. exp_x = pd.period_range('20151010', periods=2, freq='D')
  634. exp_y = pd.period_range('20151011', periods=2, freq='D')
  635. expected = DataFrame({'key': [1, 2, 3],
  636. 'value_x': list(exp_x) + [pd.NaT],
  637. 'value_y': [pd.NaT] + list(exp_y)})
  638. result = pd.merge(left, right, on='key', how='outer')
  639. assert_frame_equal(result, expected)
  640. assert result['value_x'].dtype == 'Period[D]'
  641. assert result['value_y'].dtype == 'Period[D]'
  642. def test_indicator(self):
  643. # PR #10054. xref #7412 and closes #8790.
  644. df1 = DataFrame({'col1': [0, 1], 'col_conflict': [1, 2],
  645. 'col_left': ['a', 'b']})
  646. df1_copy = df1.copy()
  647. df2 = DataFrame({'col1': [1, 2, 3, 4, 5],
  648. 'col_conflict': [1, 2, 3, 4, 5],
  649. 'col_right': [2, 2, 2, 2, 2]})
  650. df2_copy = df2.copy()
  651. df_result = DataFrame({
  652. 'col1': [0, 1, 2, 3, 4, 5],
  653. 'col_conflict_x': [1, 2, np.nan, np.nan, np.nan, np.nan],
  654. 'col_left': ['a', 'b', np.nan, np.nan, np.nan, np.nan],
  655. 'col_conflict_y': [np.nan, 1, 2, 3, 4, 5],
  656. 'col_right': [np.nan, 2, 2, 2, 2, 2]})
  657. df_result['_merge'] = Categorical(
  658. ['left_only', 'both', 'right_only',
  659. 'right_only', 'right_only', 'right_only'],
  660. categories=['left_only', 'right_only', 'both'])
  661. df_result = df_result[['col1', 'col_conflict_x', 'col_left',
  662. 'col_conflict_y', 'col_right', '_merge']]
  663. test = merge(df1, df2, on='col1', how='outer', indicator=True)
  664. assert_frame_equal(test, df_result)
  665. test = df1.merge(df2, on='col1', how='outer', indicator=True)
  666. assert_frame_equal(test, df_result)
  667. # No side effects
  668. assert_frame_equal(df1, df1_copy)
  669. assert_frame_equal(df2, df2_copy)
  670. # Check with custom name
  671. df_result_custom_name = df_result
  672. df_result_custom_name = df_result_custom_name.rename(
  673. columns={'_merge': 'custom_name'})
  674. test_custom_name = merge(
  675. df1, df2, on='col1', how='outer', indicator='custom_name')
  676. assert_frame_equal(test_custom_name, df_result_custom_name)
  677. test_custom_name = df1.merge(
  678. df2, on='col1', how='outer', indicator='custom_name')
  679. assert_frame_equal(test_custom_name, df_result_custom_name)
  680. # Check only accepts strings and booleans
  681. msg = "indicator option can only accept boolean or string arguments"
  682. with pytest.raises(ValueError, match=msg):
  683. merge(df1, df2, on='col1', how='outer', indicator=5)
  684. with pytest.raises(ValueError, match=msg):
  685. df1.merge(df2, on='col1', how='outer', indicator=5)
  686. # Check result integrity
  687. test2 = merge(df1, df2, on='col1', how='left', indicator=True)
  688. assert (test2._merge != 'right_only').all()
  689. test2 = df1.merge(df2, on='col1', how='left', indicator=True)
  690. assert (test2._merge != 'right_only').all()
  691. test3 = merge(df1, df2, on='col1', how='right', indicator=True)
  692. assert (test3._merge != 'left_only').all()
  693. test3 = df1.merge(df2, on='col1', how='right', indicator=True)
  694. assert (test3._merge != 'left_only').all()
  695. test4 = merge(df1, df2, on='col1', how='inner', indicator=True)
  696. assert (test4._merge == 'both').all()
  697. test4 = df1.merge(df2, on='col1', how='inner', indicator=True)
  698. assert (test4._merge == 'both').all()
  699. # Check if working name in df
  700. for i in ['_right_indicator', '_left_indicator', '_merge']:
  701. df_badcolumn = DataFrame({'col1': [1, 2], i: [2, 2]})
  702. msg = ("Cannot use `indicator=True` option when data contains a"
  703. " column named {}|"
  704. "Cannot use name of an existing column for indicator"
  705. " column").format(i)
  706. with pytest.raises(ValueError, match=msg):
  707. merge(df1, df_badcolumn, on='col1',
  708. how='outer', indicator=True)
  709. with pytest.raises(ValueError, match=msg):
  710. df1.merge(df_badcolumn, on='col1', how='outer', indicator=True)
  711. # Check for name conflict with custom name
  712. df_badcolumn = DataFrame(
  713. {'col1': [1, 2], 'custom_column_name': [2, 2]})
  714. msg = "Cannot use name of an existing column for indicator column"
  715. with pytest.raises(ValueError, match=msg):
  716. merge(df1, df_badcolumn, on='col1', how='outer',
  717. indicator='custom_column_name')
  718. with pytest.raises(ValueError, match=msg):
  719. df1.merge(df_badcolumn, on='col1', how='outer',
  720. indicator='custom_column_name')
  721. # Merge on multiple columns
  722. df3 = DataFrame({'col1': [0, 1], 'col2': ['a', 'b']})
  723. df4 = DataFrame({'col1': [1, 1, 3], 'col2': ['b', 'x', 'y']})
  724. hand_coded_result = DataFrame({'col1': [0, 1, 1, 3],
  725. 'col2': ['a', 'b', 'x', 'y']})
  726. hand_coded_result['_merge'] = Categorical(
  727. ['left_only', 'both', 'right_only', 'right_only'],
  728. categories=['left_only', 'right_only', 'both'])
  729. test5 = merge(df3, df4, on=['col1', 'col2'],
  730. how='outer', indicator=True)
  731. assert_frame_equal(test5, hand_coded_result)
  732. test5 = df3.merge(df4, on=['col1', 'col2'],
  733. how='outer', indicator=True)
  734. assert_frame_equal(test5, hand_coded_result)
  735. def test_validation(self):
  736. left = DataFrame({'a': ['a', 'b', 'c', 'd'],
  737. 'b': ['cat', 'dog', 'weasel', 'horse']},
  738. index=range(4))
  739. right = DataFrame({'a': ['a', 'b', 'c', 'd', 'e'],
  740. 'c': ['meow', 'bark', 'um... weasel noise?',
  741. 'nay', 'chirp']},
  742. index=range(5))
  743. # Make sure no side effects.
  744. left_copy = left.copy()
  745. right_copy = right.copy()
  746. result = merge(left, right, left_index=True, right_index=True,
  747. validate='1:1')
  748. assert_frame_equal(left, left_copy)
  749. assert_frame_equal(right, right_copy)
  750. # make sure merge still correct
  751. expected = DataFrame({'a_x': ['a', 'b', 'c', 'd'],
  752. 'b': ['cat', 'dog', 'weasel', 'horse'],
  753. 'a_y': ['a', 'b', 'c', 'd'],
  754. 'c': ['meow', 'bark', 'um... weasel noise?',
  755. 'nay']},
  756. index=range(4),
  757. columns=['a_x', 'b', 'a_y', 'c'])
  758. result = merge(left, right, left_index=True, right_index=True,
  759. validate='one_to_one')
  760. assert_frame_equal(result, expected)
  761. expected_2 = DataFrame({'a': ['a', 'b', 'c', 'd'],
  762. 'b': ['cat', 'dog', 'weasel', 'horse'],
  763. 'c': ['meow', 'bark', 'um... weasel noise?',
  764. 'nay']},
  765. index=range(4))
  766. result = merge(left, right, on='a', validate='1:1')
  767. assert_frame_equal(left, left_copy)
  768. assert_frame_equal(right, right_copy)
  769. assert_frame_equal(result, expected_2)
  770. result = merge(left, right, on='a', validate='one_to_one')
  771. assert_frame_equal(result, expected_2)
  772. # One index, one column
  773. expected_3 = DataFrame({'b': ['cat', 'dog', 'weasel', 'horse'],
  774. 'a': ['a', 'b', 'c', 'd'],
  775. 'c': ['meow', 'bark', 'um... weasel noise?',
  776. 'nay']},
  777. columns=['b', 'a', 'c'],
  778. index=range(4))
  779. left_index_reset = left.set_index('a')
  780. result = merge(left_index_reset, right, left_index=True,
  781. right_on='a', validate='one_to_one')
  782. assert_frame_equal(result, expected_3)
  783. # Dups on right
  784. right_w_dups = right.append(pd.DataFrame({'a': ['e'], 'c': ['moo']},
  785. index=[4]))
  786. merge(left, right_w_dups, left_index=True, right_index=True,
  787. validate='one_to_many')
  788. msg = ("Merge keys are not unique in right dataset; not a one-to-one"
  789. " merge")
  790. with pytest.raises(MergeError, match=msg):
  791. merge(left, right_w_dups, left_index=True, right_index=True,
  792. validate='one_to_one')
  793. with pytest.raises(MergeError, match=msg):
  794. merge(left, right_w_dups, on='a', validate='one_to_one')
  795. # Dups on left
  796. left_w_dups = left.append(pd.DataFrame({'a': ['a'], 'c': ['cow']},
  797. index=[3]), sort=True)
  798. merge(left_w_dups, right, left_index=True, right_index=True,
  799. validate='many_to_one')
  800. msg = ("Merge keys are not unique in left dataset; not a one-to-one"
  801. " merge")
  802. with pytest.raises(MergeError, match=msg):
  803. merge(left_w_dups, right, left_index=True, right_index=True,
  804. validate='one_to_one')
  805. with pytest.raises(MergeError, match=msg):
  806. merge(left_w_dups, right, on='a', validate='one_to_one')
  807. # Dups on both
  808. merge(left_w_dups, right_w_dups, on='a', validate='many_to_many')
  809. msg = ("Merge keys are not unique in right dataset; not a many-to-one"
  810. " merge")
  811. with pytest.raises(MergeError, match=msg):
  812. merge(left_w_dups, right_w_dups, left_index=True,
  813. right_index=True, validate='many_to_one')
  814. msg = ("Merge keys are not unique in left dataset; not a one-to-many"
  815. " merge")
  816. with pytest.raises(MergeError, match=msg):
  817. merge(left_w_dups, right_w_dups, on='a',
  818. validate='one_to_many')
  819. # Check invalid arguments
  820. msg = "Not a valid argument for validate"
  821. with pytest.raises(ValueError, match=msg):
  822. merge(left, right, on='a', validate='jibberish')
  823. # Two column merge, dups in both, but jointly no dups.
  824. left = DataFrame({'a': ['a', 'a', 'b', 'b'],
  825. 'b': [0, 1, 0, 1],
  826. 'c': ['cat', 'dog', 'weasel', 'horse']},
  827. index=range(4))
  828. right = DataFrame({'a': ['a', 'a', 'b'],
  829. 'b': [0, 1, 0],
  830. 'd': ['meow', 'bark', 'um... weasel noise?']},
  831. index=range(3))
  832. expected_multi = DataFrame({'a': ['a', 'a', 'b'],
  833. 'b': [0, 1, 0],
  834. 'c': ['cat', 'dog', 'weasel'],
  835. 'd': ['meow', 'bark',
  836. 'um... weasel noise?']},
  837. index=range(3))
  838. msg = ("Merge keys are not unique in either left or right dataset;"
  839. " not a one-to-one merge")
  840. with pytest.raises(MergeError, match=msg):
  841. merge(left, right, on='a', validate='1:1')
  842. result = merge(left, right, on=['a', 'b'], validate='1:1')
  843. assert_frame_equal(result, expected_multi)
  844. def test_merge_two_empty_df_no_division_error(self):
  845. # GH17776, PR #17846
  846. a = pd.DataFrame({'a': [], 'b': [], 'c': []})
  847. with np.errstate(divide='raise'):
  848. merge(a, a, on=('a', 'b'))
  849. @pytest.mark.parametrize('how', ['left', 'outer'])
  850. @pytest.mark.xfail(reason="GH-24897")
  851. def test_merge_on_index_with_more_values(self, how):
  852. # GH 24212
  853. # pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is
  854. # interpreted as a missing value instead of the last element
  855. df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]],
  856. columns=['a', 'b'])
  857. df2 = pd.DataFrame([[3, 30], [4, 40]],
  858. columns=['a', 'c'])
  859. df1.set_index('a', drop=False, inplace=True)
  860. df2.set_index('a', inplace=True)
  861. result = pd.merge(df1, df2, left_index=True, right_on='a', how=how)
  862. expected = pd.DataFrame([[1, 2, np.nan],
  863. [2, 4, np.nan],
  864. [3, 6, 30.0],
  865. [4, 8, 40.0]],
  866. columns=['a', 'b', 'c'])
  867. expected.set_index('a', drop=False, inplace=True)
  868. assert_frame_equal(result, expected)
  869. def test_merge_right_index_right(self):
  870. # Note: the expected output here is probably incorrect.
  871. # See https://github.com/pandas-dev/pandas/issues/17257 for more.
  872. # We include this as a regression test for GH-24897.
  873. left = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 1, 1]})
  874. right = pd.DataFrame({'b': [1, 2, 3]})
  875. expected = pd.DataFrame({'a': [1, 2, 3, None],
  876. 'key': [0, 1, 1, 2],
  877. 'b': [1, 2, 2, 3]},
  878. columns=['a', 'key', 'b'],
  879. index=[0, 1, 2, 2])
  880. result = left.merge(right, left_on='key', right_index=True,
  881. how='right')
  882. tm.assert_frame_equal(result, expected)
  883. def _check_merge(x, y):
  884. for how in ['inner', 'left', 'outer']:
  885. result = x.join(y, how=how)
  886. expected = merge(x.reset_index(), y.reset_index(), how=how,
  887. sort=True)
  888. expected = expected.set_index('index')
  889. # TODO check_names on merge?
  890. assert_frame_equal(result, expected, check_names=False)
  891. class TestMergeDtypes(object):
  892. @pytest.mark.parametrize('right_vals', [
  893. ['foo', 'bar'],
  894. Series(['foo', 'bar']).astype('category'),
  895. ])
  896. def test_different(self, right_vals):
  897. left = DataFrame({'A': ['foo', 'bar'],
  898. 'B': Series(['foo', 'bar']).astype('category'),
  899. 'C': [1, 2],
  900. 'D': [1.0, 2.0],
  901. 'E': Series([1, 2], dtype='uint64'),
  902. 'F': Series([1, 2], dtype='int32')})
  903. right = DataFrame({'A': right_vals})
  904. # GH 9780
  905. # We allow merging on object and categorical cols and cast
  906. # categorical cols to object
  907. result = pd.merge(left, right, on='A')
  908. assert is_object_dtype(result.A.dtype)
  909. @pytest.mark.parametrize('d1', [np.int64, np.int32,
  910. np.int16, np.int8, np.uint8])
  911. @pytest.mark.parametrize('d2', [np.int64, np.float64,
  912. np.float32, np.float16])
  913. def test_join_multi_dtypes(self, d1, d2):
  914. dtype1 = np.dtype(d1)
  915. dtype2 = np.dtype(d2)
  916. left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1),
  917. 'k2': ['foo', 'bar'] * 12,
  918. 'v': np.array(np.arange(24), dtype=np.int64)})
  919. index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
  920. right = DataFrame({'v2': np.array([5, 7], dtype=dtype2)}, index=index)
  921. result = left.join(right, on=['k1', 'k2'])
  922. expected = left.copy()
  923. if dtype2.kind == 'i':
  924. dtype2 = np.dtype('float64')
  925. expected['v2'] = np.array(np.nan, dtype=dtype2)
  926. expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
  927. expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7
  928. tm.assert_frame_equal(result, expected)
  929. result = left.join(right, on=['k1', 'k2'], sort=True)
  930. expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True)
  931. tm.assert_frame_equal(result, expected)
  932. @pytest.mark.parametrize('int_vals, float_vals, exp_vals', [
  933. ([1, 2, 3], [1.0, 2.0, 3.0], {'X': [1, 2, 3], 'Y': [1.0, 2.0, 3.0]}),
  934. ([1, 2, 3], [1.0, 3.0], {'X': [1, 3], 'Y': [1.0, 3.0]}),
  935. ([1, 2], [1.0, 2.0, 3.0], {'X': [1, 2], 'Y': [1.0, 2.0]}),
  936. ])
  937. def test_merge_on_ints_floats(self, int_vals, float_vals, exp_vals):
  938. # GH 16572
  939. # Check that float column is not cast to object if
  940. # merging on float and int columns
  941. A = DataFrame({'X': int_vals})
  942. B = DataFrame({'Y': float_vals})
  943. expected = DataFrame(exp_vals)
  944. result = A.merge(B, left_on='X', right_on='Y')
  945. assert_frame_equal(result, expected)
  946. result = B.merge(A, left_on='Y', right_on='X')
  947. assert_frame_equal(result, expected[['Y', 'X']])
  948. def test_merge_on_ints_floats_warning(self):
  949. # GH 16572
  950. # merge will produce a warning when merging on int and
  951. # float columns where the float values are not exactly
  952. # equal to their int representation
  953. A = DataFrame({'X': [1, 2, 3]})
  954. B = DataFrame({'Y': [1.1, 2.5, 3.0]})
  955. expected = DataFrame({'X': [3], 'Y': [3.0]})
  956. with tm.assert_produces_warning(UserWarning):
  957. result = A.merge(B, left_on='X', right_on='Y')
  958. assert_frame_equal(result, expected)
  959. with tm.assert_produces_warning(UserWarning):
  960. result = B.merge(A, left_on='Y', right_on='X')
  961. assert_frame_equal(result, expected[['Y', 'X']])
  962. # test no warning if float has NaNs
  963. B = DataFrame({'Y': [np.nan, np.nan, 3.0]})
  964. with tm.assert_produces_warning(None):
  965. result = B.merge(A, left_on='Y', right_on='X')
  966. assert_frame_equal(result, expected[['Y', 'X']])
  967. def test_merge_incompat_infer_boolean_object(self):
  968. # GH21119: bool + object bool merge OK
  969. df1 = DataFrame({'key': Series([True, False], dtype=object)})
  970. df2 = DataFrame({'key': [True, False]})
  971. expected = DataFrame({'key': [True, False]}, dtype=object)
  972. result = pd.merge(df1, df2, on='key')
  973. assert_frame_equal(result, expected)
  974. result = pd.merge(df2, df1, on='key')
  975. assert_frame_equal(result, expected)
  976. # with missing value
  977. df1 = DataFrame({'key': Series([True, False, np.nan], dtype=object)})
  978. df2 = DataFrame({'key': [True, False]})
  979. expected = DataFrame({'key': [True, False]}, dtype=object)
  980. result = pd.merge(df1, df2, on='key')
  981. assert_frame_equal(result, expected)
  982. result = pd.merge(df2, df1, on='key')
  983. assert_frame_equal(result, expected)
  984. @pytest.mark.parametrize('df1_vals, df2_vals', [
  985. # merge on category coerces to object
  986. ([0, 1, 2], Series(['a', 'b', 'a']).astype('category')),
  987. ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')),
  988. # no not infer
  989. ([0, 1], pd.Series([False, True], dtype=object)),
  990. ([0, 1], pd.Series([False, True], dtype=bool)),
  991. ])
  992. def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals):
  993. # these are explicity allowed incompat merges, that pass thru
  994. # the result type is dependent on if the values on the rhs are
  995. # inferred, otherwise these will be coereced to object
  996. df1 = DataFrame({'A': df1_vals})
  997. df2 = DataFrame({'A': df2_vals})
  998. result = pd.merge(df1, df2, on=['A'])
  999. assert is_object_dtype(result.A.dtype)
  1000. result = pd.merge(df2, df1, on=['A'])
  1001. assert is_object_dtype(result.A.dtype)
  1002. @pytest.mark.parametrize('df1_vals, df2_vals', [
  1003. # do not infer to numeric
  1004. (Series([1, 2], dtype='uint64'), ["a", "b", "c"]),
  1005. (Series([1, 2], dtype='int32'), ["a", "b", "c"]),
  1006. ([0, 1, 2], ["0", "1", "2"]),
  1007. ([0.0, 1.0, 2.0], ["0", "1", "2"]),
  1008. ([0, 1, 2], [u"0", u"1", u"2"]),
  1009. (pd.date_range('1/1/2011', periods=2, freq='D'), ['2011-01-01',
  1010. '2011-01-02']),
  1011. (pd.date_range('1/1/2011', periods=2, freq='D'), [0, 1]),
  1012. (pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]),
  1013. (pd.date_range('20130101', periods=3),
  1014. pd.date_range('20130101', periods=3, tz='US/Eastern')),
  1015. ])
  1016. def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals):
  1017. # GH 9780, GH 15800
  1018. # Raise a ValueError when a user tries to merge on
  1019. # dtypes that are incompatible (e.g., obj and int/float)
  1020. df1 = DataFrame({'A': df1_vals})
  1021. df2 = DataFrame({'A': df2_vals})
  1022. msg = ("You are trying to merge on {lk_dtype} and "
  1023. "{rk_dtype} columns. If you wish to proceed "
  1024. "you should use pd.concat".format(lk_dtype=df1['A'].dtype,
  1025. rk_dtype=df2['A'].dtype))
  1026. msg = re.escape(msg)
  1027. with pytest.raises(ValueError, match=msg):
  1028. pd.merge(df1, df2, on=['A'])
  1029. # Check that error still raised when swapping order of dataframes
  1030. msg = ("You are trying to merge on {lk_dtype} and "
  1031. "{rk_dtype} columns. If you wish to proceed "
  1032. "you should use pd.concat".format(lk_dtype=df2['A'].dtype,
  1033. rk_dtype=df1['A'].dtype))
  1034. msg = re.escape(msg)
  1035. with pytest.raises(ValueError, match=msg):
  1036. pd.merge(df2, df1, on=['A'])
  1037. @pytest.fixture
  1038. def left():
  1039. np.random.seed(1234)
  1040. return DataFrame(
  1041. {'X': Series(np.random.choice(
  1042. ['foo', 'bar'],
  1043. size=(10,))).astype(CDT(['foo', 'bar'])),
  1044. 'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})
  1045. @pytest.fixture
  1046. def right():
  1047. np.random.seed(1234)
  1048. return DataFrame(
  1049. {'X': Series(['foo', 'bar']).astype(CDT(['foo', 'bar'])),
  1050. 'Z': [1, 2]})
  1051. class TestMergeCategorical(object):
  1052. def test_identical(self, left):
  1053. # merging on the same, should preserve dtypes
  1054. merged = pd.merge(left, left, on='X')
  1055. result = merged.dtypes.sort_index()
  1056. expected = Series([CategoricalDtype(),
  1057. np.dtype('O'),
  1058. np.dtype('O')],
  1059. index=['X', 'Y_x', 'Y_y'])
  1060. assert_series_equal(result, expected)
  1061. def test_basic(self, left, right):
  1062. # we have matching Categorical dtypes in X
  1063. # so should preserve the merged column
  1064. merged = pd.merge(left, right, on='X')
  1065. result = merged.dtypes.sort_index()
  1066. expected = Series([CategoricalDtype(),
  1067. np.dtype('O'),
  1068. np.dtype('int64')],
  1069. index=['X', 'Y', 'Z'])
  1070. assert_series_equal(result, expected)
  1071. def test_merge_categorical(self):
  1072. # GH 9426
  1073. right = DataFrame({'c': {0: 'a',
  1074. 1: 'b',
  1075. 2: 'c',
  1076. 3: 'd',
  1077. 4: 'e'},
  1078. 'd': {0: 'null',
  1079. 1: 'null',
  1080. 2: 'null',
  1081. 3: 'null',
  1082. 4: 'null'}})
  1083. left = DataFrame({'a': {0: 'f',
  1084. 1: 'f',
  1085. 2: 'f',
  1086. 3: 'f',
  1087. 4: 'f'},
  1088. 'b': {0: 'g',
  1089. 1: 'g',
  1090. 2: 'g',
  1091. 3: 'g',
  1092. 4: 'g'}})
  1093. df = pd.merge(left, right, how='left', left_on='b', right_on='c')
  1094. # object-object
  1095. expected = df.copy()
  1096. # object-cat
  1097. # note that we propagate the category
  1098. # because we don't have any matching rows
  1099. cright = right.copy()
  1100. cright['d'] = cright['d'].astype('category')
  1101. result = pd.merge(left, cright, how='left', left_on='b', right_on='c')
  1102. expected['d'] = expected['d'].astype(CategoricalDtype(['null']))
  1103. tm.assert_frame_equal(result, expected)
  1104. # cat-object
  1105. cleft = left.copy()
  1106. cleft['b'] = cleft['b'].astype('category')
  1107. result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
  1108. tm.assert_frame_equal(result, expected)
  1109. # cat-cat
  1110. cright = right.copy()
  1111. cright['d'] = cright['d'].astype('category')
  1112. cleft = left.copy()
  1113. cleft['b'] = cleft['b'].astype('category')
  1114. result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
  1115. tm.assert_frame_equal(result, expected)
  1116. def tests_merge_categorical_unordered_equal(self):
  1117. # GH-19551
  1118. df1 = DataFrame({
  1119. 'Foo': Categorical(['A', 'B', 'C'], categories=['A', 'B', 'C']),
  1120. 'Left': ['A0', 'B0', 'C0'],
  1121. })
  1122. df2 = DataFrame({
  1123. 'Foo': Categorical(['C', 'B', 'A'], categories=['C', 'B', 'A']),
  1124. 'Right': ['C1', 'B1', 'A1'],
  1125. })
  1126. result = pd.merge(df1, df2, on=['Foo'])
  1127. expected = DataFrame({
  1128. 'Foo': pd.Categorical(['A', 'B', 'C']),
  1129. 'Left': ['A0', 'B0', 'C0'],
  1130. 'Right': ['A1', 'B1', 'C1'],
  1131. })
  1132. assert_frame_equal(result, expected)
  1133. def test_other_columns(self, left, right):
  1134. # non-merge columns should preserve if possible
  1135. right = right.assign(Z=right.Z.astype('category'))
  1136. merged = pd.merge(left, right, on='X')
  1137. result = merged.dtypes.sort_index()
  1138. expected = Series([CategoricalDtype(),
  1139. np.dtype('O'),
  1140. CategoricalDtype()],
  1141. index=['X', 'Y', 'Z'])
  1142. assert_series_equal(result, expected)
  1143. # categories are preserved
  1144. assert left.X.values.is_dtype_equal(merged.X.values)
  1145. assert right.Z.values.is_dtype_equal(merged.Z.values)
  1146. @pytest.mark.parametrize(
  1147. 'change', [lambda x: x,
  1148. lambda x: x.astype(CDT(['foo', 'bar', 'bah'])),
  1149. lambda x: x.astype(CDT(ordered=True))])
  1150. def test_dtype_on_merged_different(self, change, join_type, left, right):
  1151. # our merging columns, X now has 2 different dtypes
  1152. # so we must be object as a result
  1153. X = change(right.X.astype('object'))
  1154. right = right.assign(X=X)
  1155. assert is_categorical_dtype(left.X.values)
  1156. # assert not left.X.values.is_dtype_equal(right.X.values)
  1157. merged = pd.merge(left, right, on='X', how=join_type)
  1158. result = merged.dtypes.sort_index()
  1159. expected = Series([np.dtype('O'),
  1160. np.dtype('O'),
  1161. np.dtype('int64')],
  1162. index=['X', 'Y', 'Z'])
  1163. assert_series_equal(result, expected)
  1164. def test_self_join_multiple_categories(self):
  1165. # GH 16767
  1166. # non-duplicates should work with multiple categories
  1167. m = 5
  1168. df = pd.DataFrame({
  1169. 'a': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] * m,
  1170. 'b': ['t', 'w', 'x', 'y', 'z'] * 2 * m,
  1171. 'c': [letter
  1172. for each in ['m', 'n', 'u', 'p', 'o']
  1173. for letter in [each] * 2 * m],
  1174. 'd': [letter
  1175. for each in ['aa', 'bb', 'cc', 'dd', 'ee',
  1176. 'ff', 'gg', 'hh', 'ii', 'jj']
  1177. for letter in [each] * m]})
  1178. # change them all to categorical variables
  1179. df = df.apply(lambda x: x.astype('category'))
  1180. # self-join should equal ourselves
  1181. result = pd.merge(df, df, on=list(df.columns))
  1182. assert_frame_equal(result, df)
  1183. def test_dtype_on_categorical_dates(self):
  1184. # GH 16900
  1185. # dates should not be coerced to ints
  1186. df = pd.DataFrame(
  1187. [[date(2001, 1, 1), 1.1],
  1188. [date(2001, 1, 2), 1.3]],
  1189. columns=['date', 'num2']
  1190. )
  1191. df['date'] = df['date'].astype('category')
  1192. df2 = pd.DataFrame(
  1193. [[date(2001, 1, 1), 1.3],
  1194. [date(2001, 1, 3), 1.4]],
  1195. columns=['date', 'num4']
  1196. )
  1197. df2['date'] = df2['date'].astype('category')
  1198. expected_outer = pd.DataFrame([
  1199. [pd.Timestamp('2001-01-01'), 1.1, 1.3],
  1200. [pd.Timestamp('2001-01-02'), 1.3, np.nan],
  1201. [pd.Timestamp('2001-01-03'), np.nan, 1.4]],
  1202. columns=['date', 'num2', 'num4']
  1203. )
  1204. result_outer = pd.merge(df, df2, how='outer', on=['date'])
  1205. assert_frame_equal(result_outer, expected_outer)
  1206. expected_inner = pd.DataFrame(
  1207. [[pd.Timestamp('2001-01-01'), 1.1, 1.3]],
  1208. columns=['date', 'num2', 'num4']
  1209. )
  1210. result_inner = pd.merge(df, df2, how='inner', on=['date'])
  1211. assert_frame_equal(result_inner, expected_inner)
  1212. @pytest.mark.parametrize('ordered', [True, False])
  1213. @pytest.mark.parametrize('category_column,categories,expected_categories',
  1214. [([False, True, True, False], [True, False],
  1215. [True, False]),
  1216. ([2, 1, 1, 2], [1, 2], [1, 2]),
  1217. (['False', 'True', 'True', 'False'],
  1218. ['True', 'False'], ['True', 'False'])])
  1219. def test_merging_with_bool_or_int_cateorical_column(self, category_column,
  1220. categories,
  1221. expected_categories,
  1222. ordered):
  1223. # GH 17187
  1224. # merging with a boolean/int categorical column
  1225. df1 = pd.DataFrame({'id': [1, 2, 3, 4],
  1226. 'cat': category_column})
  1227. df1['cat'] = df1['cat'].astype(CDT(categories, ordered=ordered))
  1228. df2 = pd.DataFrame({'id': [2, 4], 'num': [1, 9]})
  1229. result = df1.merge(df2)
  1230. expected = pd.DataFrame({'id': [2, 4], 'cat': expected_categories,
  1231. 'num': [1, 9]})
  1232. expected['cat'] = expected['cat'].astype(
  1233. CDT(categories, ordered=ordered))
  1234. assert_frame_equal(expected, result)
  1235. def test_merge_on_int_array(self):
  1236. # GH 23020
  1237. df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
  1238. 'B': 1})
  1239. result = pd.merge(df, df, on='A')
  1240. expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
  1241. 'B_x': 1,
  1242. 'B_y': 1})
  1243. assert_frame_equal(result, expected)
  1244. @pytest.fixture
  1245. def left_df():
  1246. return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0])
  1247. @pytest.fixture
  1248. def right_df():
  1249. return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2])
  1250. class TestMergeOnIndexes(object):
  1251. @pytest.mark.parametrize(
  1252. "how, sort, expected",
  1253. [('inner', False, DataFrame({'a': [20, 10],
  1254. 'b': [200, 100]},
  1255. index=[2, 1])),
  1256. ('inner', True, DataFrame({'a': [10, 20],
  1257. 'b': [100, 200]},
  1258. index=[1, 2])),
  1259. ('left', False, DataFrame({'a': [20, 10, 0],
  1260. 'b': [200, 100, np.nan]},
  1261. index=[2, 1, 0])),
  1262. ('left', True, DataFrame({'a': [0, 10, 20],
  1263. 'b': [np.nan, 100, 200]},
  1264. index=[0, 1, 2])),
  1265. ('right', False, DataFrame({'a': [np.nan, 10, 20],
  1266. 'b': [300, 100, 200]},
  1267. index=[3, 1, 2])),
  1268. ('right', True, DataFrame({'a': [10, 20, np.nan],
  1269. 'b': [100, 200, 300]},
  1270. index=[1, 2, 3])),
  1271. ('outer', False, DataFrame({'a': [0, 10, 20, np.nan],
  1272. 'b': [np.nan, 100, 200, 300]},
  1273. index=[0, 1, 2, 3])),
  1274. ('outer', True, DataFrame({'a': [0, 10, 20, np.nan],
  1275. 'b': [np.nan, 100, 200, 300]},
  1276. index=[0, 1, 2, 3]))])
  1277. def test_merge_on_indexes(self, left_df, right_df, how, sort, expected):
  1278. result = pd.merge(left_df, right_df,
  1279. left_index=True,
  1280. right_index=True,
  1281. how=how,
  1282. sort=sort)
  1283. tm.assert_frame_equal(result, expected)
  1284. @pytest.mark.parametrize(
  1285. 'index', [
  1286. CategoricalIndex(['A', 'B'], categories=['A', 'B'], name='index_col'),
  1287. Float64Index([1.0, 2.0], name='index_col'),
  1288. Int64Index([1, 2], name='index_col'),
  1289. UInt64Index([1, 2], name='index_col'),
  1290. RangeIndex(start=0, stop=2, name='index_col'),
  1291. DatetimeIndex(["2018-01-01", "2018-01-02"], name='index_col'),
  1292. ], ids=lambda x: type(x).__name__)
  1293. def test_merge_index_types(index):
  1294. # gh-20777
  1295. # assert key access is consistent across index types
  1296. left = DataFrame({"left_data": [1, 2]}, index=index)
  1297. right = DataFrame({"right_data": [1.0, 2.0]}, index=index)
  1298. result = left.merge(right, on=['index_col'])
  1299. expected = DataFrame(
  1300. OrderedDict([('left_data', [1, 2]), ('right_data', [1.0, 2.0])]),
  1301. index=index)
  1302. assert_frame_equal(result, expected)
  1303. @pytest.mark.parametrize("on,left_on,right_on,left_index,right_index,nm", [
  1304. (['outer', 'inner'], None, None, False, False, 'B'),
  1305. (None, None, None, True, True, 'B'),
  1306. (None, ['outer', 'inner'], None, False, True, 'B'),
  1307. (None, None, ['outer', 'inner'], True, False, 'B'),
  1308. (['outer', 'inner'], None, None, False, False, None),
  1309. (None, None, None, True, True, None),
  1310. (None, ['outer', 'inner'], None, False, True, None),
  1311. (None, None, ['outer', 'inner'], True, False, None)])
  1312. def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
  1313. # GH 21220
  1314. a = pd.DataFrame({"A": [1, 2, 3, 4]},
  1315. index=pd.MultiIndex.from_product([['a', 'b'], [0, 1]],
  1316. names=['outer', 'inner']))
  1317. b = pd.Series([1, 2, 3, 4],
  1318. index=pd.MultiIndex.from_product([['a', 'b'], [1, 2]],
  1319. names=['outer', 'inner']), name=nm)
  1320. expected = pd.DataFrame({"A": [2, 4], "B": [1, 3]},
  1321. index=pd.MultiIndex.from_product([['a', 'b'], [1]],
  1322. names=['outer', 'inner']))
  1323. if nm is not None:
  1324. result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on,
  1325. left_index=left_index, right_index=right_index)
  1326. tm.assert_frame_equal(result, expected)
  1327. else:
  1328. msg = "Cannot merge a Series without a name"
  1329. with pytest.raises(ValueError, match=msg):
  1330. result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on,
  1331. left_index=left_index, right_index=right_index)