1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609 |
- # pylint: disable=E1103
- from collections import OrderedDict
- from datetime import date, datetime
- import random
- import re
- import numpy as np
- from numpy import nan
- import pytest
- from pandas.compat import lrange
- from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype
- from pandas.core.dtypes.dtypes import CategoricalDtype
- import pandas as pd
- from pandas import (
- Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index,
- Int64Index, MultiIndex, RangeIndex, Series, UInt64Index)
- from pandas.api.types import CategoricalDtype as CDT
- from pandas.core.reshape.concat import concat
- from pandas.core.reshape.merge import MergeError, merge
- import pandas.util.testing as tm
- from pandas.util.testing import assert_frame_equal, assert_series_equal
- N = 50
- NGROUPS = 8
- def get_test_data(ngroups=NGROUPS, n=N):
- unique_groups = lrange(ngroups)
- arr = np.asarray(np.tile(unique_groups, n // ngroups))
- if len(arr) < n:
- arr = np.asarray(list(arr) + unique_groups[:n - len(arr)])
- random.shuffle(arr)
- return arr
- def get_series():
- return [
- pd.Series([1], dtype='int64'),
- pd.Series([1], dtype='Int64'),
- pd.Series([1.23]),
- pd.Series(['foo']),
- pd.Series([True]),
- pd.Series([pd.Timestamp('2018-01-01')]),
- pd.Series([pd.Timestamp('2018-01-01', tz='US/Eastern')]),
- ]
- def get_series_na():
- return [
- pd.Series([np.nan], dtype='Int64'),
- pd.Series([np.nan], dtype='float'),
- pd.Series([np.nan], dtype='object'),
- pd.Series([pd.NaT]),
- ]
- @pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name)
- def series_of_dtype(request):
- """
- A parametrized fixture returning a variety of Series of different
- dtypes
- """
- return request.param
- @pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name)
- def series_of_dtype2(request):
- """
- A duplicate of the series_of_dtype fixture, so that it can be used
- twice by a single function
- """
- return request.param
- @pytest.fixture(params=get_series_na(), ids=lambda x: x.dtype.name)
- def series_of_dtype_all_na(request):
- """
- A parametrized fixture returning a variety of Series with all NA
- values
- """
- return request.param
- class TestMerge(object):
- def setup_method(self, method):
- # aggregate multiple columns
- self.df = DataFrame({'key1': get_test_data(),
- 'key2': get_test_data(),
- 'data1': np.random.randn(N),
- 'data2': np.random.randn(N)})
- # exclude a couple keys for fun
- self.df = self.df[self.df['key2'] > 1]
- self.df2 = DataFrame({'key1': get_test_data(n=N // 5),
- 'key2': get_test_data(ngroups=NGROUPS // 2,
- n=N // 5),
- 'value': np.random.randn(N // 5)})
- self.left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
- 'v1': np.random.randn(7)})
- self.right = DataFrame({'v2': np.random.randn(4)},
- index=['d', 'b', 'c', 'a'])
- def test_merge_inner_join_empty(self):
- # GH 15328
- df_empty = pd.DataFrame()
- df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64')
- result = pd.merge(df_empty, df_a, left_index=True, right_index=True)
- expected = pd.DataFrame({'a': []}, index=[], dtype='int64')
- assert_frame_equal(result, expected)
- def test_merge_common(self):
- joined = merge(self.df, self.df2)
- exp = merge(self.df, self.df2, on=['key1', 'key2'])
- tm.assert_frame_equal(joined, exp)
- def test_merge_index_as_on_arg(self):
- # GH14355
- left = self.df.set_index('key1')
- right = self.df2.set_index('key1')
- result = merge(left, right, on='key1')
- expected = merge(self.df, self.df2, on='key1').set_index('key1')
- assert_frame_equal(result, expected)
- def test_merge_index_singlekey_right_vs_left(self):
- left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
- 'v1': np.random.randn(7)})
- right = DataFrame({'v2': np.random.randn(4)},
- index=['d', 'b', 'c', 'a'])
- merged1 = merge(left, right, left_on='key',
- right_index=True, how='left', sort=False)
- merged2 = merge(right, left, right_on='key',
- left_index=True, how='right', sort=False)
- assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
- merged1 = merge(left, right, left_on='key',
- right_index=True, how='left', sort=True)
- merged2 = merge(right, left, right_on='key',
- left_index=True, how='right', sort=True)
- assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
- def test_merge_index_singlekey_inner(self):
- left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
- 'v1': np.random.randn(7)})
- right = DataFrame({'v2': np.random.randn(4)},
- index=['d', 'b', 'c', 'a'])
- # inner join
- result = merge(left, right, left_on='key', right_index=True,
- how='inner')
- expected = left.join(right, on='key').loc[result.index]
- assert_frame_equal(result, expected)
- result = merge(right, left, right_on='key', left_index=True,
- how='inner')
- expected = left.join(right, on='key').loc[result.index]
- assert_frame_equal(result, expected.loc[:, result.columns])
- def test_merge_misspecified(self):
- msg = "Must pass right_on or right_index=True"
- with pytest.raises(pd.errors.MergeError, match=msg):
- merge(self.left, self.right, left_index=True)
- msg = "Must pass left_on or left_index=True"
- with pytest.raises(pd.errors.MergeError, match=msg):
- merge(self.left, self.right, right_index=True)
- msg = ('Can only pass argument "on" OR "left_on" and "right_on", not'
- ' a combination of both')
- with pytest.raises(pd.errors.MergeError, match=msg):
- merge(self.left, self.left, left_on='key', on='key')
- msg = r"len\(right_on\) must equal len\(left_on\)"
- with pytest.raises(ValueError, match=msg):
- merge(self.df, self.df2, left_on=['key1'],
- right_on=['key1', 'key2'])
- def test_index_and_on_parameters_confusion(self):
- msg = ("right_index parameter must be of type bool, not"
- r" <(class|type) 'list'>")
- with pytest.raises(ValueError, match=msg):
- merge(self.df, self.df2, how='left',
- left_index=False, right_index=['key1', 'key2'])
- msg = ("left_index parameter must be of type bool, not "
- r"<(class|type) 'list'>")
- with pytest.raises(ValueError, match=msg):
- merge(self.df, self.df2, how='left',
- left_index=['key1', 'key2'], right_index=False)
- with pytest.raises(ValueError, match=msg):
- merge(self.df, self.df2, how='left',
- left_index=['key1', 'key2'], right_index=['key1', 'key2'])
- def test_merge_overlap(self):
- merged = merge(self.left, self.left, on='key')
- exp_len = (self.left['key'].value_counts() ** 2).sum()
- assert len(merged) == exp_len
- assert 'v1_x' in merged
- assert 'v1_y' in merged
- def test_merge_different_column_key_names(self):
- left = DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
- 'value': [1, 2, 3, 4]})
- right = DataFrame({'rkey': ['foo', 'bar', 'qux', 'foo'],
- 'value': [5, 6, 7, 8]})
- merged = left.merge(right, left_on='lkey', right_on='rkey',
- how='outer', sort=True)
- exp = pd.Series(['bar', 'baz', 'foo', 'foo', 'foo', 'foo', np.nan],
- name='lkey')
- tm.assert_series_equal(merged['lkey'], exp)
- exp = pd.Series(['bar', np.nan, 'foo', 'foo', 'foo', 'foo', 'qux'],
- name='rkey')
- tm.assert_series_equal(merged['rkey'], exp)
- exp = pd.Series([2, 3, 1, 1, 4, 4, np.nan], name='value_x')
- tm.assert_series_equal(merged['value_x'], exp)
- exp = pd.Series([6, np.nan, 5, 8, 5, 8, 7], name='value_y')
- tm.assert_series_equal(merged['value_y'], exp)
- def test_merge_copy(self):
- left = DataFrame({'a': 0, 'b': 1}, index=lrange(10))
- right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10))
- merged = merge(left, right, left_index=True,
- right_index=True, copy=True)
- merged['a'] = 6
- assert (left['a'] == 0).all()
- merged['d'] = 'peekaboo'
- assert (right['d'] == 'bar').all()
- def test_merge_nocopy(self):
- left = DataFrame({'a': 0, 'b': 1}, index=lrange(10))
- right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10))
- merged = merge(left, right, left_index=True,
- right_index=True, copy=False)
- merged['a'] = 6
- assert (left['a'] == 6).all()
- merged['d'] = 'peekaboo'
- assert (right['d'] == 'peekaboo').all()
- def test_intelligently_handle_join_key(self):
- # #733, be a bit more 1337 about not returning unconsolidated DataFrame
- left = DataFrame({'key': [1, 1, 2, 2, 3],
- 'value': lrange(5)}, columns=['value', 'key'])
- right = DataFrame({'key': [1, 1, 2, 3, 4, 5],
- 'rvalue': lrange(6)})
- joined = merge(left, right, on='key', how='outer')
- expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5],
- 'value': np.array([0, 0, 1, 1, 2, 3, 4,
- np.nan, np.nan]),
- 'rvalue': [0, 1, 0, 1, 2, 2, 3, 4, 5]},
- columns=['value', 'key', 'rvalue'])
- assert_frame_equal(joined, expected)
- def test_merge_join_key_dtype_cast(self):
- # #8596
- df1 = DataFrame({'key': [1], 'v1': [10]})
- df2 = DataFrame({'key': [2], 'v1': [20]})
- df = merge(df1, df2, how='outer')
- assert df['key'].dtype == 'int64'
- df1 = DataFrame({'key': [True], 'v1': [1]})
- df2 = DataFrame({'key': [False], 'v1': [0]})
- df = merge(df1, df2, how='outer')
- # GH13169
- # this really should be bool
- assert df['key'].dtype == 'object'
- df1 = DataFrame({'val': [1]})
- df2 = DataFrame({'val': [2]})
- lkey = np.array([1])
- rkey = np.array([2])
- df = merge(df1, df2, left_on=lkey, right_on=rkey, how='outer')
- assert df['key_0'].dtype == 'int64'
- def test_handle_join_key_pass_array(self):
- left = DataFrame({'key': [1, 1, 2, 2, 3],
- 'value': lrange(5)}, columns=['value', 'key'])
- right = DataFrame({'rvalue': lrange(6)})
- key = np.array([1, 1, 2, 3, 4, 5])
- merged = merge(left, right, left_on='key', right_on=key, how='outer')
- merged2 = merge(right, left, left_on=key, right_on='key', how='outer')
- assert_series_equal(merged['key'], merged2['key'])
- assert merged['key'].notna().all()
- assert merged2['key'].notna().all()
- left = DataFrame({'value': lrange(5)}, columns=['value'])
- right = DataFrame({'rvalue': lrange(6)})
- lkey = np.array([1, 1, 2, 2, 3])
- rkey = np.array([1, 1, 2, 3, 4, 5])
- merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer')
- tm.assert_series_equal(merged['key_0'], Series([1, 1, 1, 1, 2,
- 2, 3, 4, 5],
- name='key_0'))
- left = DataFrame({'value': lrange(3)})
- right = DataFrame({'rvalue': lrange(6)})
- key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64)
- merged = merge(left, right, left_index=True, right_on=key, how='outer')
- tm.assert_series_equal(merged['key_0'], Series(key, name='key_0'))
- def test_no_overlap_more_informative_error(self):
- dt = datetime.now()
- df1 = DataFrame({'x': ['a']}, index=[dt])
- df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt])
- msg = ('No common columns to perform merge on. '
- 'Merge options: left_on={lon}, right_on={ron}, '
- 'left_index={lidx}, right_index={ridx}'
- .format(lon=None, ron=None, lidx=False, ridx=False))
- with pytest.raises(MergeError, match=msg):
- merge(df1, df2)
- def test_merge_non_unique_indexes(self):
- dt = datetime(2012, 5, 1)
- dt2 = datetime(2012, 5, 2)
- dt3 = datetime(2012, 5, 3)
- dt4 = datetime(2012, 5, 4)
- df1 = DataFrame({'x': ['a']}, index=[dt])
- df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt])
- _check_merge(df1, df2)
- # Not monotonic
- df1 = DataFrame({'x': ['a', 'b', 'q']}, index=[dt2, dt, dt4])
- df2 = DataFrame({'y': ['c', 'd', 'e', 'f', 'g', 'h']},
- index=[dt3, dt3, dt2, dt2, dt, dt])
- _check_merge(df1, df2)
- df1 = DataFrame({'x': ['a', 'b']}, index=[dt, dt])
- df2 = DataFrame({'y': ['c', 'd']}, index=[dt, dt])
- _check_merge(df1, df2)
- def test_merge_non_unique_index_many_to_many(self):
- dt = datetime(2012, 5, 1)
- dt2 = datetime(2012, 5, 2)
- dt3 = datetime(2012, 5, 3)
- df1 = DataFrame({'x': ['a', 'b', 'c', 'd']},
- index=[dt2, dt2, dt, dt])
- df2 = DataFrame({'y': ['e', 'f', 'g', ' h', 'i']},
- index=[dt2, dt2, dt3, dt, dt])
- _check_merge(df1, df2)
- def test_left_merge_empty_dataframe(self):
- left = DataFrame({'key': [1], 'value': [2]})
- right = DataFrame({'key': []})
- result = merge(left, right, on='key', how='left')
- assert_frame_equal(result, left)
- result = merge(right, left, on='key', how='right')
- assert_frame_equal(result, left)
- @pytest.mark.parametrize('kwarg',
- [dict(left_index=True, right_index=True),
- dict(left_index=True, right_on='x'),
- dict(left_on='a', right_index=True),
- dict(left_on='a', right_on='x')])
- def test_merge_left_empty_right_empty(self, join_type, kwarg):
- # GH 10824
- left = pd.DataFrame([], columns=['a', 'b', 'c'])
- right = pd.DataFrame([], columns=['x', 'y', 'z'])
- exp_in = pd.DataFrame([], columns=['a', 'b', 'c', 'x', 'y', 'z'],
- index=pd.Index([], dtype=object),
- dtype=object)
- result = pd.merge(left, right, how=join_type, **kwarg)
- tm.assert_frame_equal(result, exp_in)
- def test_merge_left_empty_right_notempty(self):
- # GH 10824
- left = pd.DataFrame([], columns=['a', 'b', 'c'])
- right = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
- columns=['x', 'y', 'z'])
- exp_out = pd.DataFrame({'a': np.array([np.nan] * 3, dtype=object),
- 'b': np.array([np.nan] * 3, dtype=object),
- 'c': np.array([np.nan] * 3, dtype=object),
- 'x': [1, 4, 7],
- 'y': [2, 5, 8],
- 'z': [3, 6, 9]},
- columns=['a', 'b', 'c', 'x', 'y', 'z'])
- exp_in = exp_out[0:0] # make empty DataFrame keeping dtype
- # result will have object dtype
- exp_in.index = exp_in.index.astype(object)
- def check1(exp, kwarg):
- result = pd.merge(left, right, how='inner', **kwarg)
- tm.assert_frame_equal(result, exp)
- result = pd.merge(left, right, how='left', **kwarg)
- tm.assert_frame_equal(result, exp)
- def check2(exp, kwarg):
- result = pd.merge(left, right, how='right', **kwarg)
- tm.assert_frame_equal(result, exp)
- result = pd.merge(left, right, how='outer', **kwarg)
- tm.assert_frame_equal(result, exp)
- for kwarg in [dict(left_index=True, right_index=True),
- dict(left_index=True, right_on='x')]:
- check1(exp_in, kwarg)
- check2(exp_out, kwarg)
- kwarg = dict(left_on='a', right_index=True)
- check1(exp_in, kwarg)
- exp_out['a'] = [0, 1, 2]
- check2(exp_out, kwarg)
- kwarg = dict(left_on='a', right_on='x')
- check1(exp_in, kwarg)
- exp_out['a'] = np.array([np.nan] * 3, dtype=object)
- check2(exp_out, kwarg)
- def test_merge_left_notempty_right_empty(self):
- # GH 10824
- left = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
- columns=['a', 'b', 'c'])
- right = pd.DataFrame([], columns=['x', 'y', 'z'])
- exp_out = pd.DataFrame({'a': [1, 4, 7],
- 'b': [2, 5, 8],
- 'c': [3, 6, 9],
- 'x': np.array([np.nan] * 3, dtype=object),
- 'y': np.array([np.nan] * 3, dtype=object),
- 'z': np.array([np.nan] * 3, dtype=object)},
- columns=['a', 'b', 'c', 'x', 'y', 'z'])
- exp_in = exp_out[0:0] # make empty DataFrame keeping dtype
- # result will have object dtype
- exp_in.index = exp_in.index.astype(object)
- def check1(exp, kwarg):
- result = pd.merge(left, right, how='inner', **kwarg)
- tm.assert_frame_equal(result, exp)
- result = pd.merge(left, right, how='right', **kwarg)
- tm.assert_frame_equal(result, exp)
- def check2(exp, kwarg):
- result = pd.merge(left, right, how='left', **kwarg)
- tm.assert_frame_equal(result, exp)
- result = pd.merge(left, right, how='outer', **kwarg)
- tm.assert_frame_equal(result, exp)
- for kwarg in [dict(left_index=True, right_index=True),
- dict(left_index=True, right_on='x'),
- dict(left_on='a', right_index=True),
- dict(left_on='a', right_on='x')]:
- check1(exp_in, kwarg)
- check2(exp_out, kwarg)
- def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2):
- # GH 25183
- df = pd.DataFrame({'key': series_of_dtype, 'value': series_of_dtype2},
- columns=['key', 'value'])
- df_empty = df[:0]
- expected = pd.DataFrame({
- 'value_x': pd.Series(dtype=df.dtypes['value']),
- 'key': pd.Series(dtype=df.dtypes['key']),
- 'value_y': pd.Series(dtype=df.dtypes['value']),
- }, columns=['value_x', 'key', 'value_y'])
- actual = df_empty.merge(df, on='key')
- assert_frame_equal(actual, expected)
- def test_merge_all_na_column(self, series_of_dtype,
- series_of_dtype_all_na):
- # GH 25183
- df_left = pd.DataFrame(
- {'key': series_of_dtype, 'value': series_of_dtype_all_na},
- columns=['key', 'value'])
- df_right = pd.DataFrame(
- {'key': series_of_dtype, 'value': series_of_dtype_all_na},
- columns=['key', 'value'])
- expected = pd.DataFrame({
- 'key': series_of_dtype,
- 'value_x': series_of_dtype_all_na,
- 'value_y': series_of_dtype_all_na,
- }, columns=['key', 'value_x', 'value_y'])
- actual = df_left.merge(df_right, on='key')
- assert_frame_equal(actual, expected)
- def test_merge_nosort(self):
- # #2098, anything to do?
- from datetime import datetime
- d = {"var1": np.random.randint(0, 10, size=10),
- "var2": np.random.randint(0, 10, size=10),
- "var3": [datetime(2012, 1, 12),
- datetime(2011, 2, 4),
- datetime(2010, 2, 3),
- datetime(2012, 1, 12),
- datetime(2011, 2, 4),
- datetime(2012, 4, 3),
- datetime(2012, 3, 4),
- datetime(2008, 5, 1),
- datetime(2010, 2, 3),
- datetime(2012, 2, 3)]}
- df = DataFrame.from_dict(d)
- var3 = df.var3.unique()
- var3.sort()
- new = DataFrame.from_dict({"var3": var3,
- "var8": np.random.random(7)})
- result = df.merge(new, on="var3", sort=False)
- exp = merge(df, new, on='var3', sort=False)
- assert_frame_equal(result, exp)
- assert (df.var3.unique() == result.var3.unique()).all()
- def test_merge_nan_right(self):
- df1 = DataFrame({"i1": [0, 1], "i2": [0, 1]})
- df2 = DataFrame({"i1": [0], "i3": [0]})
- result = df1.join(df2, on="i1", rsuffix="_")
- expected = (DataFrame({'i1': {0: 0.0, 1: 1}, 'i2': {0: 0, 1: 1},
- 'i1_': {0: 0, 1: np.nan},
- 'i3': {0: 0.0, 1: np.nan},
- None: {0: 0, 1: 0}})
- .set_index(None)
- .reset_index()[['i1', 'i2', 'i1_', 'i3']])
- assert_frame_equal(result, expected, check_dtype=False)
- df1 = DataFrame({"i1": [0, 1], "i2": [0.5, 1.5]})
- df2 = DataFrame({"i1": [0], "i3": [0.7]})
- result = df1.join(df2, rsuffix="_", on='i1')
- expected = (DataFrame({'i1': {0: 0, 1: 1}, 'i1_': {0: 0.0, 1: nan},
- 'i2': {0: 0.5, 1: 1.5},
- 'i3': {0: 0.69999999999999996,
- 1: nan}})
- [['i1', 'i2', 'i1_', 'i3']])
- assert_frame_equal(result, expected)
- def test_merge_type(self):
- class NotADataFrame(DataFrame):
- @property
- def _constructor(self):
- return NotADataFrame
- nad = NotADataFrame(self.df)
- result = nad.merge(self.df2, on='key1')
- assert isinstance(result, NotADataFrame)
- def test_join_append_timedeltas(self):
- import datetime as dt
- from pandas import NaT
- # timedelta64 issues with join/merge
- # GH 5695
- d = {'d': dt.datetime(2013, 11, 5, 5, 56), 't': dt.timedelta(0, 22500)}
- df = DataFrame(columns=list('dt'))
- df = df.append(d, ignore_index=True)
- result = df.append(d, ignore_index=True)
- expected = DataFrame({'d': [dt.datetime(2013, 11, 5, 5, 56),
- dt.datetime(2013, 11, 5, 5, 56)],
- 't': [dt.timedelta(0, 22500),
- dt.timedelta(0, 22500)]})
- assert_frame_equal(result, expected)
- td = np.timedelta64(300000000)
- lhs = DataFrame(Series([td, td], index=["A", "B"]))
- rhs = DataFrame(Series([td], index=["A"]))
- result = lhs.join(rhs, rsuffix='r', how="left")
- expected = DataFrame({'0': Series([td, td], index=list('AB')),
- '0r': Series([td, NaT], index=list('AB'))})
- assert_frame_equal(result, expected)
- def test_other_datetime_unit(self):
- # GH 13389
- df1 = pd.DataFrame({'entity_id': [101, 102]})
- s = pd.Series([None, None], index=[101, 102], name='days')
- for dtype in ['datetime64[D]', 'datetime64[h]', 'datetime64[m]',
- 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]',
- 'datetime64[ns]']:
- df2 = s.astype(dtype).to_frame('days')
- # coerces to datetime64[ns], thus sholuld not be affected
- assert df2['days'].dtype == 'datetime64[ns]'
- result = df1.merge(df2, left_on='entity_id', right_index=True)
- exp = pd.DataFrame({'entity_id': [101, 102],
- 'days': np.array(['nat', 'nat'],
- dtype='datetime64[ns]')},
- columns=['entity_id', 'days'])
- tm.assert_frame_equal(result, exp)
- @pytest.mark.parametrize("unit", ['D', 'h', 'm', 's', 'ms', 'us', 'ns'])
- def test_other_timedelta_unit(self, unit):
- # GH 13389
- df1 = pd.DataFrame({'entity_id': [101, 102]})
- s = pd.Series([None, None], index=[101, 102], name='days')
- dtype = "m8[{}]".format(unit)
- df2 = s.astype(dtype).to_frame('days')
- assert df2['days'].dtype == 'm8[ns]'
- result = df1.merge(df2, left_on='entity_id', right_index=True)
- exp = pd.DataFrame({'entity_id': [101, 102],
- 'days': np.array(['nat', 'nat'],
- dtype=dtype)},
- columns=['entity_id', 'days'])
- tm.assert_frame_equal(result, exp)
- def test_overlapping_columns_error_message(self):
- df = DataFrame({'key': [1, 2, 3],
- 'v1': [4, 5, 6],
- 'v2': [7, 8, 9]})
- df2 = DataFrame({'key': [1, 2, 3],
- 'v1': [4, 5, 6],
- 'v2': [7, 8, 9]})
- df.columns = ['key', 'foo', 'foo']
- df2.columns = ['key', 'bar', 'bar']
- expected = DataFrame({'key': [1, 2, 3],
- 'v1': [4, 5, 6],
- 'v2': [7, 8, 9],
- 'v3': [4, 5, 6],
- 'v4': [7, 8, 9]})
- expected.columns = ['key', 'foo', 'foo', 'bar', 'bar']
- assert_frame_equal(merge(df, df2), expected)
- # #2649, #10639
- df2.columns = ['key1', 'foo', 'foo']
- msg = (r"Data columns not unique: Index\(\[u?'foo', u?'foo'\],"
- r" dtype='object'\)")
- with pytest.raises(MergeError, match=msg):
- merge(df, df2)
- def test_merge_on_datetime64tz(self):
- # GH11405
- left = pd.DataFrame({'key': pd.date_range('20151010', periods=2,
- tz='US/Eastern'),
- 'value': [1, 2]})
- right = pd.DataFrame({'key': pd.date_range('20151011', periods=3,
- tz='US/Eastern'),
- 'value': [1, 2, 3]})
- expected = DataFrame({'key': pd.date_range('20151010', periods=4,
- tz='US/Eastern'),
- 'value_x': [1, 2, np.nan, np.nan],
- 'value_y': [np.nan, 1, 2, 3]})
- result = pd.merge(left, right, on='key', how='outer')
- assert_frame_equal(result, expected)
- left = pd.DataFrame({'key': [1, 2],
- 'value': pd.date_range('20151010', periods=2,
- tz='US/Eastern')})
- right = pd.DataFrame({'key': [2, 3],
- 'value': pd.date_range('20151011', periods=2,
- tz='US/Eastern')})
- expected = DataFrame({
- 'key': [1, 2, 3],
- 'value_x': list(pd.date_range('20151010', periods=2,
- tz='US/Eastern')) + [pd.NaT],
- 'value_y': [pd.NaT] + list(pd.date_range('20151011', periods=2,
- tz='US/Eastern'))})
- result = pd.merge(left, right, on='key', how='outer')
- assert_frame_equal(result, expected)
- assert result['value_x'].dtype == 'datetime64[ns, US/Eastern]'
- assert result['value_y'].dtype == 'datetime64[ns, US/Eastern]'
- def test_merge_on_datetime64tz_empty(self):
- # https://github.com/pandas-dev/pandas/issues/25014
- dtz = pd.DatetimeTZDtype(tz='UTC')
- right = pd.DataFrame({'date': [pd.Timestamp('2018', tz=dtz.tz)],
- 'value': [4.0],
- 'date2': [pd.Timestamp('2019', tz=dtz.tz)]},
- columns=['date', 'value', 'date2'])
- left = right[:0]
- result = left.merge(right, on='date')
- expected = pd.DataFrame({
- 'value_x': pd.Series(dtype=float),
- 'date2_x': pd.Series(dtype=dtz),
- 'date': pd.Series(dtype=dtz),
- 'value_y': pd.Series(dtype=float),
- 'date2_y': pd.Series(dtype=dtz),
- }, columns=['value_x', 'date2_x', 'date', 'value_y', 'date2_y'])
- tm.assert_frame_equal(result, expected)
- def test_merge_datetime64tz_with_dst_transition(self):
- # GH 18885
- df1 = pd.DataFrame(pd.date_range(
- '2017-10-29 01:00', periods=4, freq='H', tz='Europe/Madrid'),
- columns=['date'])
- df1['value'] = 1
- df2 = pd.DataFrame({
- 'date': pd.to_datetime([
- '2017-10-29 03:00:00', '2017-10-29 04:00:00',
- '2017-10-29 05:00:00'
- ]),
- 'value': 2
- })
- df2['date'] = df2['date'].dt.tz_localize('UTC').dt.tz_convert(
- 'Europe/Madrid')
- result = pd.merge(df1, df2, how='outer', on='date')
- expected = pd.DataFrame({
- 'date': pd.date_range(
- '2017-10-29 01:00', periods=7, freq='H', tz='Europe/Madrid'),
- 'value_x': [1] * 4 + [np.nan] * 3,
- 'value_y': [np.nan] * 4 + [2] * 3
- })
- assert_frame_equal(result, expected)
- def test_merge_non_unique_period_index(self):
- # GH #16871
- index = pd.period_range('2016-01-01', periods=16, freq='M')
- df = DataFrame([i for i in range(len(index))],
- index=index, columns=['pnum'])
- df2 = concat([df, df])
- result = df.merge(df2, left_index=True, right_index=True, how='inner')
- expected = DataFrame(
- np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
- columns=['pnum_x', 'pnum_y'], index=df2.sort_index().index)
- tm.assert_frame_equal(result, expected)
- def test_merge_on_periods(self):
- left = pd.DataFrame({'key': pd.period_range('20151010', periods=2,
- freq='D'),
- 'value': [1, 2]})
- right = pd.DataFrame({'key': pd.period_range('20151011', periods=3,
- freq='D'),
- 'value': [1, 2, 3]})
- expected = DataFrame({'key': pd.period_range('20151010', periods=4,
- freq='D'),
- 'value_x': [1, 2, np.nan, np.nan],
- 'value_y': [np.nan, 1, 2, 3]})
- result = pd.merge(left, right, on='key', how='outer')
- assert_frame_equal(result, expected)
- left = pd.DataFrame({'key': [1, 2],
- 'value': pd.period_range('20151010', periods=2,
- freq='D')})
- right = pd.DataFrame({'key': [2, 3],
- 'value': pd.period_range('20151011', periods=2,
- freq='D')})
- exp_x = pd.period_range('20151010', periods=2, freq='D')
- exp_y = pd.period_range('20151011', periods=2, freq='D')
- expected = DataFrame({'key': [1, 2, 3],
- 'value_x': list(exp_x) + [pd.NaT],
- 'value_y': [pd.NaT] + list(exp_y)})
- result = pd.merge(left, right, on='key', how='outer')
- assert_frame_equal(result, expected)
- assert result['value_x'].dtype == 'Period[D]'
- assert result['value_y'].dtype == 'Period[D]'
- def test_indicator(self):
- # PR #10054. xref #7412 and closes #8790.
- df1 = DataFrame({'col1': [0, 1], 'col_conflict': [1, 2],
- 'col_left': ['a', 'b']})
- df1_copy = df1.copy()
- df2 = DataFrame({'col1': [1, 2, 3, 4, 5],
- 'col_conflict': [1, 2, 3, 4, 5],
- 'col_right': [2, 2, 2, 2, 2]})
- df2_copy = df2.copy()
- df_result = DataFrame({
- 'col1': [0, 1, 2, 3, 4, 5],
- 'col_conflict_x': [1, 2, np.nan, np.nan, np.nan, np.nan],
- 'col_left': ['a', 'b', np.nan, np.nan, np.nan, np.nan],
- 'col_conflict_y': [np.nan, 1, 2, 3, 4, 5],
- 'col_right': [np.nan, 2, 2, 2, 2, 2]})
- df_result['_merge'] = Categorical(
- ['left_only', 'both', 'right_only',
- 'right_only', 'right_only', 'right_only'],
- categories=['left_only', 'right_only', 'both'])
- df_result = df_result[['col1', 'col_conflict_x', 'col_left',
- 'col_conflict_y', 'col_right', '_merge']]
- test = merge(df1, df2, on='col1', how='outer', indicator=True)
- assert_frame_equal(test, df_result)
- test = df1.merge(df2, on='col1', how='outer', indicator=True)
- assert_frame_equal(test, df_result)
- # No side effects
- assert_frame_equal(df1, df1_copy)
- assert_frame_equal(df2, df2_copy)
- # Check with custom name
- df_result_custom_name = df_result
- df_result_custom_name = df_result_custom_name.rename(
- columns={'_merge': 'custom_name'})
- test_custom_name = merge(
- df1, df2, on='col1', how='outer', indicator='custom_name')
- assert_frame_equal(test_custom_name, df_result_custom_name)
- test_custom_name = df1.merge(
- df2, on='col1', how='outer', indicator='custom_name')
- assert_frame_equal(test_custom_name, df_result_custom_name)
- # Check only accepts strings and booleans
- msg = "indicator option can only accept boolean or string arguments"
- with pytest.raises(ValueError, match=msg):
- merge(df1, df2, on='col1', how='outer', indicator=5)
- with pytest.raises(ValueError, match=msg):
- df1.merge(df2, on='col1', how='outer', indicator=5)
- # Check result integrity
- test2 = merge(df1, df2, on='col1', how='left', indicator=True)
- assert (test2._merge != 'right_only').all()
- test2 = df1.merge(df2, on='col1', how='left', indicator=True)
- assert (test2._merge != 'right_only').all()
- test3 = merge(df1, df2, on='col1', how='right', indicator=True)
- assert (test3._merge != 'left_only').all()
- test3 = df1.merge(df2, on='col1', how='right', indicator=True)
- assert (test3._merge != 'left_only').all()
- test4 = merge(df1, df2, on='col1', how='inner', indicator=True)
- assert (test4._merge == 'both').all()
- test4 = df1.merge(df2, on='col1', how='inner', indicator=True)
- assert (test4._merge == 'both').all()
- # Check if working name in df
- for i in ['_right_indicator', '_left_indicator', '_merge']:
- df_badcolumn = DataFrame({'col1': [1, 2], i: [2, 2]})
- msg = ("Cannot use `indicator=True` option when data contains a"
- " column named {}|"
- "Cannot use name of an existing column for indicator"
- " column").format(i)
- with pytest.raises(ValueError, match=msg):
- merge(df1, df_badcolumn, on='col1',
- how='outer', indicator=True)
- with pytest.raises(ValueError, match=msg):
- df1.merge(df_badcolumn, on='col1', how='outer', indicator=True)
- # Check for name conflict with custom name
- df_badcolumn = DataFrame(
- {'col1': [1, 2], 'custom_column_name': [2, 2]})
- msg = "Cannot use name of an existing column for indicator column"
- with pytest.raises(ValueError, match=msg):
- merge(df1, df_badcolumn, on='col1', how='outer',
- indicator='custom_column_name')
- with pytest.raises(ValueError, match=msg):
- df1.merge(df_badcolumn, on='col1', how='outer',
- indicator='custom_column_name')
- # Merge on multiple columns
- df3 = DataFrame({'col1': [0, 1], 'col2': ['a', 'b']})
- df4 = DataFrame({'col1': [1, 1, 3], 'col2': ['b', 'x', 'y']})
- hand_coded_result = DataFrame({'col1': [0, 1, 1, 3],
- 'col2': ['a', 'b', 'x', 'y']})
- hand_coded_result['_merge'] = Categorical(
- ['left_only', 'both', 'right_only', 'right_only'],
- categories=['left_only', 'right_only', 'both'])
- test5 = merge(df3, df4, on=['col1', 'col2'],
- how='outer', indicator=True)
- assert_frame_equal(test5, hand_coded_result)
- test5 = df3.merge(df4, on=['col1', 'col2'],
- how='outer', indicator=True)
- assert_frame_equal(test5, hand_coded_result)
- def test_validation(self):
- left = DataFrame({'a': ['a', 'b', 'c', 'd'],
- 'b': ['cat', 'dog', 'weasel', 'horse']},
- index=range(4))
- right = DataFrame({'a': ['a', 'b', 'c', 'd', 'e'],
- 'c': ['meow', 'bark', 'um... weasel noise?',
- 'nay', 'chirp']},
- index=range(5))
- # Make sure no side effects.
- left_copy = left.copy()
- right_copy = right.copy()
- result = merge(left, right, left_index=True, right_index=True,
- validate='1:1')
- assert_frame_equal(left, left_copy)
- assert_frame_equal(right, right_copy)
- # make sure merge still correct
- expected = DataFrame({'a_x': ['a', 'b', 'c', 'd'],
- 'b': ['cat', 'dog', 'weasel', 'horse'],
- 'a_y': ['a', 'b', 'c', 'd'],
- 'c': ['meow', 'bark', 'um... weasel noise?',
- 'nay']},
- index=range(4),
- columns=['a_x', 'b', 'a_y', 'c'])
- result = merge(left, right, left_index=True, right_index=True,
- validate='one_to_one')
- assert_frame_equal(result, expected)
- expected_2 = DataFrame({'a': ['a', 'b', 'c', 'd'],
- 'b': ['cat', 'dog', 'weasel', 'horse'],
- 'c': ['meow', 'bark', 'um... weasel noise?',
- 'nay']},
- index=range(4))
- result = merge(left, right, on='a', validate='1:1')
- assert_frame_equal(left, left_copy)
- assert_frame_equal(right, right_copy)
- assert_frame_equal(result, expected_2)
- result = merge(left, right, on='a', validate='one_to_one')
- assert_frame_equal(result, expected_2)
- # One index, one column
- expected_3 = DataFrame({'b': ['cat', 'dog', 'weasel', 'horse'],
- 'a': ['a', 'b', 'c', 'd'],
- 'c': ['meow', 'bark', 'um... weasel noise?',
- 'nay']},
- columns=['b', 'a', 'c'],
- index=range(4))
- left_index_reset = left.set_index('a')
- result = merge(left_index_reset, right, left_index=True,
- right_on='a', validate='one_to_one')
- assert_frame_equal(result, expected_3)
- # Dups on right
- right_w_dups = right.append(pd.DataFrame({'a': ['e'], 'c': ['moo']},
- index=[4]))
- merge(left, right_w_dups, left_index=True, right_index=True,
- validate='one_to_many')
- msg = ("Merge keys are not unique in right dataset; not a one-to-one"
- " merge")
- with pytest.raises(MergeError, match=msg):
- merge(left, right_w_dups, left_index=True, right_index=True,
- validate='one_to_one')
- with pytest.raises(MergeError, match=msg):
- merge(left, right_w_dups, on='a', validate='one_to_one')
- # Dups on left
- left_w_dups = left.append(pd.DataFrame({'a': ['a'], 'c': ['cow']},
- index=[3]), sort=True)
- merge(left_w_dups, right, left_index=True, right_index=True,
- validate='many_to_one')
- msg = ("Merge keys are not unique in left dataset; not a one-to-one"
- " merge")
- with pytest.raises(MergeError, match=msg):
- merge(left_w_dups, right, left_index=True, right_index=True,
- validate='one_to_one')
- with pytest.raises(MergeError, match=msg):
- merge(left_w_dups, right, on='a', validate='one_to_one')
- # Dups on both
- merge(left_w_dups, right_w_dups, on='a', validate='many_to_many')
- msg = ("Merge keys are not unique in right dataset; not a many-to-one"
- " merge")
- with pytest.raises(MergeError, match=msg):
- merge(left_w_dups, right_w_dups, left_index=True,
- right_index=True, validate='many_to_one')
- msg = ("Merge keys are not unique in left dataset; not a one-to-many"
- " merge")
- with pytest.raises(MergeError, match=msg):
- merge(left_w_dups, right_w_dups, on='a',
- validate='one_to_many')
- # Check invalid arguments
- msg = "Not a valid argument for validate"
- with pytest.raises(ValueError, match=msg):
- merge(left, right, on='a', validate='jibberish')
- # Two column merge, dups in both, but jointly no dups.
- left = DataFrame({'a': ['a', 'a', 'b', 'b'],
- 'b': [0, 1, 0, 1],
- 'c': ['cat', 'dog', 'weasel', 'horse']},
- index=range(4))
- right = DataFrame({'a': ['a', 'a', 'b'],
- 'b': [0, 1, 0],
- 'd': ['meow', 'bark', 'um... weasel noise?']},
- index=range(3))
- expected_multi = DataFrame({'a': ['a', 'a', 'b'],
- 'b': [0, 1, 0],
- 'c': ['cat', 'dog', 'weasel'],
- 'd': ['meow', 'bark',
- 'um... weasel noise?']},
- index=range(3))
- msg = ("Merge keys are not unique in either left or right dataset;"
- " not a one-to-one merge")
- with pytest.raises(MergeError, match=msg):
- merge(left, right, on='a', validate='1:1')
- result = merge(left, right, on=['a', 'b'], validate='1:1')
- assert_frame_equal(result, expected_multi)
- def test_merge_two_empty_df_no_division_error(self):
- # GH17776, PR #17846
- a = pd.DataFrame({'a': [], 'b': [], 'c': []})
- with np.errstate(divide='raise'):
- merge(a, a, on=('a', 'b'))
- @pytest.mark.parametrize('how', ['left', 'outer'])
- @pytest.mark.xfail(reason="GH-24897")
- def test_merge_on_index_with_more_values(self, how):
- # GH 24212
- # pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is
- # interpreted as a missing value instead of the last element
- df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]],
- columns=['a', 'b'])
- df2 = pd.DataFrame([[3, 30], [4, 40]],
- columns=['a', 'c'])
- df1.set_index('a', drop=False, inplace=True)
- df2.set_index('a', inplace=True)
- result = pd.merge(df1, df2, left_index=True, right_on='a', how=how)
- expected = pd.DataFrame([[1, 2, np.nan],
- [2, 4, np.nan],
- [3, 6, 30.0],
- [4, 8, 40.0]],
- columns=['a', 'b', 'c'])
- expected.set_index('a', drop=False, inplace=True)
- assert_frame_equal(result, expected)
- def test_merge_right_index_right(self):
- # Note: the expected output here is probably incorrect.
- # See https://github.com/pandas-dev/pandas/issues/17257 for more.
- # We include this as a regression test for GH-24897.
- left = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 1, 1]})
- right = pd.DataFrame({'b': [1, 2, 3]})
- expected = pd.DataFrame({'a': [1, 2, 3, None],
- 'key': [0, 1, 1, 2],
- 'b': [1, 2, 2, 3]},
- columns=['a', 'key', 'b'],
- index=[0, 1, 2, 2])
- result = left.merge(right, left_on='key', right_index=True,
- how='right')
- tm.assert_frame_equal(result, expected)
- def _check_merge(x, y):
- for how in ['inner', 'left', 'outer']:
- result = x.join(y, how=how)
- expected = merge(x.reset_index(), y.reset_index(), how=how,
- sort=True)
- expected = expected.set_index('index')
- # TODO check_names on merge?
- assert_frame_equal(result, expected, check_names=False)
- class TestMergeDtypes(object):
- @pytest.mark.parametrize('right_vals', [
- ['foo', 'bar'],
- Series(['foo', 'bar']).astype('category'),
- ])
- def test_different(self, right_vals):
- left = DataFrame({'A': ['foo', 'bar'],
- 'B': Series(['foo', 'bar']).astype('category'),
- 'C': [1, 2],
- 'D': [1.0, 2.0],
- 'E': Series([1, 2], dtype='uint64'),
- 'F': Series([1, 2], dtype='int32')})
- right = DataFrame({'A': right_vals})
- # GH 9780
- # We allow merging on object and categorical cols and cast
- # categorical cols to object
- result = pd.merge(left, right, on='A')
- assert is_object_dtype(result.A.dtype)
- @pytest.mark.parametrize('d1', [np.int64, np.int32,
- np.int16, np.int8, np.uint8])
- @pytest.mark.parametrize('d2', [np.int64, np.float64,
- np.float32, np.float16])
- def test_join_multi_dtypes(self, d1, d2):
- dtype1 = np.dtype(d1)
- dtype2 = np.dtype(d2)
- left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1),
- 'k2': ['foo', 'bar'] * 12,
- 'v': np.array(np.arange(24), dtype=np.int64)})
- index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
- right = DataFrame({'v2': np.array([5, 7], dtype=dtype2)}, index=index)
- result = left.join(right, on=['k1', 'k2'])
- expected = left.copy()
- if dtype2.kind == 'i':
- dtype2 = np.dtype('float64')
- expected['v2'] = np.array(np.nan, dtype=dtype2)
- expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
- expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7
- tm.assert_frame_equal(result, expected)
- result = left.join(right, on=['k1', 'k2'], sort=True)
- expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize('int_vals, float_vals, exp_vals', [
- ([1, 2, 3], [1.0, 2.0, 3.0], {'X': [1, 2, 3], 'Y': [1.0, 2.0, 3.0]}),
- ([1, 2, 3], [1.0, 3.0], {'X': [1, 3], 'Y': [1.0, 3.0]}),
- ([1, 2], [1.0, 2.0, 3.0], {'X': [1, 2], 'Y': [1.0, 2.0]}),
- ])
- def test_merge_on_ints_floats(self, int_vals, float_vals, exp_vals):
- # GH 16572
- # Check that float column is not cast to object if
- # merging on float and int columns
- A = DataFrame({'X': int_vals})
- B = DataFrame({'Y': float_vals})
- expected = DataFrame(exp_vals)
- result = A.merge(B, left_on='X', right_on='Y')
- assert_frame_equal(result, expected)
- result = B.merge(A, left_on='Y', right_on='X')
- assert_frame_equal(result, expected[['Y', 'X']])
- def test_merge_on_ints_floats_warning(self):
- # GH 16572
- # merge will produce a warning when merging on int and
- # float columns where the float values are not exactly
- # equal to their int representation
- A = DataFrame({'X': [1, 2, 3]})
- B = DataFrame({'Y': [1.1, 2.5, 3.0]})
- expected = DataFrame({'X': [3], 'Y': [3.0]})
- with tm.assert_produces_warning(UserWarning):
- result = A.merge(B, left_on='X', right_on='Y')
- assert_frame_equal(result, expected)
- with tm.assert_produces_warning(UserWarning):
- result = B.merge(A, left_on='Y', right_on='X')
- assert_frame_equal(result, expected[['Y', 'X']])
- # test no warning if float has NaNs
- B = DataFrame({'Y': [np.nan, np.nan, 3.0]})
- with tm.assert_produces_warning(None):
- result = B.merge(A, left_on='Y', right_on='X')
- assert_frame_equal(result, expected[['Y', 'X']])
- def test_merge_incompat_infer_boolean_object(self):
- # GH21119: bool + object bool merge OK
- df1 = DataFrame({'key': Series([True, False], dtype=object)})
- df2 = DataFrame({'key': [True, False]})
- expected = DataFrame({'key': [True, False]}, dtype=object)
- result = pd.merge(df1, df2, on='key')
- assert_frame_equal(result, expected)
- result = pd.merge(df2, df1, on='key')
- assert_frame_equal(result, expected)
- # with missing value
- df1 = DataFrame({'key': Series([True, False, np.nan], dtype=object)})
- df2 = DataFrame({'key': [True, False]})
- expected = DataFrame({'key': [True, False]}, dtype=object)
- result = pd.merge(df1, df2, on='key')
- assert_frame_equal(result, expected)
- result = pd.merge(df2, df1, on='key')
- assert_frame_equal(result, expected)
- @pytest.mark.parametrize('df1_vals, df2_vals', [
- # merge on category coerces to object
- ([0, 1, 2], Series(['a', 'b', 'a']).astype('category')),
- ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')),
- # no not infer
- ([0, 1], pd.Series([False, True], dtype=object)),
- ([0, 1], pd.Series([False, True], dtype=bool)),
- ])
- def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals):
- # these are explicity allowed incompat merges, that pass thru
- # the result type is dependent on if the values on the rhs are
- # inferred, otherwise these will be coereced to object
- df1 = DataFrame({'A': df1_vals})
- df2 = DataFrame({'A': df2_vals})
- result = pd.merge(df1, df2, on=['A'])
- assert is_object_dtype(result.A.dtype)
- result = pd.merge(df2, df1, on=['A'])
- assert is_object_dtype(result.A.dtype)
- @pytest.mark.parametrize('df1_vals, df2_vals', [
- # do not infer to numeric
- (Series([1, 2], dtype='uint64'), ["a", "b", "c"]),
- (Series([1, 2], dtype='int32'), ["a", "b", "c"]),
- ([0, 1, 2], ["0", "1", "2"]),
- ([0.0, 1.0, 2.0], ["0", "1", "2"]),
- ([0, 1, 2], [u"0", u"1", u"2"]),
- (pd.date_range('1/1/2011', periods=2, freq='D'), ['2011-01-01',
- '2011-01-02']),
- (pd.date_range('1/1/2011', periods=2, freq='D'), [0, 1]),
- (pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]),
- (pd.date_range('20130101', periods=3),
- pd.date_range('20130101', periods=3, tz='US/Eastern')),
- ])
- def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals):
- # GH 9780, GH 15800
- # Raise a ValueError when a user tries to merge on
- # dtypes that are incompatible (e.g., obj and int/float)
- df1 = DataFrame({'A': df1_vals})
- df2 = DataFrame({'A': df2_vals})
- msg = ("You are trying to merge on {lk_dtype} and "
- "{rk_dtype} columns. If you wish to proceed "
- "you should use pd.concat".format(lk_dtype=df1['A'].dtype,
- rk_dtype=df2['A'].dtype))
- msg = re.escape(msg)
- with pytest.raises(ValueError, match=msg):
- pd.merge(df1, df2, on=['A'])
- # Check that error still raised when swapping order of dataframes
- msg = ("You are trying to merge on {lk_dtype} and "
- "{rk_dtype} columns. If you wish to proceed "
- "you should use pd.concat".format(lk_dtype=df2['A'].dtype,
- rk_dtype=df1['A'].dtype))
- msg = re.escape(msg)
- with pytest.raises(ValueError, match=msg):
- pd.merge(df2, df1, on=['A'])
- @pytest.fixture
- def left():
- np.random.seed(1234)
- return DataFrame(
- {'X': Series(np.random.choice(
- ['foo', 'bar'],
- size=(10,))).astype(CDT(['foo', 'bar'])),
- 'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})
- @pytest.fixture
- def right():
- np.random.seed(1234)
- return DataFrame(
- {'X': Series(['foo', 'bar']).astype(CDT(['foo', 'bar'])),
- 'Z': [1, 2]})
- class TestMergeCategorical(object):
- def test_identical(self, left):
- # merging on the same, should preserve dtypes
- merged = pd.merge(left, left, on='X')
- result = merged.dtypes.sort_index()
- expected = Series([CategoricalDtype(),
- np.dtype('O'),
- np.dtype('O')],
- index=['X', 'Y_x', 'Y_y'])
- assert_series_equal(result, expected)
- def test_basic(self, left, right):
- # we have matching Categorical dtypes in X
- # so should preserve the merged column
- merged = pd.merge(left, right, on='X')
- result = merged.dtypes.sort_index()
- expected = Series([CategoricalDtype(),
- np.dtype('O'),
- np.dtype('int64')],
- index=['X', 'Y', 'Z'])
- assert_series_equal(result, expected)
- def test_merge_categorical(self):
- # GH 9426
- right = DataFrame({'c': {0: 'a',
- 1: 'b',
- 2: 'c',
- 3: 'd',
- 4: 'e'},
- 'd': {0: 'null',
- 1: 'null',
- 2: 'null',
- 3: 'null',
- 4: 'null'}})
- left = DataFrame({'a': {0: 'f',
- 1: 'f',
- 2: 'f',
- 3: 'f',
- 4: 'f'},
- 'b': {0: 'g',
- 1: 'g',
- 2: 'g',
- 3: 'g',
- 4: 'g'}})
- df = pd.merge(left, right, how='left', left_on='b', right_on='c')
- # object-object
- expected = df.copy()
- # object-cat
- # note that we propagate the category
- # because we don't have any matching rows
- cright = right.copy()
- cright['d'] = cright['d'].astype('category')
- result = pd.merge(left, cright, how='left', left_on='b', right_on='c')
- expected['d'] = expected['d'].astype(CategoricalDtype(['null']))
- tm.assert_frame_equal(result, expected)
- # cat-object
- cleft = left.copy()
- cleft['b'] = cleft['b'].astype('category')
- result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
- tm.assert_frame_equal(result, expected)
- # cat-cat
- cright = right.copy()
- cright['d'] = cright['d'].astype('category')
- cleft = left.copy()
- cleft['b'] = cleft['b'].astype('category')
- result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
- tm.assert_frame_equal(result, expected)
- def tests_merge_categorical_unordered_equal(self):
- # GH-19551
- df1 = DataFrame({
- 'Foo': Categorical(['A', 'B', 'C'], categories=['A', 'B', 'C']),
- 'Left': ['A0', 'B0', 'C0'],
- })
- df2 = DataFrame({
- 'Foo': Categorical(['C', 'B', 'A'], categories=['C', 'B', 'A']),
- 'Right': ['C1', 'B1', 'A1'],
- })
- result = pd.merge(df1, df2, on=['Foo'])
- expected = DataFrame({
- 'Foo': pd.Categorical(['A', 'B', 'C']),
- 'Left': ['A0', 'B0', 'C0'],
- 'Right': ['A1', 'B1', 'C1'],
- })
- assert_frame_equal(result, expected)
- def test_other_columns(self, left, right):
- # non-merge columns should preserve if possible
- right = right.assign(Z=right.Z.astype('category'))
- merged = pd.merge(left, right, on='X')
- result = merged.dtypes.sort_index()
- expected = Series([CategoricalDtype(),
- np.dtype('O'),
- CategoricalDtype()],
- index=['X', 'Y', 'Z'])
- assert_series_equal(result, expected)
- # categories are preserved
- assert left.X.values.is_dtype_equal(merged.X.values)
- assert right.Z.values.is_dtype_equal(merged.Z.values)
- @pytest.mark.parametrize(
- 'change', [lambda x: x,
- lambda x: x.astype(CDT(['foo', 'bar', 'bah'])),
- lambda x: x.astype(CDT(ordered=True))])
- def test_dtype_on_merged_different(self, change, join_type, left, right):
- # our merging columns, X now has 2 different dtypes
- # so we must be object as a result
- X = change(right.X.astype('object'))
- right = right.assign(X=X)
- assert is_categorical_dtype(left.X.values)
- # assert not left.X.values.is_dtype_equal(right.X.values)
- merged = pd.merge(left, right, on='X', how=join_type)
- result = merged.dtypes.sort_index()
- expected = Series([np.dtype('O'),
- np.dtype('O'),
- np.dtype('int64')],
- index=['X', 'Y', 'Z'])
- assert_series_equal(result, expected)
- def test_self_join_multiple_categories(self):
- # GH 16767
- # non-duplicates should work with multiple categories
- m = 5
- df = pd.DataFrame({
- 'a': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] * m,
- 'b': ['t', 'w', 'x', 'y', 'z'] * 2 * m,
- 'c': [letter
- for each in ['m', 'n', 'u', 'p', 'o']
- for letter in [each] * 2 * m],
- 'd': [letter
- for each in ['aa', 'bb', 'cc', 'dd', 'ee',
- 'ff', 'gg', 'hh', 'ii', 'jj']
- for letter in [each] * m]})
- # change them all to categorical variables
- df = df.apply(lambda x: x.astype('category'))
- # self-join should equal ourselves
- result = pd.merge(df, df, on=list(df.columns))
- assert_frame_equal(result, df)
- def test_dtype_on_categorical_dates(self):
- # GH 16900
- # dates should not be coerced to ints
- df = pd.DataFrame(
- [[date(2001, 1, 1), 1.1],
- [date(2001, 1, 2), 1.3]],
- columns=['date', 'num2']
- )
- df['date'] = df['date'].astype('category')
- df2 = pd.DataFrame(
- [[date(2001, 1, 1), 1.3],
- [date(2001, 1, 3), 1.4]],
- columns=['date', 'num4']
- )
- df2['date'] = df2['date'].astype('category')
- expected_outer = pd.DataFrame([
- [pd.Timestamp('2001-01-01'), 1.1, 1.3],
- [pd.Timestamp('2001-01-02'), 1.3, np.nan],
- [pd.Timestamp('2001-01-03'), np.nan, 1.4]],
- columns=['date', 'num2', 'num4']
- )
- result_outer = pd.merge(df, df2, how='outer', on=['date'])
- assert_frame_equal(result_outer, expected_outer)
- expected_inner = pd.DataFrame(
- [[pd.Timestamp('2001-01-01'), 1.1, 1.3]],
- columns=['date', 'num2', 'num4']
- )
- result_inner = pd.merge(df, df2, how='inner', on=['date'])
- assert_frame_equal(result_inner, expected_inner)
- @pytest.mark.parametrize('ordered', [True, False])
- @pytest.mark.parametrize('category_column,categories,expected_categories',
- [([False, True, True, False], [True, False],
- [True, False]),
- ([2, 1, 1, 2], [1, 2], [1, 2]),
- (['False', 'True', 'True', 'False'],
- ['True', 'False'], ['True', 'False'])])
- def test_merging_with_bool_or_int_cateorical_column(self, category_column,
- categories,
- expected_categories,
- ordered):
- # GH 17187
- # merging with a boolean/int categorical column
- df1 = pd.DataFrame({'id': [1, 2, 3, 4],
- 'cat': category_column})
- df1['cat'] = df1['cat'].astype(CDT(categories, ordered=ordered))
- df2 = pd.DataFrame({'id': [2, 4], 'num': [1, 9]})
- result = df1.merge(df2)
- expected = pd.DataFrame({'id': [2, 4], 'cat': expected_categories,
- 'num': [1, 9]})
- expected['cat'] = expected['cat'].astype(
- CDT(categories, ordered=ordered))
- assert_frame_equal(expected, result)
- def test_merge_on_int_array(self):
- # GH 23020
- df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
- 'B': 1})
- result = pd.merge(df, df, on='A')
- expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
- 'B_x': 1,
- 'B_y': 1})
- assert_frame_equal(result, expected)
- @pytest.fixture
- def left_df():
- return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0])
- @pytest.fixture
- def right_df():
- return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2])
- class TestMergeOnIndexes(object):
- @pytest.mark.parametrize(
- "how, sort, expected",
- [('inner', False, DataFrame({'a': [20, 10],
- 'b': [200, 100]},
- index=[2, 1])),
- ('inner', True, DataFrame({'a': [10, 20],
- 'b': [100, 200]},
- index=[1, 2])),
- ('left', False, DataFrame({'a': [20, 10, 0],
- 'b': [200, 100, np.nan]},
- index=[2, 1, 0])),
- ('left', True, DataFrame({'a': [0, 10, 20],
- 'b': [np.nan, 100, 200]},
- index=[0, 1, 2])),
- ('right', False, DataFrame({'a': [np.nan, 10, 20],
- 'b': [300, 100, 200]},
- index=[3, 1, 2])),
- ('right', True, DataFrame({'a': [10, 20, np.nan],
- 'b': [100, 200, 300]},
- index=[1, 2, 3])),
- ('outer', False, DataFrame({'a': [0, 10, 20, np.nan],
- 'b': [np.nan, 100, 200, 300]},
- index=[0, 1, 2, 3])),
- ('outer', True, DataFrame({'a': [0, 10, 20, np.nan],
- 'b': [np.nan, 100, 200, 300]},
- index=[0, 1, 2, 3]))])
- def test_merge_on_indexes(self, left_df, right_df, how, sort, expected):
- result = pd.merge(left_df, right_df,
- left_index=True,
- right_index=True,
- how=how,
- sort=sort)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- 'index', [
- CategoricalIndex(['A', 'B'], categories=['A', 'B'], name='index_col'),
- Float64Index([1.0, 2.0], name='index_col'),
- Int64Index([1, 2], name='index_col'),
- UInt64Index([1, 2], name='index_col'),
- RangeIndex(start=0, stop=2, name='index_col'),
- DatetimeIndex(["2018-01-01", "2018-01-02"], name='index_col'),
- ], ids=lambda x: type(x).__name__)
- def test_merge_index_types(index):
- # gh-20777
- # assert key access is consistent across index types
- left = DataFrame({"left_data": [1, 2]}, index=index)
- right = DataFrame({"right_data": [1.0, 2.0]}, index=index)
- result = left.merge(right, on=['index_col'])
- expected = DataFrame(
- OrderedDict([('left_data', [1, 2]), ('right_data', [1.0, 2.0])]),
- index=index)
- assert_frame_equal(result, expected)
- @pytest.mark.parametrize("on,left_on,right_on,left_index,right_index,nm", [
- (['outer', 'inner'], None, None, False, False, 'B'),
- (None, None, None, True, True, 'B'),
- (None, ['outer', 'inner'], None, False, True, 'B'),
- (None, None, ['outer', 'inner'], True, False, 'B'),
- (['outer', 'inner'], None, None, False, False, None),
- (None, None, None, True, True, None),
- (None, ['outer', 'inner'], None, False, True, None),
- (None, None, ['outer', 'inner'], True, False, None)])
- def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
- # GH 21220
- a = pd.DataFrame({"A": [1, 2, 3, 4]},
- index=pd.MultiIndex.from_product([['a', 'b'], [0, 1]],
- names=['outer', 'inner']))
- b = pd.Series([1, 2, 3, 4],
- index=pd.MultiIndex.from_product([['a', 'b'], [1, 2]],
- names=['outer', 'inner']), name=nm)
- expected = pd.DataFrame({"A": [2, 4], "B": [1, 3]},
- index=pd.MultiIndex.from_product([['a', 'b'], [1]],
- names=['outer', 'inner']))
- if nm is not None:
- result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on,
- left_index=left_index, right_index=right_index)
- tm.assert_frame_equal(result, expected)
- else:
- msg = "Cannot merge a Series without a name"
- with pytest.raises(ValueError, match=msg):
- result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on,
- left_index=left_index, right_index=right_index)
|