import numpy as np import pytest import pytz import pandas as pd from pandas import Timedelta, merge_asof, read_csv, to_datetime from pandas.core.reshape.merge import MergeError from pandas.util.testing import assert_frame_equal class TestAsOfMerge(object): def read_data(self, datapath, name, dedupe=False): path = datapath('reshape', 'merge', 'data', name) x = read_csv(path) if dedupe: x = (x.drop_duplicates(['time', 'ticker'], keep='last') .reset_index(drop=True) ) x.time = to_datetime(x.time) return x @pytest.fixture(autouse=True) def setup_method(self, datapath): self.trades = self.read_data(datapath, 'trades.csv') self.quotes = self.read_data(datapath, 'quotes.csv', dedupe=True) self.asof = self.read_data(datapath, 'asof.csv') self.tolerance = self.read_data(datapath, 'tolerance.csv') self.allow_exact_matches = self.read_data(datapath, 'allow_exact_matches.csv') self.allow_exact_matches_and_tolerance = self.read_data( datapath, 'allow_exact_matches_and_tolerance.csv') def test_examples1(self): """ doc-string examples """ left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']}) right = pd.DataFrame({'a': [1, 2, 3, 6, 7], 'right_val': [1, 2, 3, 6, 7]}) expected = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c'], 'right_val': [1, 3, 7]}) result = pd.merge_asof(left, right, on='a') assert_frame_equal(result, expected) def test_examples2(self): """ doc-string examples """ trades = pd.DataFrame({ 'time': pd.to_datetime(['20160525 13:30:00.023', '20160525 13:30:00.038', '20160525 13:30:00.048', '20160525 13:30:00.048', '20160525 13:30:00.048']), 'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'], 'price': [51.95, 51.95, 720.77, 720.92, 98.00], 'quantity': [75, 155, 100, 100, 100]}, columns=['time', 'ticker', 'price', 'quantity']) quotes = pd.DataFrame({ 'time': pd.to_datetime(['20160525 13:30:00.023', '20160525 13:30:00.023', '20160525 13:30:00.030', '20160525 13:30:00.041', '20160525 13:30:00.048', '20160525 13:30:00.049', '20160525 13:30:00.072', '20160525 13:30:00.075']), 'ticker': ['GOOG', 'MSFT', 'MSFT', 'MSFT', 'GOOG', 'AAPL', 'GOOG', 'MSFT'], 'bid': [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], 'ask': [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]}, columns=['time', 'ticker', 'bid', 'ask']) pd.merge_asof(trades, quotes, on='time', by='ticker') pd.merge_asof(trades, quotes, on='time', by='ticker', tolerance=pd.Timedelta('2ms')) expected = pd.DataFrame({ 'time': pd.to_datetime(['20160525 13:30:00.023', '20160525 13:30:00.038', '20160525 13:30:00.048', '20160525 13:30:00.048', '20160525 13:30:00.048']), 'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'], 'price': [51.95, 51.95, 720.77, 720.92, 98.00], 'quantity': [75, 155, 100, 100, 100], 'bid': [np.nan, 51.97, np.nan, np.nan, np.nan], 'ask': [np.nan, 51.98, np.nan, np.nan, np.nan]}, columns=['time', 'ticker', 'price', 'quantity', 'bid', 'ask']) result = pd.merge_asof(trades, quotes, on='time', by='ticker', tolerance=pd.Timedelta('10ms'), allow_exact_matches=False) assert_frame_equal(result, expected) def test_examples3(self): """ doc-string examples """ # GH14887 left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']}) right = pd.DataFrame({'a': [1, 2, 3, 6, 7], 'right_val': [1, 2, 3, 6, 7]}) expected = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c'], 'right_val': [1, 6, np.nan]}) result = pd.merge_asof(left, right, on='a', direction='forward') assert_frame_equal(result, expected) def test_examples4(self): """ doc-string examples """ # GH14887 left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']}) right = pd.DataFrame({'a': [1, 2, 3, 6, 7], 'right_val': [1, 2, 3, 6, 7]}) expected = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c'], 'right_val': [1, 6, 7]}) result = pd.merge_asof(left, right, on='a', direction='nearest') assert_frame_equal(result, expected) def test_basic(self): expected = self.asof trades = self.trades quotes = self.quotes result = merge_asof(trades, quotes, on='time', by='ticker') assert_frame_equal(result, expected) def test_basic_categorical(self): expected = self.asof trades = self.trades.copy() trades.ticker = trades.ticker.astype('category') quotes = self.quotes.copy() quotes.ticker = quotes.ticker.astype('category') expected.ticker = expected.ticker.astype('category') result = merge_asof(trades, quotes, on='time', by='ticker') assert_frame_equal(result, expected) def test_basic_left_index(self): # GH14253 expected = self.asof trades = self.trades.set_index('time') quotes = self.quotes result = merge_asof(trades, quotes, left_index=True, right_on='time', by='ticker') # left-only index uses right's index, oddly expected.index = result.index # time column appears after left's columns expected = expected[result.columns] assert_frame_equal(result, expected) def test_basic_right_index(self): expected = self.asof trades = self.trades quotes = self.quotes.set_index('time') result = merge_asof(trades, quotes, left_on='time', right_index=True, by='ticker') assert_frame_equal(result, expected) def test_basic_left_index_right_index(self): expected = self.asof.set_index('time') trades = self.trades.set_index('time') quotes = self.quotes.set_index('time') result = merge_asof(trades, quotes, left_index=True, right_index=True, by='ticker') assert_frame_equal(result, expected) def test_multi_index(self): # MultiIndex is prohibited trades = self.trades.set_index(['time', 'price']) quotes = self.quotes.set_index('time') with pytest.raises(MergeError): merge_asof(trades, quotes, left_index=True, right_index=True) trades = self.trades.set_index('time') quotes = self.quotes.set_index(['time', 'bid']) with pytest.raises(MergeError): merge_asof(trades, quotes, left_index=True, right_index=True) def test_on_and_index(self): # 'on' parameter and index together is prohibited trades = self.trades.set_index('time') quotes = self.quotes.set_index('time') with pytest.raises(MergeError): merge_asof(trades, quotes, left_on='price', left_index=True, right_index=True) trades = self.trades.set_index('time') quotes = self.quotes.set_index('time') with pytest.raises(MergeError): merge_asof(trades, quotes, right_on='bid', left_index=True, right_index=True) def test_basic_left_by_right_by(self): # GH14253 expected = self.asof trades = self.trades quotes = self.quotes result = merge_asof(trades, quotes, on='time', left_by='ticker', right_by='ticker') assert_frame_equal(result, expected) def test_missing_right_by(self): expected = self.asof trades = self.trades quotes = self.quotes q = quotes[quotes.ticker != 'MSFT'] result = merge_asof(trades, q, on='time', by='ticker') expected.loc[expected.ticker == 'MSFT', ['bid', 'ask']] = np.nan assert_frame_equal(result, expected) def test_multiby(self): # GH13936 trades = pd.DataFrame({ 'time': pd.to_datetime(['20160525 13:30:00.023', '20160525 13:30:00.023', '20160525 13:30:00.046', '20160525 13:30:00.048', '20160525 13:30:00.050']), 'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'], 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'], 'price': [51.95, 51.95, 720.77, 720.92, 98.00], 'quantity': [75, 155, 100, 100, 100]}, columns=['time', 'ticker', 'exch', 'price', 'quantity']) quotes = pd.DataFrame({ 'time': pd.to_datetime(['20160525 13:30:00.023', '20160525 13:30:00.023', '20160525 13:30:00.030', '20160525 13:30:00.041', '20160525 13:30:00.045', '20160525 13:30:00.049']), 'ticker': ['GOOG', 'MSFT', 'MSFT', 'MSFT', 'GOOG', 'AAPL'], 'exch': ['BATS', 'NSDQ', 'ARCA', 'ARCA', 'NSDQ', 'ARCA'], 'bid': [720.51, 51.95, 51.97, 51.99, 720.50, 97.99], 'ask': [720.92, 51.96, 51.98, 52.00, 720.93, 98.01]}, columns=['time', 'ticker', 'exch', 'bid', 'ask']) expected = pd.DataFrame({ 'time': pd.to_datetime(['20160525 13:30:00.023', '20160525 13:30:00.023', '20160525 13:30:00.046', '20160525 13:30:00.048', '20160525 13:30:00.050']), 'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'], 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'], 'price': [51.95, 51.95, 720.77, 720.92, 98.00], 'quantity': [75, 155, 100, 100, 100], 'bid': [np.nan, 51.95, 720.50, 720.51, np.nan], 'ask': [np.nan, 51.96, 720.93, 720.92, np.nan]}, columns=['time', 'ticker', 'exch', 'price', 'quantity', 'bid', 'ask']) result = pd.merge_asof(trades, quotes, on='time', by=['ticker', 'exch']) assert_frame_equal(result, expected) def test_multiby_heterogeneous_types(self): # GH13936 trades = pd.DataFrame({ 'time': pd.to_datetime(['20160525 13:30:00.023', '20160525 13:30:00.023', '20160525 13:30:00.046', '20160525 13:30:00.048', '20160525 13:30:00.050']), 'ticker': [0, 0, 1, 1, 2], 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'], 'price': [51.95, 51.95, 720.77, 720.92, 98.00], 'quantity': [75, 155, 100, 100, 100]}, columns=['time', 'ticker', 'exch', 'price', 'quantity']) quotes = pd.DataFrame({ 'time': pd.to_datetime(['20160525 13:30:00.023', '20160525 13:30:00.023', '20160525 13:30:00.030', '20160525 13:30:00.041', '20160525 13:30:00.045', '20160525 13:30:00.049']), 'ticker': [1, 0, 0, 0, 1, 2], 'exch': ['BATS', 'NSDQ', 'ARCA', 'ARCA', 'NSDQ', 'ARCA'], 'bid': [720.51, 51.95, 51.97, 51.99, 720.50, 97.99], 'ask': [720.92, 51.96, 51.98, 52.00, 720.93, 98.01]}, columns=['time', 'ticker', 'exch', 'bid', 'ask']) expected = pd.DataFrame({ 'time': pd.to_datetime(['20160525 13:30:00.023', '20160525 13:30:00.023', '20160525 13:30:00.046', '20160525 13:30:00.048', '20160525 13:30:00.050']), 'ticker': [0, 0, 1, 1, 2], 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'], 'price': [51.95, 51.95, 720.77, 720.92, 98.00], 'quantity': [75, 155, 100, 100, 100], 'bid': [np.nan, 51.95, 720.50, 720.51, np.nan], 'ask': [np.nan, 51.96, 720.93, 720.92, np.nan]}, columns=['time', 'ticker', 'exch', 'price', 'quantity', 'bid', 'ask']) result = pd.merge_asof(trades, quotes, on='time', by=['ticker', 'exch']) assert_frame_equal(result, expected) def test_multiby_indexed(self): # GH15676 left = pd.DataFrame([ [pd.to_datetime('20160602'), 1, 'a'], [pd.to_datetime('20160602'), 2, 'a'], [pd.to_datetime('20160603'), 1, 'b'], [pd.to_datetime('20160603'), 2, 'b']], columns=['time', 'k1', 'k2']).set_index('time') right = pd.DataFrame([ [pd.to_datetime('20160502'), 1, 'a', 1.0], [pd.to_datetime('20160502'), 2, 'a', 2.0], [pd.to_datetime('20160503'), 1, 'b', 3.0], [pd.to_datetime('20160503'), 2, 'b', 4.0]], columns=['time', 'k1', 'k2', 'value']).set_index('time') expected = pd.DataFrame([ [pd.to_datetime('20160602'), 1, 'a', 1.0], [pd.to_datetime('20160602'), 2, 'a', 2.0], [pd.to_datetime('20160603'), 1, 'b', 3.0], [pd.to_datetime('20160603'), 2, 'b', 4.0]], columns=['time', 'k1', 'k2', 'value']).set_index('time') result = pd.merge_asof(left, right, left_index=True, right_index=True, by=['k1', 'k2']) assert_frame_equal(expected, result) with pytest.raises(MergeError): pd.merge_asof(left, right, left_index=True, right_index=True, left_by=['k1', 'k2'], right_by=['k1']) def test_basic2(self, datapath): expected = self.read_data(datapath, 'asof2.csv') trades = self.read_data(datapath, 'trades2.csv') quotes = self.read_data(datapath, 'quotes2.csv', dedupe=True) result = merge_asof(trades, quotes, on='time', by='ticker') assert_frame_equal(result, expected) def test_basic_no_by(self): f = lambda x: x[x.ticker == 'MSFT'].drop('ticker', axis=1) \ .reset_index(drop=True) # just use a single ticker expected = f(self.asof) trades = f(self.trades) quotes = f(self.quotes) result = merge_asof(trades, quotes, on='time') assert_frame_equal(result, expected) def test_valid_join_keys(self): trades = self.trades quotes = self.quotes with pytest.raises(MergeError): merge_asof(trades, quotes, left_on='time', right_on='bid', by='ticker') with pytest.raises(MergeError): merge_asof(trades, quotes, on=['time', 'ticker'], by='ticker') with pytest.raises(MergeError): merge_asof(trades, quotes, by='ticker') def test_with_duplicates(self, datapath): q = pd.concat([self.quotes, self.quotes]).sort_values( ['time', 'ticker']).reset_index(drop=True) result = merge_asof(self.trades, q, on='time', by='ticker') expected = self.read_data(datapath, 'asof.csv') assert_frame_equal(result, expected) def test_with_duplicates_no_on(self): df1 = pd.DataFrame({'key': [1, 1, 3], 'left_val': [1, 2, 3]}) df2 = pd.DataFrame({'key': [1, 2, 2], 'right_val': [1, 2, 3]}) result = merge_asof(df1, df2, on='key') expected = pd.DataFrame({'key': [1, 1, 3], 'left_val': [1, 2, 3], 'right_val': [1, 1, 3]}) assert_frame_equal(result, expected) def test_valid_allow_exact_matches(self): trades = self.trades quotes = self.quotes with pytest.raises(MergeError): merge_asof(trades, quotes, on='time', by='ticker', allow_exact_matches='foo') def test_valid_tolerance(self): trades = self.trades quotes = self.quotes # dti merge_asof(trades, quotes, on='time', by='ticker', tolerance=Timedelta('1s')) # integer merge_asof(trades.reset_index(), quotes.reset_index(), on='index', by='ticker', tolerance=1) # incompat with pytest.raises(MergeError): merge_asof(trades, quotes, on='time', by='ticker', tolerance=1) # invalid with pytest.raises(MergeError): merge_asof(trades.reset_index(), quotes.reset_index(), on='index', by='ticker', tolerance=1.0) # invalid negative with pytest.raises(MergeError): merge_asof(trades, quotes, on='time', by='ticker', tolerance=-Timedelta('1s')) with pytest.raises(MergeError): merge_asof(trades.reset_index(), quotes.reset_index(), on='index', by='ticker', tolerance=-1) def test_non_sorted(self): trades = self.trades.sort_values('time', ascending=False) quotes = self.quotes.sort_values('time', ascending=False) # we require that we are already sorted on time & quotes assert not trades.time.is_monotonic assert not quotes.time.is_monotonic with pytest.raises(ValueError): merge_asof(trades, quotes, on='time', by='ticker') trades = self.trades.sort_values('time') assert trades.time.is_monotonic assert not quotes.time.is_monotonic with pytest.raises(ValueError): merge_asof(trades, quotes, on='time', by='ticker') quotes = self.quotes.sort_values('time') assert trades.time.is_monotonic assert quotes.time.is_monotonic # ok, though has dupes merge_asof(trades, self.quotes, on='time', by='ticker') def test_tolerance(self): trades = self.trades quotes = self.quotes result = merge_asof(trades, quotes, on='time', by='ticker', tolerance=Timedelta('1day')) expected = self.tolerance assert_frame_equal(result, expected) def test_tolerance_forward(self): # GH14887 left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']}) right = pd.DataFrame({'a': [1, 2, 3, 7, 11], 'right_val': [1, 2, 3, 7, 11]}) expected = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c'], 'right_val': [1, np.nan, 11]}) result = pd.merge_asof(left, right, on='a', direction='forward', tolerance=1) assert_frame_equal(result, expected) def test_tolerance_nearest(self): # GH14887 left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']}) right = pd.DataFrame({'a': [1, 2, 3, 7, 11], 'right_val': [1, 2, 3, 7, 11]}) expected = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c'], 'right_val': [1, np.nan, 11]}) result = pd.merge_asof(left, right, on='a', direction='nearest', tolerance=1) assert_frame_equal(result, expected) def test_tolerance_tz(self): # GH 14844 left = pd.DataFrame( {'date': pd.date_range(start=pd.to_datetime('2016-01-02'), freq='D', periods=5, tz=pytz.timezone('UTC')), 'value1': np.arange(5)}) right = pd.DataFrame( {'date': pd.date_range(start=pd.to_datetime('2016-01-01'), freq='D', periods=5, tz=pytz.timezone('UTC')), 'value2': list("ABCDE")}) result = pd.merge_asof(left, right, on='date', tolerance=pd.Timedelta('1 day')) expected = pd.DataFrame( {'date': pd.date_range(start=pd.to_datetime('2016-01-02'), freq='D', periods=5, tz=pytz.timezone('UTC')), 'value1': np.arange(5), 'value2': list("BCDEE")}) assert_frame_equal(result, expected) def test_tolerance_float(self): # GH22981 left = pd.DataFrame({'a': [1.1, 3.5, 10.9], 'left_val': ['a', 'b', 'c']}) right = pd.DataFrame({'a': [1.0, 2.5, 3.3, 7.5, 11.5], 'right_val': [1.0, 2.5, 3.3, 7.5, 11.5]}) expected = pd.DataFrame({'a': [1.1, 3.5, 10.9], 'left_val': ['a', 'b', 'c'], 'right_val': [1, 3.3, np.nan]}) result = pd.merge_asof(left, right, on='a', direction='nearest', tolerance=0.5) assert_frame_equal(result, expected) def test_index_tolerance(self): # GH 15135 expected = self.tolerance.set_index('time') trades = self.trades.set_index('time') quotes = self.quotes.set_index('time') result = pd.merge_asof(trades, quotes, left_index=True, right_index=True, by='ticker', tolerance=pd.Timedelta('1day')) assert_frame_equal(result, expected) def test_allow_exact_matches(self): result = merge_asof(self.trades, self.quotes, on='time', by='ticker', allow_exact_matches=False) expected = self.allow_exact_matches assert_frame_equal(result, expected) def test_allow_exact_matches_forward(self): # GH14887 left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']}) right = pd.DataFrame({'a': [1, 2, 3, 7, 11], 'right_val': [1, 2, 3, 7, 11]}) expected = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c'], 'right_val': [2, 7, 11]}) result = pd.merge_asof(left, right, on='a', direction='forward', allow_exact_matches=False) assert_frame_equal(result, expected) def test_allow_exact_matches_nearest(self): # GH14887 left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']}) right = pd.DataFrame({'a': [1, 2, 3, 7, 11], 'right_val': [1, 2, 3, 7, 11]}) expected = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c'], 'right_val': [2, 3, 11]}) result = pd.merge_asof(left, right, on='a', direction='nearest', allow_exact_matches=False) assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance(self): result = merge_asof(self.trades, self.quotes, on='time', by='ticker', tolerance=Timedelta('100ms'), allow_exact_matches=False) expected = self.allow_exact_matches_and_tolerance assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance2(self): # GH 13695 df1 = pd.DataFrame({ 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), 'username': ['bob']}) df2 = pd.DataFrame({ 'time': pd.to_datetime(['2016-07-15 13:30:00.000', '2016-07-15 13:30:00.030']), 'version': [1, 2]}) result = pd.merge_asof(df1, df2, on='time') expected = pd.DataFrame({ 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), 'username': ['bob'], 'version': [2]}) assert_frame_equal(result, expected) result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False) expected = pd.DataFrame({ 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), 'username': ['bob'], 'version': [1]}) assert_frame_equal(result, expected) result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False, tolerance=pd.Timedelta('10ms')) expected = pd.DataFrame({ 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), 'username': ['bob'], 'version': [np.nan]}) assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance3(self): # GH 13709 df1 = pd.DataFrame({ 'time': pd.to_datetime(['2016-07-15 13:30:00.030', '2016-07-15 13:30:00.030']), 'username': ['bob', 'charlie']}) df2 = pd.DataFrame({ 'time': pd.to_datetime(['2016-07-15 13:30:00.000', '2016-07-15 13:30:00.030']), 'version': [1, 2]}) result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False, tolerance=pd.Timedelta('10ms')) expected = pd.DataFrame({ 'time': pd.to_datetime(['2016-07-15 13:30:00.030', '2016-07-15 13:30:00.030']), 'username': ['bob', 'charlie'], 'version': [np.nan, np.nan]}) assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance_forward(self): # GH14887 left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']}) right = pd.DataFrame({'a': [1, 3, 4, 6, 11], 'right_val': [1, 3, 4, 6, 11]}) expected = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c'], 'right_val': [np.nan, 6, 11]}) result = pd.merge_asof(left, right, on='a', direction='forward', allow_exact_matches=False, tolerance=1) assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance_nearest(self): # GH14887 left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']}) right = pd.DataFrame({'a': [1, 3, 4, 6, 11], 'right_val': [1, 3, 4, 7, 11]}) expected = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c'], 'right_val': [np.nan, 4, 11]}) result = pd.merge_asof(left, right, on='a', direction='nearest', allow_exact_matches=False, tolerance=1) assert_frame_equal(result, expected) def test_forward_by(self): # GH14887 left = pd.DataFrame({'a': [1, 5, 10, 12, 15], 'b': ['X', 'X', 'Y', 'Z', 'Y'], 'left_val': ['a', 'b', 'c', 'd', 'e']}) right = pd.DataFrame({'a': [1, 6, 11, 15, 16], 'b': ['X', 'Z', 'Y', 'Z', 'Y'], 'right_val': [1, 6, 11, 15, 16]}) expected = pd.DataFrame({'a': [1, 5, 10, 12, 15], 'b': ['X', 'X', 'Y', 'Z', 'Y'], 'left_val': ['a', 'b', 'c', 'd', 'e'], 'right_val': [1, np.nan, 11, 15, 16]}) result = pd.merge_asof(left, right, on='a', by='b', direction='forward') assert_frame_equal(result, expected) def test_nearest_by(self): # GH14887 left = pd.DataFrame({'a': [1, 5, 10, 12, 15], 'b': ['X', 'X', 'Z', 'Z', 'Y'], 'left_val': ['a', 'b', 'c', 'd', 'e']}) right = pd.DataFrame({'a': [1, 6, 11, 15, 16], 'b': ['X', 'Z', 'Z', 'Z', 'Y'], 'right_val': [1, 6, 11, 15, 16]}) expected = pd.DataFrame({'a': [1, 5, 10, 12, 15], 'b': ['X', 'X', 'Z', 'Z', 'Y'], 'left_val': ['a', 'b', 'c', 'd', 'e'], 'right_val': [1, 1, 11, 11, 16]}) result = pd.merge_asof(left, right, on='a', by='b', direction='nearest') assert_frame_equal(result, expected) def test_by_int(self): # we specialize by type, so test that this is correct df1 = pd.DataFrame({ 'time': pd.to_datetime(['20160525 13:30:00.020', '20160525 13:30:00.030', '20160525 13:30:00.040', '20160525 13:30:00.050', '20160525 13:30:00.060']), 'key': [1, 2, 1, 3, 2], 'value1': [1.1, 1.2, 1.3, 1.4, 1.5]}, columns=['time', 'key', 'value1']) df2 = pd.DataFrame({ 'time': pd.to_datetime(['20160525 13:30:00.015', '20160525 13:30:00.020', '20160525 13:30:00.025', '20160525 13:30:00.035', '20160525 13:30:00.040', '20160525 13:30:00.055', '20160525 13:30:00.060', '20160525 13:30:00.065']), 'key': [2, 1, 1, 3, 2, 1, 2, 3], 'value2': [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8]}, columns=['time', 'key', 'value2']) result = pd.merge_asof(df1, df2, on='time', by='key') expected = pd.DataFrame({ 'time': pd.to_datetime(['20160525 13:30:00.020', '20160525 13:30:00.030', '20160525 13:30:00.040', '20160525 13:30:00.050', '20160525 13:30:00.060']), 'key': [1, 2, 1, 3, 2], 'value1': [1.1, 1.2, 1.3, 1.4, 1.5], 'value2': [2.2, 2.1, 2.3, 2.4, 2.7]}, columns=['time', 'key', 'value1', 'value2']) assert_frame_equal(result, expected) def test_on_float(self): # mimics how to determine the minimum-price variation df1 = pd.DataFrame({ 'price': [5.01, 0.0023, 25.13, 340.05, 30.78, 1040.90, 0.0078], 'symbol': list("ABCDEFG")}, columns=['symbol', 'price']) df2 = pd.DataFrame({ 'price': [0.0, 1.0, 100.0], 'mpv': [0.0001, 0.01, 0.05]}, columns=['price', 'mpv']) df1 = df1.sort_values('price').reset_index(drop=True) result = pd.merge_asof(df1, df2, on='price') expected = pd.DataFrame({ 'symbol': list("BGACEDF"), 'price': [0.0023, 0.0078, 5.01, 25.13, 30.78, 340.05, 1040.90], 'mpv': [0.0001, 0.0001, 0.01, 0.01, 0.01, 0.05, 0.05]}, columns=['symbol', 'price', 'mpv']) assert_frame_equal(result, expected) def test_on_specialized_type(self, any_real_dtype): # see gh-13936 dtype = np.dtype(any_real_dtype).type df1 = pd.DataFrame({ "value": [5, 2, 25, 100, 78, 120, 79], "symbol": list("ABCDEFG")}, columns=["symbol", "value"]) df1.value = dtype(df1.value) df2 = pd.DataFrame({ "value": [0, 80, 120, 125], "result": list("xyzw")}, columns=["value", "result"]) df2.value = dtype(df2.value) df1 = df1.sort_values("value").reset_index(drop=True) result = pd.merge_asof(df1, df2, on="value") expected = pd.DataFrame( {"symbol": list("BACEGDF"), "value": [2, 5, 25, 78, 79, 100, 120], "result": list("xxxxxyz") }, columns=["symbol", "value", "result"]) expected.value = dtype(expected.value) assert_frame_equal(result, expected) def test_on_specialized_type_by_int(self, any_real_dtype): # see gh-13936 dtype = np.dtype(any_real_dtype).type df1 = pd.DataFrame({ "value": [5, 2, 25, 100, 78, 120, 79], "key": [1, 2, 3, 2, 3, 1, 2], "symbol": list("ABCDEFG")}, columns=["symbol", "key", "value"]) df1.value = dtype(df1.value) df2 = pd.DataFrame({ "value": [0, 80, 120, 125], "key": [1, 2, 2, 3], "result": list("xyzw")}, columns=["value", "key", "result"]) df2.value = dtype(df2.value) df1 = df1.sort_values("value").reset_index(drop=True) result = pd.merge_asof(df1, df2, on="value", by="key") expected = pd.DataFrame({ "symbol": list("BACEGDF"), "key": [2, 1, 3, 3, 2, 2, 1], "value": [2, 5, 25, 78, 79, 100, 120], "result": [np.nan, "x", np.nan, np.nan, np.nan, "y", "x"]}, columns=["symbol", "key", "value", "result"]) expected.value = dtype(expected.value) assert_frame_equal(result, expected) def test_on_float_by_int(self): # type specialize both "by" and "on" parameters df1 = pd.DataFrame({ 'symbol': list("AAABBBCCC"), 'exch': [1, 2, 3, 1, 2, 3, 1, 2, 3], 'price': [3.26, 3.2599, 3.2598, 12.58, 12.59, 12.5, 378.15, 378.2, 378.25]}, columns=['symbol', 'exch', 'price']) df2 = pd.DataFrame({ 'exch': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'price': [0.0, 1.0, 100.0, 0.0, 5.0, 100.0, 0.0, 5.0, 1000.0], 'mpv': [0.0001, 0.01, 0.05, 0.0001, 0.01, 0.1, 0.0001, 0.25, 1.0]}, columns=['exch', 'price', 'mpv']) df1 = df1.sort_values('price').reset_index(drop=True) df2 = df2.sort_values('price').reset_index(drop=True) result = pd.merge_asof(df1, df2, on='price', by='exch') expected = pd.DataFrame({ 'symbol': list("AAABBBCCC"), 'exch': [3, 2, 1, 3, 1, 2, 1, 2, 3], 'price': [3.2598, 3.2599, 3.26, 12.5, 12.58, 12.59, 378.15, 378.2, 378.25], 'mpv': [0.0001, 0.0001, 0.01, 0.25, 0.01, 0.01, 0.05, 0.1, 0.25]}, columns=['symbol', 'exch', 'price', 'mpv']) assert_frame_equal(result, expected) def test_merge_datatype_error(self): """ Tests merge datatype mismatch error """ msg = r'merge keys \[0\] object and int64, must be the same type' left = pd.DataFrame({'left_val': [1, 5, 10], 'a': ['a', 'b', 'c']}) right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7], 'a': [1, 2, 3, 6, 7]}) with pytest.raises(MergeError, match=msg): merge_asof(left, right, on='a') @pytest.mark.parametrize('func', [lambda x: x, lambda x: to_datetime(x)], ids=['numeric', 'datetime']) @pytest.mark.parametrize('side', ['left', 'right']) def test_merge_on_nans(self, func, side): # GH 23189 msg = "Merge keys contain null values on {} side".format(side) nulls = func([1.0, 5.0, np.nan]) non_nulls = func([1.0, 5.0, 10.]) df_null = pd.DataFrame({'a': nulls, 'left_val': ['a', 'b', 'c']}) df = pd.DataFrame({'a': non_nulls, 'right_val': [1, 6, 11]}) with pytest.raises(ValueError, match=msg): if side == 'left': merge_asof(df_null, df, on='a') else: merge_asof(df, df_null, on='a') def test_merge_by_col_tz_aware(self): # GH 21184 left = pd.DataFrame( {'by_col': pd.DatetimeIndex(['2018-01-01']).tz_localize('UTC'), 'on_col': [2], 'values': ['a']}) right = pd.DataFrame( {'by_col': pd.DatetimeIndex(['2018-01-01']).tz_localize('UTC'), 'on_col': [1], 'values': ['b']}) result = pd.merge_asof(left, right, by='by_col', on='on_col') expected = pd.DataFrame([ [pd.Timestamp('2018-01-01', tz='UTC'), 2, 'a', 'b'] ], columns=['by_col', 'on_col', 'values_x', 'values_y']) assert_frame_equal(result, expected)