123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038 |
- import numpy as np
- import pytest
- import pytz
- import pandas as pd
- from pandas import Timedelta, merge_asof, read_csv, to_datetime
- from pandas.core.reshape.merge import MergeError
- from pandas.util.testing import assert_frame_equal
- class TestAsOfMerge(object):
- def read_data(self, datapath, name, dedupe=False):
- path = datapath('reshape', 'merge', 'data', name)
- x = read_csv(path)
- if dedupe:
- x = (x.drop_duplicates(['time', 'ticker'], keep='last')
- .reset_index(drop=True)
- )
- x.time = to_datetime(x.time)
- return x
- @pytest.fixture(autouse=True)
- def setup_method(self, datapath):
- self.trades = self.read_data(datapath, 'trades.csv')
- self.quotes = self.read_data(datapath, 'quotes.csv', dedupe=True)
- self.asof = self.read_data(datapath, 'asof.csv')
- self.tolerance = self.read_data(datapath, 'tolerance.csv')
- self.allow_exact_matches = self.read_data(datapath,
- 'allow_exact_matches.csv')
- self.allow_exact_matches_and_tolerance = self.read_data(
- datapath, 'allow_exact_matches_and_tolerance.csv')
- def test_examples1(self):
- """ doc-string examples """
- left = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c']})
- right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
- 'right_val': [1, 2, 3, 6, 7]})
- expected = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c'],
- 'right_val': [1, 3, 7]})
- result = pd.merge_asof(left, right, on='a')
- assert_frame_equal(result, expected)
- def test_examples2(self):
- """ doc-string examples """
- trades = pd.DataFrame({
- 'time': pd.to_datetime(['20160525 13:30:00.023',
- '20160525 13:30:00.038',
- '20160525 13:30:00.048',
- '20160525 13:30:00.048',
- '20160525 13:30:00.048']),
- 'ticker': ['MSFT', 'MSFT',
- 'GOOG', 'GOOG', 'AAPL'],
- 'price': [51.95, 51.95,
- 720.77, 720.92, 98.00],
- 'quantity': [75, 155,
- 100, 100, 100]},
- columns=['time', 'ticker', 'price', 'quantity'])
- quotes = pd.DataFrame({
- 'time': pd.to_datetime(['20160525 13:30:00.023',
- '20160525 13:30:00.023',
- '20160525 13:30:00.030',
- '20160525 13:30:00.041',
- '20160525 13:30:00.048',
- '20160525 13:30:00.049',
- '20160525 13:30:00.072',
- '20160525 13:30:00.075']),
- 'ticker': ['GOOG', 'MSFT', 'MSFT',
- 'MSFT', 'GOOG', 'AAPL', 'GOOG',
- 'MSFT'],
- 'bid': [720.50, 51.95, 51.97, 51.99,
- 720.50, 97.99, 720.50, 52.01],
- 'ask': [720.93, 51.96, 51.98, 52.00,
- 720.93, 98.01, 720.88, 52.03]},
- columns=['time', 'ticker', 'bid', 'ask'])
- pd.merge_asof(trades, quotes,
- on='time',
- by='ticker')
- pd.merge_asof(trades, quotes,
- on='time',
- by='ticker',
- tolerance=pd.Timedelta('2ms'))
- expected = pd.DataFrame({
- 'time': pd.to_datetime(['20160525 13:30:00.023',
- '20160525 13:30:00.038',
- '20160525 13:30:00.048',
- '20160525 13:30:00.048',
- '20160525 13:30:00.048']),
- 'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'],
- 'price': [51.95, 51.95,
- 720.77, 720.92, 98.00],
- 'quantity': [75, 155,
- 100, 100, 100],
- 'bid': [np.nan, 51.97, np.nan,
- np.nan, np.nan],
- 'ask': [np.nan, 51.98, np.nan,
- np.nan, np.nan]},
- columns=['time', 'ticker', 'price', 'quantity',
- 'bid', 'ask'])
- result = pd.merge_asof(trades, quotes,
- on='time',
- by='ticker',
- tolerance=pd.Timedelta('10ms'),
- allow_exact_matches=False)
- assert_frame_equal(result, expected)
- def test_examples3(self):
- """ doc-string examples """
- # GH14887
- left = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c']})
- right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
- 'right_val': [1, 2, 3, 6, 7]})
- expected = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c'],
- 'right_val': [1, 6, np.nan]})
- result = pd.merge_asof(left, right, on='a', direction='forward')
- assert_frame_equal(result, expected)
- def test_examples4(self):
- """ doc-string examples """
- # GH14887
- left = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c']})
- right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
- 'right_val': [1, 2, 3, 6, 7]})
- expected = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c'],
- 'right_val': [1, 6, 7]})
- result = pd.merge_asof(left, right, on='a', direction='nearest')
- assert_frame_equal(result, expected)
- def test_basic(self):
- expected = self.asof
- trades = self.trades
- quotes = self.quotes
- result = merge_asof(trades, quotes,
- on='time',
- by='ticker')
- assert_frame_equal(result, expected)
- def test_basic_categorical(self):
- expected = self.asof
- trades = self.trades.copy()
- trades.ticker = trades.ticker.astype('category')
- quotes = self.quotes.copy()
- quotes.ticker = quotes.ticker.astype('category')
- expected.ticker = expected.ticker.astype('category')
- result = merge_asof(trades, quotes,
- on='time',
- by='ticker')
- assert_frame_equal(result, expected)
- def test_basic_left_index(self):
- # GH14253
- expected = self.asof
- trades = self.trades.set_index('time')
- quotes = self.quotes
- result = merge_asof(trades, quotes,
- left_index=True,
- right_on='time',
- by='ticker')
- # left-only index uses right's index, oddly
- expected.index = result.index
- # time column appears after left's columns
- expected = expected[result.columns]
- assert_frame_equal(result, expected)
- def test_basic_right_index(self):
- expected = self.asof
- trades = self.trades
- quotes = self.quotes.set_index('time')
- result = merge_asof(trades, quotes,
- left_on='time',
- right_index=True,
- by='ticker')
- assert_frame_equal(result, expected)
- def test_basic_left_index_right_index(self):
- expected = self.asof.set_index('time')
- trades = self.trades.set_index('time')
- quotes = self.quotes.set_index('time')
- result = merge_asof(trades, quotes,
- left_index=True,
- right_index=True,
- by='ticker')
- assert_frame_equal(result, expected)
- def test_multi_index(self):
- # MultiIndex is prohibited
- trades = self.trades.set_index(['time', 'price'])
- quotes = self.quotes.set_index('time')
- with pytest.raises(MergeError):
- merge_asof(trades, quotes,
- left_index=True,
- right_index=True)
- trades = self.trades.set_index('time')
- quotes = self.quotes.set_index(['time', 'bid'])
- with pytest.raises(MergeError):
- merge_asof(trades, quotes,
- left_index=True,
- right_index=True)
- def test_on_and_index(self):
- # 'on' parameter and index together is prohibited
- trades = self.trades.set_index('time')
- quotes = self.quotes.set_index('time')
- with pytest.raises(MergeError):
- merge_asof(trades, quotes,
- left_on='price',
- left_index=True,
- right_index=True)
- trades = self.trades.set_index('time')
- quotes = self.quotes.set_index('time')
- with pytest.raises(MergeError):
- merge_asof(trades, quotes,
- right_on='bid',
- left_index=True,
- right_index=True)
- def test_basic_left_by_right_by(self):
- # GH14253
- expected = self.asof
- trades = self.trades
- quotes = self.quotes
- result = merge_asof(trades, quotes,
- on='time',
- left_by='ticker',
- right_by='ticker')
- assert_frame_equal(result, expected)
- def test_missing_right_by(self):
- expected = self.asof
- trades = self.trades
- quotes = self.quotes
- q = quotes[quotes.ticker != 'MSFT']
- result = merge_asof(trades, q,
- on='time',
- by='ticker')
- expected.loc[expected.ticker == 'MSFT', ['bid', 'ask']] = np.nan
- assert_frame_equal(result, expected)
- def test_multiby(self):
- # GH13936
- trades = pd.DataFrame({
- 'time': pd.to_datetime(['20160525 13:30:00.023',
- '20160525 13:30:00.023',
- '20160525 13:30:00.046',
- '20160525 13:30:00.048',
- '20160525 13:30:00.050']),
- 'ticker': ['MSFT', 'MSFT',
- 'GOOG', 'GOOG', 'AAPL'],
- 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
- 'price': [51.95, 51.95,
- 720.77, 720.92, 98.00],
- 'quantity': [75, 155,
- 100, 100, 100]},
- columns=['time', 'ticker', 'exch',
- 'price', 'quantity'])
- quotes = pd.DataFrame({
- 'time': pd.to_datetime(['20160525 13:30:00.023',
- '20160525 13:30:00.023',
- '20160525 13:30:00.030',
- '20160525 13:30:00.041',
- '20160525 13:30:00.045',
- '20160525 13:30:00.049']),
- 'ticker': ['GOOG', 'MSFT', 'MSFT',
- 'MSFT', 'GOOG', 'AAPL'],
- 'exch': ['BATS', 'NSDQ', 'ARCA', 'ARCA',
- 'NSDQ', 'ARCA'],
- 'bid': [720.51, 51.95, 51.97, 51.99,
- 720.50, 97.99],
- 'ask': [720.92, 51.96, 51.98, 52.00,
- 720.93, 98.01]},
- columns=['time', 'ticker', 'exch', 'bid', 'ask'])
- expected = pd.DataFrame({
- 'time': pd.to_datetime(['20160525 13:30:00.023',
- '20160525 13:30:00.023',
- '20160525 13:30:00.046',
- '20160525 13:30:00.048',
- '20160525 13:30:00.050']),
- 'ticker': ['MSFT', 'MSFT',
- 'GOOG', 'GOOG', 'AAPL'],
- 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
- 'price': [51.95, 51.95,
- 720.77, 720.92, 98.00],
- 'quantity': [75, 155,
- 100, 100, 100],
- 'bid': [np.nan, 51.95, 720.50, 720.51, np.nan],
- 'ask': [np.nan, 51.96, 720.93, 720.92, np.nan]},
- columns=['time', 'ticker', 'exch',
- 'price', 'quantity', 'bid', 'ask'])
- result = pd.merge_asof(trades, quotes, on='time',
- by=['ticker', 'exch'])
- assert_frame_equal(result, expected)
- def test_multiby_heterogeneous_types(self):
- # GH13936
- trades = pd.DataFrame({
- 'time': pd.to_datetime(['20160525 13:30:00.023',
- '20160525 13:30:00.023',
- '20160525 13:30:00.046',
- '20160525 13:30:00.048',
- '20160525 13:30:00.050']),
- 'ticker': [0, 0, 1, 1, 2],
- 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
- 'price': [51.95, 51.95,
- 720.77, 720.92, 98.00],
- 'quantity': [75, 155,
- 100, 100, 100]},
- columns=['time', 'ticker', 'exch',
- 'price', 'quantity'])
- quotes = pd.DataFrame({
- 'time': pd.to_datetime(['20160525 13:30:00.023',
- '20160525 13:30:00.023',
- '20160525 13:30:00.030',
- '20160525 13:30:00.041',
- '20160525 13:30:00.045',
- '20160525 13:30:00.049']),
- 'ticker': [1, 0, 0, 0, 1, 2],
- 'exch': ['BATS', 'NSDQ', 'ARCA', 'ARCA',
- 'NSDQ', 'ARCA'],
- 'bid': [720.51, 51.95, 51.97, 51.99,
- 720.50, 97.99],
- 'ask': [720.92, 51.96, 51.98, 52.00,
- 720.93, 98.01]},
- columns=['time', 'ticker', 'exch', 'bid', 'ask'])
- expected = pd.DataFrame({
- 'time': pd.to_datetime(['20160525 13:30:00.023',
- '20160525 13:30:00.023',
- '20160525 13:30:00.046',
- '20160525 13:30:00.048',
- '20160525 13:30:00.050']),
- 'ticker': [0, 0, 1, 1, 2],
- 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
- 'price': [51.95, 51.95,
- 720.77, 720.92, 98.00],
- 'quantity': [75, 155,
- 100, 100, 100],
- 'bid': [np.nan, 51.95, 720.50, 720.51, np.nan],
- 'ask': [np.nan, 51.96, 720.93, 720.92, np.nan]},
- columns=['time', 'ticker', 'exch',
- 'price', 'quantity', 'bid', 'ask'])
- result = pd.merge_asof(trades, quotes, on='time',
- by=['ticker', 'exch'])
- assert_frame_equal(result, expected)
- def test_multiby_indexed(self):
- # GH15676
- left = pd.DataFrame([
- [pd.to_datetime('20160602'), 1, 'a'],
- [pd.to_datetime('20160602'), 2, 'a'],
- [pd.to_datetime('20160603'), 1, 'b'],
- [pd.to_datetime('20160603'), 2, 'b']],
- columns=['time', 'k1', 'k2']).set_index('time')
- right = pd.DataFrame([
- [pd.to_datetime('20160502'), 1, 'a', 1.0],
- [pd.to_datetime('20160502'), 2, 'a', 2.0],
- [pd.to_datetime('20160503'), 1, 'b', 3.0],
- [pd.to_datetime('20160503'), 2, 'b', 4.0]],
- columns=['time', 'k1', 'k2', 'value']).set_index('time')
- expected = pd.DataFrame([
- [pd.to_datetime('20160602'), 1, 'a', 1.0],
- [pd.to_datetime('20160602'), 2, 'a', 2.0],
- [pd.to_datetime('20160603'), 1, 'b', 3.0],
- [pd.to_datetime('20160603'), 2, 'b', 4.0]],
- columns=['time', 'k1', 'k2', 'value']).set_index('time')
- result = pd.merge_asof(left,
- right,
- left_index=True,
- right_index=True,
- by=['k1', 'k2'])
- assert_frame_equal(expected, result)
- with pytest.raises(MergeError):
- pd.merge_asof(left, right, left_index=True, right_index=True,
- left_by=['k1', 'k2'], right_by=['k1'])
- def test_basic2(self, datapath):
- expected = self.read_data(datapath, 'asof2.csv')
- trades = self.read_data(datapath, 'trades2.csv')
- quotes = self.read_data(datapath, 'quotes2.csv', dedupe=True)
- result = merge_asof(trades, quotes,
- on='time',
- by='ticker')
- assert_frame_equal(result, expected)
- def test_basic_no_by(self):
- f = lambda x: x[x.ticker == 'MSFT'].drop('ticker', axis=1) \
- .reset_index(drop=True)
- # just use a single ticker
- expected = f(self.asof)
- trades = f(self.trades)
- quotes = f(self.quotes)
- result = merge_asof(trades, quotes,
- on='time')
- assert_frame_equal(result, expected)
- def test_valid_join_keys(self):
- trades = self.trades
- quotes = self.quotes
- with pytest.raises(MergeError):
- merge_asof(trades, quotes,
- left_on='time',
- right_on='bid',
- by='ticker')
- with pytest.raises(MergeError):
- merge_asof(trades, quotes,
- on=['time', 'ticker'],
- by='ticker')
- with pytest.raises(MergeError):
- merge_asof(trades, quotes,
- by='ticker')
- def test_with_duplicates(self, datapath):
- q = pd.concat([self.quotes, self.quotes]).sort_values(
- ['time', 'ticker']).reset_index(drop=True)
- result = merge_asof(self.trades, q,
- on='time',
- by='ticker')
- expected = self.read_data(datapath, 'asof.csv')
- assert_frame_equal(result, expected)
- def test_with_duplicates_no_on(self):
- df1 = pd.DataFrame({'key': [1, 1, 3],
- 'left_val': [1, 2, 3]})
- df2 = pd.DataFrame({'key': [1, 2, 2],
- 'right_val': [1, 2, 3]})
- result = merge_asof(df1, df2, on='key')
- expected = pd.DataFrame({'key': [1, 1, 3],
- 'left_val': [1, 2, 3],
- 'right_val': [1, 1, 3]})
- assert_frame_equal(result, expected)
- def test_valid_allow_exact_matches(self):
- trades = self.trades
- quotes = self.quotes
- with pytest.raises(MergeError):
- merge_asof(trades, quotes,
- on='time',
- by='ticker',
- allow_exact_matches='foo')
- def test_valid_tolerance(self):
- trades = self.trades
- quotes = self.quotes
- # dti
- merge_asof(trades, quotes,
- on='time',
- by='ticker',
- tolerance=Timedelta('1s'))
- # integer
- merge_asof(trades.reset_index(), quotes.reset_index(),
- on='index',
- by='ticker',
- tolerance=1)
- # incompat
- with pytest.raises(MergeError):
- merge_asof(trades, quotes,
- on='time',
- by='ticker',
- tolerance=1)
- # invalid
- with pytest.raises(MergeError):
- merge_asof(trades.reset_index(), quotes.reset_index(),
- on='index',
- by='ticker',
- tolerance=1.0)
- # invalid negative
- with pytest.raises(MergeError):
- merge_asof(trades, quotes,
- on='time',
- by='ticker',
- tolerance=-Timedelta('1s'))
- with pytest.raises(MergeError):
- merge_asof(trades.reset_index(), quotes.reset_index(),
- on='index',
- by='ticker',
- tolerance=-1)
- def test_non_sorted(self):
- trades = self.trades.sort_values('time', ascending=False)
- quotes = self.quotes.sort_values('time', ascending=False)
- # we require that we are already sorted on time & quotes
- assert not trades.time.is_monotonic
- assert not quotes.time.is_monotonic
- with pytest.raises(ValueError):
- merge_asof(trades, quotes,
- on='time',
- by='ticker')
- trades = self.trades.sort_values('time')
- assert trades.time.is_monotonic
- assert not quotes.time.is_monotonic
- with pytest.raises(ValueError):
- merge_asof(trades, quotes,
- on='time',
- by='ticker')
- quotes = self.quotes.sort_values('time')
- assert trades.time.is_monotonic
- assert quotes.time.is_monotonic
- # ok, though has dupes
- merge_asof(trades, self.quotes,
- on='time',
- by='ticker')
- def test_tolerance(self):
- trades = self.trades
- quotes = self.quotes
- result = merge_asof(trades, quotes,
- on='time',
- by='ticker',
- tolerance=Timedelta('1day'))
- expected = self.tolerance
- assert_frame_equal(result, expected)
- def test_tolerance_forward(self):
- # GH14887
- left = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c']})
- right = pd.DataFrame({'a': [1, 2, 3, 7, 11],
- 'right_val': [1, 2, 3, 7, 11]})
- expected = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c'],
- 'right_val': [1, np.nan, 11]})
- result = pd.merge_asof(left, right, on='a', direction='forward',
- tolerance=1)
- assert_frame_equal(result, expected)
- def test_tolerance_nearest(self):
- # GH14887
- left = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c']})
- right = pd.DataFrame({'a': [1, 2, 3, 7, 11],
- 'right_val': [1, 2, 3, 7, 11]})
- expected = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c'],
- 'right_val': [1, np.nan, 11]})
- result = pd.merge_asof(left, right, on='a', direction='nearest',
- tolerance=1)
- assert_frame_equal(result, expected)
- def test_tolerance_tz(self):
- # GH 14844
- left = pd.DataFrame(
- {'date': pd.date_range(start=pd.to_datetime('2016-01-02'),
- freq='D', periods=5,
- tz=pytz.timezone('UTC')),
- 'value1': np.arange(5)})
- right = pd.DataFrame(
- {'date': pd.date_range(start=pd.to_datetime('2016-01-01'),
- freq='D', periods=5,
- tz=pytz.timezone('UTC')),
- 'value2': list("ABCDE")})
- result = pd.merge_asof(left, right, on='date',
- tolerance=pd.Timedelta('1 day'))
- expected = pd.DataFrame(
- {'date': pd.date_range(start=pd.to_datetime('2016-01-02'),
- freq='D', periods=5,
- tz=pytz.timezone('UTC')),
- 'value1': np.arange(5),
- 'value2': list("BCDEE")})
- assert_frame_equal(result, expected)
- def test_tolerance_float(self):
- # GH22981
- left = pd.DataFrame({'a': [1.1, 3.5, 10.9],
- 'left_val': ['a', 'b', 'c']})
- right = pd.DataFrame({'a': [1.0, 2.5, 3.3, 7.5, 11.5],
- 'right_val': [1.0, 2.5, 3.3, 7.5, 11.5]})
- expected = pd.DataFrame({'a': [1.1, 3.5, 10.9],
- 'left_val': ['a', 'b', 'c'],
- 'right_val': [1, 3.3, np.nan]})
- result = pd.merge_asof(left, right, on='a', direction='nearest',
- tolerance=0.5)
- assert_frame_equal(result, expected)
- def test_index_tolerance(self):
- # GH 15135
- expected = self.tolerance.set_index('time')
- trades = self.trades.set_index('time')
- quotes = self.quotes.set_index('time')
- result = pd.merge_asof(trades, quotes,
- left_index=True,
- right_index=True,
- by='ticker',
- tolerance=pd.Timedelta('1day'))
- assert_frame_equal(result, expected)
- def test_allow_exact_matches(self):
- result = merge_asof(self.trades, self.quotes,
- on='time',
- by='ticker',
- allow_exact_matches=False)
- expected = self.allow_exact_matches
- assert_frame_equal(result, expected)
- def test_allow_exact_matches_forward(self):
- # GH14887
- left = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c']})
- right = pd.DataFrame({'a': [1, 2, 3, 7, 11],
- 'right_val': [1, 2, 3, 7, 11]})
- expected = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c'],
- 'right_val': [2, 7, 11]})
- result = pd.merge_asof(left, right, on='a', direction='forward',
- allow_exact_matches=False)
- assert_frame_equal(result, expected)
- def test_allow_exact_matches_nearest(self):
- # GH14887
- left = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c']})
- right = pd.DataFrame({'a': [1, 2, 3, 7, 11],
- 'right_val': [1, 2, 3, 7, 11]})
- expected = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c'],
- 'right_val': [2, 3, 11]})
- result = pd.merge_asof(left, right, on='a', direction='nearest',
- allow_exact_matches=False)
- assert_frame_equal(result, expected)
- def test_allow_exact_matches_and_tolerance(self):
- result = merge_asof(self.trades, self.quotes,
- on='time',
- by='ticker',
- tolerance=Timedelta('100ms'),
- allow_exact_matches=False)
- expected = self.allow_exact_matches_and_tolerance
- assert_frame_equal(result, expected)
- def test_allow_exact_matches_and_tolerance2(self):
- # GH 13695
- df1 = pd.DataFrame({
- 'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
- 'username': ['bob']})
- df2 = pd.DataFrame({
- 'time': pd.to_datetime(['2016-07-15 13:30:00.000',
- '2016-07-15 13:30:00.030']),
- 'version': [1, 2]})
- result = pd.merge_asof(df1, df2, on='time')
- expected = pd.DataFrame({
- 'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
- 'username': ['bob'],
- 'version': [2]})
- assert_frame_equal(result, expected)
- result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False)
- expected = pd.DataFrame({
- 'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
- 'username': ['bob'],
- 'version': [1]})
- assert_frame_equal(result, expected)
- result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False,
- tolerance=pd.Timedelta('10ms'))
- expected = pd.DataFrame({
- 'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
- 'username': ['bob'],
- 'version': [np.nan]})
- assert_frame_equal(result, expected)
- def test_allow_exact_matches_and_tolerance3(self):
- # GH 13709
- df1 = pd.DataFrame({
- 'time': pd.to_datetime(['2016-07-15 13:30:00.030',
- '2016-07-15 13:30:00.030']),
- 'username': ['bob', 'charlie']})
- df2 = pd.DataFrame({
- 'time': pd.to_datetime(['2016-07-15 13:30:00.000',
- '2016-07-15 13:30:00.030']),
- 'version': [1, 2]})
- result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False,
- tolerance=pd.Timedelta('10ms'))
- expected = pd.DataFrame({
- 'time': pd.to_datetime(['2016-07-15 13:30:00.030',
- '2016-07-15 13:30:00.030']),
- 'username': ['bob', 'charlie'],
- 'version': [np.nan, np.nan]})
- assert_frame_equal(result, expected)
- def test_allow_exact_matches_and_tolerance_forward(self):
- # GH14887
- left = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c']})
- right = pd.DataFrame({'a': [1, 3, 4, 6, 11],
- 'right_val': [1, 3, 4, 6, 11]})
- expected = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c'],
- 'right_val': [np.nan, 6, 11]})
- result = pd.merge_asof(left, right, on='a', direction='forward',
- allow_exact_matches=False, tolerance=1)
- assert_frame_equal(result, expected)
- def test_allow_exact_matches_and_tolerance_nearest(self):
- # GH14887
- left = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c']})
- right = pd.DataFrame({'a': [1, 3, 4, 6, 11],
- 'right_val': [1, 3, 4, 7, 11]})
- expected = pd.DataFrame({'a': [1, 5, 10],
- 'left_val': ['a', 'b', 'c'],
- 'right_val': [np.nan, 4, 11]})
- result = pd.merge_asof(left, right, on='a', direction='nearest',
- allow_exact_matches=False, tolerance=1)
- assert_frame_equal(result, expected)
- def test_forward_by(self):
- # GH14887
- left = pd.DataFrame({'a': [1, 5, 10, 12, 15],
- 'b': ['X', 'X', 'Y', 'Z', 'Y'],
- 'left_val': ['a', 'b', 'c', 'd', 'e']})
- right = pd.DataFrame({'a': [1, 6, 11, 15, 16],
- 'b': ['X', 'Z', 'Y', 'Z', 'Y'],
- 'right_val': [1, 6, 11, 15, 16]})
- expected = pd.DataFrame({'a': [1, 5, 10, 12, 15],
- 'b': ['X', 'X', 'Y', 'Z', 'Y'],
- 'left_val': ['a', 'b', 'c', 'd', 'e'],
- 'right_val': [1, np.nan, 11, 15, 16]})
- result = pd.merge_asof(left, right, on='a', by='b',
- direction='forward')
- assert_frame_equal(result, expected)
- def test_nearest_by(self):
- # GH14887
- left = pd.DataFrame({'a': [1, 5, 10, 12, 15],
- 'b': ['X', 'X', 'Z', 'Z', 'Y'],
- 'left_val': ['a', 'b', 'c', 'd', 'e']})
- right = pd.DataFrame({'a': [1, 6, 11, 15, 16],
- 'b': ['X', 'Z', 'Z', 'Z', 'Y'],
- 'right_val': [1, 6, 11, 15, 16]})
- expected = pd.DataFrame({'a': [1, 5, 10, 12, 15],
- 'b': ['X', 'X', 'Z', 'Z', 'Y'],
- 'left_val': ['a', 'b', 'c', 'd', 'e'],
- 'right_val': [1, 1, 11, 11, 16]})
- result = pd.merge_asof(left, right, on='a', by='b',
- direction='nearest')
- assert_frame_equal(result, expected)
- def test_by_int(self):
- # we specialize by type, so test that this is correct
- df1 = pd.DataFrame({
- 'time': pd.to_datetime(['20160525 13:30:00.020',
- '20160525 13:30:00.030',
- '20160525 13:30:00.040',
- '20160525 13:30:00.050',
- '20160525 13:30:00.060']),
- 'key': [1, 2, 1, 3, 2],
- 'value1': [1.1, 1.2, 1.3, 1.4, 1.5]},
- columns=['time', 'key', 'value1'])
- df2 = pd.DataFrame({
- 'time': pd.to_datetime(['20160525 13:30:00.015',
- '20160525 13:30:00.020',
- '20160525 13:30:00.025',
- '20160525 13:30:00.035',
- '20160525 13:30:00.040',
- '20160525 13:30:00.055',
- '20160525 13:30:00.060',
- '20160525 13:30:00.065']),
- 'key': [2, 1, 1, 3, 2, 1, 2, 3],
- 'value2': [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8]},
- columns=['time', 'key', 'value2'])
- result = pd.merge_asof(df1, df2, on='time', by='key')
- expected = pd.DataFrame({
- 'time': pd.to_datetime(['20160525 13:30:00.020',
- '20160525 13:30:00.030',
- '20160525 13:30:00.040',
- '20160525 13:30:00.050',
- '20160525 13:30:00.060']),
- 'key': [1, 2, 1, 3, 2],
- 'value1': [1.1, 1.2, 1.3, 1.4, 1.5],
- 'value2': [2.2, 2.1, 2.3, 2.4, 2.7]},
- columns=['time', 'key', 'value1', 'value2'])
- assert_frame_equal(result, expected)
- def test_on_float(self):
- # mimics how to determine the minimum-price variation
- df1 = pd.DataFrame({
- 'price': [5.01, 0.0023, 25.13, 340.05, 30.78, 1040.90, 0.0078],
- 'symbol': list("ABCDEFG")},
- columns=['symbol', 'price'])
- df2 = pd.DataFrame({
- 'price': [0.0, 1.0, 100.0],
- 'mpv': [0.0001, 0.01, 0.05]},
- columns=['price', 'mpv'])
- df1 = df1.sort_values('price').reset_index(drop=True)
- result = pd.merge_asof(df1, df2, on='price')
- expected = pd.DataFrame({
- 'symbol': list("BGACEDF"),
- 'price': [0.0023, 0.0078, 5.01, 25.13, 30.78, 340.05, 1040.90],
- 'mpv': [0.0001, 0.0001, 0.01, 0.01, 0.01, 0.05, 0.05]},
- columns=['symbol', 'price', 'mpv'])
- assert_frame_equal(result, expected)
- def test_on_specialized_type(self, any_real_dtype):
- # see gh-13936
- dtype = np.dtype(any_real_dtype).type
- df1 = pd.DataFrame({
- "value": [5, 2, 25, 100, 78, 120, 79],
- "symbol": list("ABCDEFG")},
- columns=["symbol", "value"])
- df1.value = dtype(df1.value)
- df2 = pd.DataFrame({
- "value": [0, 80, 120, 125],
- "result": list("xyzw")},
- columns=["value", "result"])
- df2.value = dtype(df2.value)
- df1 = df1.sort_values("value").reset_index(drop=True)
- result = pd.merge_asof(df1, df2, on="value")
- expected = pd.DataFrame(
- {"symbol": list("BACEGDF"),
- "value": [2, 5, 25, 78, 79, 100, 120],
- "result": list("xxxxxyz")
- }, columns=["symbol", "value", "result"])
- expected.value = dtype(expected.value)
- assert_frame_equal(result, expected)
- def test_on_specialized_type_by_int(self, any_real_dtype):
- # see gh-13936
- dtype = np.dtype(any_real_dtype).type
- df1 = pd.DataFrame({
- "value": [5, 2, 25, 100, 78, 120, 79],
- "key": [1, 2, 3, 2, 3, 1, 2],
- "symbol": list("ABCDEFG")},
- columns=["symbol", "key", "value"])
- df1.value = dtype(df1.value)
- df2 = pd.DataFrame({
- "value": [0, 80, 120, 125],
- "key": [1, 2, 2, 3],
- "result": list("xyzw")},
- columns=["value", "key", "result"])
- df2.value = dtype(df2.value)
- df1 = df1.sort_values("value").reset_index(drop=True)
- result = pd.merge_asof(df1, df2, on="value", by="key")
- expected = pd.DataFrame({
- "symbol": list("BACEGDF"),
- "key": [2, 1, 3, 3, 2, 2, 1],
- "value": [2, 5, 25, 78, 79, 100, 120],
- "result": [np.nan, "x", np.nan, np.nan, np.nan, "y", "x"]},
- columns=["symbol", "key", "value", "result"])
- expected.value = dtype(expected.value)
- assert_frame_equal(result, expected)
- def test_on_float_by_int(self):
- # type specialize both "by" and "on" parameters
- df1 = pd.DataFrame({
- 'symbol': list("AAABBBCCC"),
- 'exch': [1, 2, 3, 1, 2, 3, 1, 2, 3],
- 'price': [3.26, 3.2599, 3.2598, 12.58, 12.59,
- 12.5, 378.15, 378.2, 378.25]},
- columns=['symbol', 'exch', 'price'])
- df2 = pd.DataFrame({
- 'exch': [1, 1, 1, 2, 2, 2, 3, 3, 3],
- 'price': [0.0, 1.0, 100.0, 0.0, 5.0, 100.0, 0.0, 5.0, 1000.0],
- 'mpv': [0.0001, 0.01, 0.05, 0.0001, 0.01, 0.1, 0.0001, 0.25, 1.0]},
- columns=['exch', 'price', 'mpv'])
- df1 = df1.sort_values('price').reset_index(drop=True)
- df2 = df2.sort_values('price').reset_index(drop=True)
- result = pd.merge_asof(df1, df2, on='price', by='exch')
- expected = pd.DataFrame({
- 'symbol': list("AAABBBCCC"),
- 'exch': [3, 2, 1, 3, 1, 2, 1, 2, 3],
- 'price': [3.2598, 3.2599, 3.26, 12.5, 12.58,
- 12.59, 378.15, 378.2, 378.25],
- 'mpv': [0.0001, 0.0001, 0.01, 0.25, 0.01, 0.01, 0.05, 0.1, 0.25]},
- columns=['symbol', 'exch', 'price', 'mpv'])
- assert_frame_equal(result, expected)
- def test_merge_datatype_error(self):
- """ Tests merge datatype mismatch error """
- msg = r'merge keys \[0\] object and int64, must be the same type'
- left = pd.DataFrame({'left_val': [1, 5, 10],
- 'a': ['a', 'b', 'c']})
- right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7],
- 'a': [1, 2, 3, 6, 7]})
- with pytest.raises(MergeError, match=msg):
- merge_asof(left, right, on='a')
- @pytest.mark.parametrize('func', [lambda x: x, lambda x: to_datetime(x)],
- ids=['numeric', 'datetime'])
- @pytest.mark.parametrize('side', ['left', 'right'])
- def test_merge_on_nans(self, func, side):
- # GH 23189
- msg = "Merge keys contain null values on {} side".format(side)
- nulls = func([1.0, 5.0, np.nan])
- non_nulls = func([1.0, 5.0, 10.])
- df_null = pd.DataFrame({'a': nulls, 'left_val': ['a', 'b', 'c']})
- df = pd.DataFrame({'a': non_nulls, 'right_val': [1, 6, 11]})
- with pytest.raises(ValueError, match=msg):
- if side == 'left':
- merge_asof(df_null, df, on='a')
- else:
- merge_asof(df, df_null, on='a')
- def test_merge_by_col_tz_aware(self):
- # GH 21184
- left = pd.DataFrame(
- {'by_col': pd.DatetimeIndex(['2018-01-01']).tz_localize('UTC'),
- 'on_col': [2], 'values': ['a']})
- right = pd.DataFrame(
- {'by_col': pd.DatetimeIndex(['2018-01-01']).tz_localize('UTC'),
- 'on_col': [1], 'values': ['b']})
- result = pd.merge_asof(left, right, by='by_col', on='on_col')
- expected = pd.DataFrame([
- [pd.Timestamp('2018-01-01', tz='UTC'), 2, 'a', 'b']
- ], columns=['by_col', 'on_col', 'values_x', 'values_y'])
- assert_frame_equal(result, expected)
|