  1. """
  2. SQL-style merge routines
  3. """
  4. import copy
  5. import string
  6. import warnings
  7. import numpy as np
  8. from pandas._libs import hashtable as libhashtable, join as libjoin, lib
  9. import pandas.compat as compat
  10. from pandas.compat import filter, lzip, map, range, zip
  11. from pandas.errors import MergeError
  12. from pandas.util._decorators import Appender, Substitution
  13. from pandas.core.dtypes.common import (
  14. ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool,
  15. is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
  16. is_datetime64tz_dtype, is_datetimelike, is_dtype_equal,
  17. is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer,
  18. is_integer_dtype, is_list_like, is_number, is_numeric_dtype,
  19. is_object_dtype, needs_i8_conversion)
  20. from pandas.core.dtypes.missing import isnull, na_value_for_dtype
  21. from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta
  22. import pandas.core.algorithms as algos
  23. from pandas.core.arrays.categorical import _recode_for_categories
  24. import pandas.core.common as com
  25. from pandas.core.frame import _merge_doc
  26. from pandas.core.internals import (
  27. concatenate_block_managers, items_overlap_with_suffix)
  28. import pandas.core.sorting as sorting
  29. from pandas.core.sorting import is_int64_overflow_possible


@Substitution('\nleft : DataFrame')
@Appender(_merge_doc, indents=0)
def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
          left_index=False, right_index=False, sort=False,
          suffixes=('_x', '_y'), copy=True, indicator=False,
          validate=None):
    op = _MergeOperation(left, right, how=how, on=on, left_on=left_on,
                         right_on=right_on, left_index=left_index,
                         right_index=right_index, sort=sort, suffixes=suffixes,
                         copy=copy, indicator=indicator,
                         validate=validate)
    return op.get_result()


if __debug__:
    merge.__doc__ = _merge_doc % '\nleft : DataFrame'
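
# A minimal usage sketch (doctest-style comment; the canonical signature and
# examples live in ``_merge_doc``). The frame and column names below are made
# up for illustration only:
#
# >>> left = pd.DataFrame({'key': ['a', 'b'], 'lval': [1, 2]})
# >>> right = pd.DataFrame({'key': ['a', 'c'], 'rval': [3, 4]})
# >>> pd.merge(left, right, on='key', how='inner')
#   key  lval  rval
# 0   a     1     3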


def _groupby_and_merge(by, on, left, right, _merge_pieces,
                       check_duplicates=True):
    """
    groupby & merge; we are always performing a left-by type operation

    Parameters
    ----------
    by: field to group
    on: duplicates field
    left: left frame
    right: right frame
    _merge_pieces: function for merging
    check_duplicates: boolean, default True
        should we check & clean duplicates
    """
    pieces = []
    if not isinstance(by, (list, tuple)):
        by = [by]

    lby = left.groupby(by, sort=False)

    # if we can groupby the rhs
    # then we can get vastly better perf
    try:

        # we will check & remove duplicates if indicated
        if check_duplicates:
            if on is None:
                on = []
            elif not isinstance(on, (list, tuple)):
                on = [on]

            if right.duplicated(by + on).any():
                right = right.drop_duplicates(by + on, keep='last')
        rby = right.groupby(by, sort=False)
    except KeyError:
        rby = None

    for key, lhs in lby:

        if rby is None:
            rhs = right
        else:
            try:
                rhs = right.take(rby.indices[key])
            except KeyError:
                # key doesn't exist in right
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns
                                if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = _merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # TODO, should _merge_pieces do this?
        for k in by:
            try:
                if k in merged:
                    merged[k] = key
            except KeyError:
                pass

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby
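
# ``_groupby_and_merge`` drives ``merge_ordered(..., left_by=...)``. A minimal
# sketch of the calling convention (internal API; the frames are made up):
#
# >>> left = pd.DataFrame({'g': ['a', 'a', 'b'], 'k': [1, 2, 1]})
# >>> right = pd.DataFrame({'k': [1, 2], 'v': [10, 20]})
# >>> result, _ = _groupby_and_merge(
# ...     'g', 'k', left, right,
# ...     lambda lhs, rhs: lhs.merge(rhs, on='k', how='left'))
# >>> result
#    g  k   v
# 0  a  1  10
# 1  a  2  20
# 2  b  1  10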


def merge_ordered(left, right, on=None,
                  left_on=None, right_on=None,
                  left_by=None, right_by=None,
                  fill_method=None, suffixes=('_x', '_y'),
                  how='outer'):
    """Perform merge with optional filling/interpolation designed for ordered
    data like time series data. Optionally perform group-wise merge (see
    examples)

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors
        per left_on docs
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame
    fill_method : {'ffill', None}, default None
        Interpolation method for data
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively
    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join)

        .. versionadded:: 0.19.0

    Returns
    -------
    merged : DataFrame
        The output type will be the same as 'left', if it is a subclass
        of DataFrame.

    See Also
    --------
    merge
    merge_asof

    Examples
    --------
    >>> A                      >>> B
          key  lvalue group        key  rvalue
    0   a       1     a        0   b       1
    1   c       2     a        1   c       2
    2   e       3     a        2   d       3
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> merge_ordered(A, B, fill_method='ffill', left_by='group')
      group key  lvalue  rvalue
    0     a   a       1     NaN
    1     a   b       1     1.0
    2     a   c       2     2.0
    3     a   d       2     3.0
    4     a   e       3     3.0
    5     b   a       1     NaN
    6     b   b       1     1.0
    7     b   c       2     2.0
    8     b   d       2     3.0
    9     b   e       3     3.0
    """
    def _merger(x, y):
        # perform the ordered merge operation
        op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on,
                           suffixes=suffixes, fill_method=fill_method,
                           how=how)
        return op.get_result()

    if left_by is not None and right_by is not None:
        raise ValueError('Can only group either left or right frames')
    elif left_by is not None:
        result, _ = _groupby_and_merge(left_by, on, left, right,
                                       lambda x, y: _merger(x, y),
                                       check_duplicates=False)
    elif right_by is not None:
        result, _ = _groupby_and_merge(right_by, on, right, left,
                                       lambda x, y: _merger(y, x),
                                       check_duplicates=False)
    else:
        result = _merger(left, right)
    return result


def merge_asof(left, right, on=None,
               left_on=None, right_on=None,
               left_index=False, right_index=False,
               by=None, left_by=None, right_by=None,
               suffixes=('_x', '_y'),
               tolerance=None,
               allow_exact_matches=True,
               direction='backward'):
    """Perform an asof merge. This is similar to a left-join except that we
    match on nearest key rather than equal keys.

    Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

      - A "backward" search selects the last row in the right DataFrame whose
        'on' key is less than or equal to the left's key.

      - A "forward" search selects the first row in the right DataFrame whose
        'on' key is greater than or equal to the left's key.

      - A "nearest" search selects the row in the right DataFrame whose 'on'
        key is closest in absolute distance to the left's key.

    The default is "backward", which is the only behavior available in
    versions below 0.20.0. The direction parameter was added in version
    0.20.0 and introduces "forward" and "nearest".

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore this must be a numeric column,
        such as datetimelike, integer, or float. On or left_on/right_on
        must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : boolean
        Use the index of the left DataFrame as the join key.

        .. versionadded:: 0.19.2

    right_index : boolean
        Use the index of the right DataFrame as the join key.

        .. versionadded:: 0.19.2

    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.

        .. versionadded:: 0.19.2

    right_by : column name
        Field names to match on in the right DataFrame.

        .. versionadded:: 0.19.2

    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : integer or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : boolean, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than)

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

        .. versionadded:: 0.20.0

    Returns
    -------
    merged : DataFrame

    See Also
    --------
    merge
    merge_ordered

    Examples
    --------
    >>> left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
    ...                       'right_val': [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on='a')
        a left_val  right_val
    0   1        a          1
    1   5        b          3
    2  10        c          7

    >>> pd.merge_asof(left, right, on='a', allow_exact_matches=False)
        a left_val  right_val
    0   1        a        NaN
    1   5        b        3.0
    2  10        c        7.0

    >>> pd.merge_asof(left, right, on='a', direction='forward')
        a left_val  right_val
    0   1        a        1.0
    1   5        b        6.0
    2  10        c        NaN

    >>> pd.merge_asof(left, right, on='a', direction='nearest')
        a left_val  right_val
    0   1        a          1
    1   5        b          6
    2  10        c          7

    We can use indexed DataFrames as well.

    >>> left = pd.DataFrame({'left_val': ['a', 'b', 'c']}, index=[1, 5, 10])
    >>> left
       left_val
    1         a
    5         b
    10        c

    >>> right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7]},
    ...                      index=[1, 2, 3, 6, 7])
    >>> right
       right_val
    1          1
    2          2
    3          3
    6          6
    7          7

    >>> pd.merge_asof(left, right, left_index=True, right_index=True)
       left_val  right_val
    1         a          1
    5         b          3
    10        c          7

    Here is a real-world time-series example

    >>> quotes
                         time ticker     bid     ask
    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03

    >>> trades
                         time ticker   price  quantity
    0 2016-05-25 13:30:00.023   MSFT   51.95        75
    1 2016-05-25 13:30:00.038   MSFT   51.95       155
    2 2016-05-25 13:30:00.048   GOOG  720.77       100
    3 2016-05-25 13:30:00.048   GOOG  720.92       100
    4 2016-05-25 13:30:00.048   AAPL   98.00       100

    By default we are taking the asof of the quotes

    >>> pd.merge_asof(trades, quotes,
    ...               on='time',
    ...               by='ticker')
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 2ms between the quote time and the trade time

    >>> pd.merge_asof(trades, quotes,
    ...               on='time',
    ...               by='ticker',
    ...               tolerance=pd.Timedelta('2ms'))
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 10ms between the quote time and the trade time
    and we exclude exact matches on time. However *prior* data will
    propagate forward

    >>> pd.merge_asof(trades, quotes,
    ...               on='time',
    ...               by='ticker',
    ...               tolerance=pd.Timedelta('10ms'),
    ...               allow_exact_matches=False)
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75     NaN     NaN
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100     NaN     NaN
    3 2016-05-25 13:30:00.048   GOOG  720.92       100     NaN     NaN
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN
    """
    op = _AsOfMerge(left, right,
                    on=on, left_on=left_on, right_on=right_on,
                    left_index=left_index, right_index=right_index,
                    by=by, left_by=left_by, right_by=right_by,
                    suffixes=suffixes,
                    how='asof', tolerance=tolerance,
                    allow_exact_matches=allow_exact_matches,
                    direction=direction)
    return op.get_result()


# TODO: transformations??
# TODO: only copy DataFrames when modification necessary
class _MergeOperation(object):
    """
    Perform a database (SQL) merge operation between two DataFrame objects
    using either columns as keys or their row indexes
    """
    _merge_type = 'merge'

    def __init__(self, left, right, how='inner', on=None,
                 left_on=None, right_on=None, axis=1,
                 left_index=False, right_index=False, sort=True,
                 suffixes=('_x', '_y'), copy=True, indicator=False,
                 validate=None):
        left = validate_operand(left)
        right = validate_operand(right)
        self.left = self.orig_left = left
        self.right = self.orig_right = right
        self.how = how
        self.axis = axis

        self.on = com.maybe_make_list(on)
        self.left_on = com.maybe_make_list(left_on)
        self.right_on = com.maybe_make_list(right_on)

        self.copy = copy
        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        if isinstance(self.indicator, compat.string_types):
            self.indicator_name = self.indicator
        elif isinstance(self.indicator, bool):
            self.indicator_name = '_merge' if self.indicator else None
        else:
            raise ValueError(
                'indicator option can only accept boolean or string arguments')

        if not is_bool(left_index):
            raise ValueError(
                'left_index parameter must be of type bool, not '
                '{left_index}'.format(left_index=type(left_index)))
        if not is_bool(right_index):
            raise ValueError(
                'right_index parameter must be of type bool, not '
                '{right_index}'.format(right_index=type(right_index)))

        # warn user when merging between different levels
        if left.columns.nlevels != right.columns.nlevels:
            msg = ('merging between different levels can give an unintended '
                   'result ({left} levels on the left, {right} on the right)'
                   ).format(left=left.columns.nlevels,
                            right=right.columns.nlevels)
            warnings.warn(msg, UserWarning)

        self._validate_specification()

        # note this function has side effects
        (self.left_join_keys,
         self.right_join_keys,
         self.join_names) = self._get_merge_keys()

        # validate the merge keys dtypes. We may need to coerce
        # to avoid incompat dtypes
        self._maybe_coerce_merge_keys()

        # If argument passed to validate,
        # check if columns specified as unique
        # are in fact unique.
        if validate is not None:
            self._validate(validate)
    def get_result(self):
        if self.indicator:
            self.left, self.right = self._indicator_pre_merge(
                self.left, self.right)

        join_index, left_indexer, right_indexer = self._get_join_info()

        ldata, rdata = self.left._data, self.right._data
        lsuf, rsuf = self.suffixes

        llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf,
                                                     rdata.items, rsuf)

        lindexers = {1: left_indexer} if left_indexer is not None else {}
        rindexers = {1: right_indexer} if right_indexer is not None else {}

        result_data = concatenate_block_managers(
            [(ldata, lindexers), (rdata, rindexers)],
            axes=[llabels.append(rlabels), join_index],
            concat_axis=0, copy=self.copy)

        typ = self.left._constructor
        result = typ(result_data).__finalize__(self, method=self._merge_type)

        if self.indicator:
            result = self._indicator_post_merge(result)

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        self._maybe_restore_index_levels(result)

        return result
    def _indicator_pre_merge(self, left, right):

        columns = left.columns.union(right.columns)

        for i in ['_left_indicator', '_right_indicator']:
            if i in columns:
                raise ValueError("Cannot use `indicator=True` option when "
                                 "data contains a column named {name}"
                                 .format(name=i))
        if self.indicator_name in columns:
            raise ValueError(
                "Cannot use name of an existing column for indicator column")

        left = left.copy()
        right = right.copy()

        left['_left_indicator'] = 1
        left['_left_indicator'] = left['_left_indicator'].astype('int8')

        right['_right_indicator'] = 2
        right['_right_indicator'] = right['_right_indicator'].astype('int8')

        return left, right

    def _indicator_post_merge(self, result):

        result['_left_indicator'] = result['_left_indicator'].fillna(0)
        result['_right_indicator'] = result['_right_indicator'].fillna(0)

        result[self.indicator_name] = Categorical((result['_left_indicator'] +
                                                   result['_right_indicator']),
                                                  categories=[1, 2, 3])
        result[self.indicator_name] = (
            result[self.indicator_name]
            .cat.rename_categories(['left_only', 'right_only', 'both']))

        result = result.drop(labels=['_left_indicator', '_right_indicator'],
                             axis=1)
        return result
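
    # The two helpers above implement ``indicator=True``: left rows carry 1,
    # right rows carry 2, so after the merge the sum 1/2/3 maps onto
    # 'left_only'/'right_only'/'both'. A minimal sketch (illustrative frames):
    #
    # >>> l = pd.DataFrame({'key': ['a', 'b'], 'x': [1, 2]})
    # >>> r = pd.DataFrame({'key': ['b', 'c'], 'y': [3, 4]})
    # >>> pd.merge(l, r, on='key', how='outer', indicator=True)['_merge']
    # 0     left_only
    # 1          both
    # 2    right_only
    # Name: _merge, dtype: category
    # Categories (3, object): [left_only, right_only, both]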
    def _maybe_restore_index_levels(self, result):
        """
        Restore index levels specified as `on` parameters

        Here we check for cases where `self.left_on` and `self.right_on` pairs
        each reference an index level in their respective DataFrames. The
        joined columns corresponding to these pairs are then restored to the
        index of `result`.

        **Note:** This method has side effects. It modifies `result` in-place

        Parameters
        ----------
        result: DataFrame
            merge result

        Returns
        -------
        None
        """
        names_to_restore = []
        for name, left_key, right_key in zip(self.join_names,
                                             self.left_on,
                                             self.right_on):
            if (self.orig_left._is_level_reference(left_key) and
                    self.orig_right._is_level_reference(right_key) and
                    name not in result.index.names):

                names_to_restore.append(name)

        if names_to_restore:
            result.set_index(names_to_restore, inplace=True)
    def _maybe_add_join_keys(self, result, left_indexer, right_indexer):

        left_has_missing = None
        right_has_missing = None

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            take_left, take_right = None, None

            if name in result:

                if left_indexer is not None and right_indexer is not None:
                    if name in self.left:

                        if left_has_missing is None:
                            left_has_missing = (left_indexer == -1).any()

                        if left_has_missing:
                            take_right = self.right_join_keys[i]

                            if not is_dtype_equal(result[name].dtype,
                                                  self.left[name].dtype):
                                take_left = self.left[name]._values

                    elif name in self.right:

                        if right_has_missing is None:
                            right_has_missing = (right_indexer == -1).any()

                        if right_has_missing:
                            take_left = self.left_join_keys[i]

                            if not is_dtype_equal(result[name].dtype,
                                                  self.right[name].dtype):
                                take_right = self.right[name]._values

            elif left_indexer is not None \
                    and is_array_like(self.left_join_keys[i]):
                take_left = self.left_join_keys[i]
                take_right = self.right_join_keys[i]

            if take_left is not None or take_right is not None:

                if take_left is None:
                    lvals = result[name]._values
                else:
                    lfill = na_value_for_dtype(take_left.dtype)
                    lvals = algos.take_1d(take_left, left_indexer,
                                          fill_value=lfill)

                if take_right is None:
                    rvals = result[name]._values
                else:
                    rfill = na_value_for_dtype(take_right.dtype)
                    rvals = algos.take_1d(take_right, right_indexer,
                                          fill_value=rfill)

                # if we have an all missing left_indexer
                # make sure to just use the right values
                mask = left_indexer == -1
                if mask.all():
                    key_col = rvals
                else:
                    key_col = Index(lvals).where(~mask, rvals)

                if result._is_label_reference(name):
                    result[name] = key_col
                elif result._is_level_reference(name):
                    if isinstance(result.index, MultiIndex):
                        key_col.name = name
                        idx_list = [result.index.get_level_values(level_name)
                                    if level_name != name else key_col
                                    for level_name in result.index.names]

                        result.set_index(idx_list, inplace=True)
                    else:
                        result.index = Index(key_col, name=name)
                else:
                    result.insert(i, name or 'key_{i}'.format(i=i), key_col)
    def _get_join_indexers(self):
        """ return the join indexers """
        return _get_join_indexers(self.left_join_keys,
                                  self.right_join_keys,
                                  sort=self.sort,
                                  how=self.how)
    def _get_join_info(self):
        left_ax = self.left._data.axes[self.axis]
        right_ax = self.right._data.axes[self.axis]

        if self.left_index and self.right_index and self.how != 'asof':
            join_index, left_indexer, right_indexer = \
                left_ax.join(right_ax, how=self.how, return_indexers=True,
                             sort=self.sort)
        elif self.right_index and self.how == 'left':
            join_index, left_indexer, right_indexer = \
                _left_join_on_index(left_ax, right_ax, self.left_join_keys,
                                    sort=self.sort)

        elif self.left_index and self.how == 'right':
            join_index, right_indexer, left_indexer = \
                _left_join_on_index(right_ax, left_ax, self.right_join_keys,
                                    sort=self.sort)
        else:
            (left_indexer,
             right_indexer) = self._get_join_indexers()

            if self.right_index:
                if len(self.left) > 0:
                    join_index = self.left.index.take(left_indexer)
                else:
                    join_index = self.right.index.take(right_indexer)
                    left_indexer = np.array([-1] * len(join_index))
            elif self.left_index:
                if len(self.right) > 0:
                    join_index = self.right.index.take(right_indexer)
                else:
                    join_index = self.left.index.take(left_indexer)
                    right_indexer = np.array([-1] * len(join_index))
            else:
                join_index = Index(np.arange(len(left_indexer)))

        if len(join_index) == 0:
            join_index = join_index.astype(object)
        return join_index, left_indexer, right_indexer
    def _get_merge_keys(self):
        """
        Note: has side effects (copy/delete key columns)

        Parameters
        ----------
        left
        right
        on

        Returns
        -------
        left_keys, right_keys
        """
        left_keys = []
        right_keys = []
        join_names = []
        right_drop = []
        left_drop = []

        left, right = self.left, self.right

        is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
        is_rkey = lambda x: is_array_like(x) and len(x) == len(right)

        # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
        # user could, for example, request 'left_index' and 'left_by'. In a
        # regular pd.merge(), users cannot specify both 'left_index' and
        # 'left_on'. (Instead, users have a MultiIndex). That means the
        # self.left_on in this function is always empty in a pd.merge(), but
        # a pd.merge_asof(left_index=True, left_by=...) will result in a
        # self.left_on array with a None in the middle of it. This requires
        # a work-around as designated in the code below.
        # See _validate_specification() for where this happens.

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            for lk, rk in zip(self.left_on, self.right_on):
                if is_lkey(lk):
                    left_keys.append(lk)
                    if is_rkey(rk):
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        if rk is not None:
                            right_keys.append(
                                right._get_label_or_level_values(rk))
                            join_names.append(rk)
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                            join_names.append(right.index.name)
                else:
                    if not is_rkey(rk):
                        if rk is not None:
                            right_keys.append(
                                right._get_label_or_level_values(rk))
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                        if lk is not None and lk == rk:
                            # avoid key upcast in corner case (length-0)
                            if len(left) > 0:
                                right_drop.append(rk)
                            else:
                                left_drop.append(lk)
                    else:
                        right_keys.append(rk)
                    if lk is not None:
                        left_keys.append(left._get_label_or_level_values(lk))
                        join_names.append(lk)
                    else:
                        # work-around for merge_asof(left_index=True)
                        left_keys.append(left.index)
                        join_names.append(left.index.name)
        elif _any(self.left_on):
            for k in self.left_on:
                if is_lkey(k):
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    left_keys.append(left._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                right_keys = [lev._values.take(lev_codes) for lev, lev_codes
                              in zip(self.right.index.levels,
                                     self.right.index.codes)]
            else:
                right_keys = [self.right.index._values]
        elif _any(self.right_on):
            for k in self.right_on:
                if is_rkey(k):
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    right_keys.append(right._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                left_keys = [lev._values.take(lev_codes) for lev, lev_codes
                             in zip(self.left.index.levels,
                                    self.left.index.codes)]
            else:
                left_keys = [self.left.index.values]

        if left_drop:
            self.left = self.left._drop_labels_or_levels(left_drop)

        if right_drop:
            self.right = self.right._drop_labels_or_levels(right_drop)

        return left_keys, right_keys, join_names
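
    # A minimal sketch of what ``_get_merge_keys`` produces for a plain
    # column-on-column merge (internal API, subject to change; the frames
    # below are made up):
    #
    # >>> op = _MergeOperation(pd.DataFrame({'k': [1, 2]}),
    # ...                      pd.DataFrame({'k': [2, 3]}), on='k')
    # >>> op.join_names
    # ['k']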
    def _maybe_coerce_merge_keys(self):
        # we have valid mergees but we may have to further
        # coerce these if they are originally incompatible types
        #
        # for example if these are categorical, but are not dtype_equal
        # or if we have object and integer dtypes

        for lk, rk, name in zip(self.left_join_keys,
                                self.right_join_keys,
                                self.join_names):
            if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
                continue

            lk_is_cat = is_categorical_dtype(lk)
            rk_is_cat = is_categorical_dtype(rk)
            lk_is_object = is_object_dtype(lk)
            rk_is_object = is_object_dtype(rk)

            # if either left or right is a categorical
            # then they must match exactly in categories & ordered
            if lk_is_cat and rk_is_cat:
                if lk.is_dtype_equal(rk):
                    continue

            elif lk_is_cat or rk_is_cat:
                pass

            elif is_dtype_equal(lk.dtype, rk.dtype):
                continue

            msg = ("You are trying to merge on {lk_dtype} and "
                   "{rk_dtype} columns. If you wish to proceed "
                   "you should use pd.concat".format(lk_dtype=lk.dtype,
                                                     rk_dtype=rk.dtype))

            # if we are numeric, then allow differing
            # kinds to proceed, eg. int64 and int8, int and float
            # further if we are object, but we infer to
            # the same, then proceed
            if is_numeric_dtype(lk) and is_numeric_dtype(rk):
                if lk.dtype.kind == rk.dtype.kind:
                    continue

                # check whether ints and floats
                elif is_integer_dtype(rk) and is_float_dtype(lk):
                    if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():
                        warnings.warn('You are merging on int and float '
                                      'columns where the float values '
                                      'are not equal to their int '
                                      'representation', UserWarning)
                    continue

                elif is_float_dtype(rk) and is_integer_dtype(lk):
                    if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all():
                        warnings.warn('You are merging on int and float '
                                      'columns where the float values '
                                      'are not equal to their int '
                                      'representation', UserWarning)
                    continue

                # let's infer and see if we are ok
                elif (lib.infer_dtype(lk, skipna=False)
                        == lib.infer_dtype(rk, skipna=False)):
                    continue

            # Check if we are trying to merge on obviously
            # incompatible dtypes GH 9780, GH 15800

            # bool values are coerced to object
            elif ((lk_is_object and is_bool_dtype(rk)) or
                  (is_bool_dtype(lk) and rk_is_object)):
                pass

            # object values are allowed to be merged
            elif ((lk_is_object and is_numeric_dtype(rk)) or
                  (is_numeric_dtype(lk) and rk_is_object)):
                inferred_left = lib.infer_dtype(lk, skipna=False)
                inferred_right = lib.infer_dtype(rk, skipna=False)
                bool_types = ['integer', 'mixed-integer', 'boolean', 'empty']
                string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty']

                # inferred bool
                if (inferred_left in bool_types and
                        inferred_right in bool_types):
                    pass

                # unless we are merging non-string-like with string-like
                elif ((inferred_left in string_types and
                       inferred_right not in string_types) or
                      (inferred_right in string_types and
                       inferred_left not in string_types)):
                    raise ValueError(msg)

            # datetimelikes must match exactly
            elif is_datetimelike(lk) and not is_datetimelike(rk):
                raise ValueError(msg)
            elif not is_datetimelike(lk) and is_datetimelike(rk):
                raise ValueError(msg)
            elif is_datetime64tz_dtype(lk) and not is_datetime64tz_dtype(rk):
                raise ValueError(msg)
            elif not is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
                raise ValueError(msg)

            elif lk_is_object and rk_is_object:
                continue

            # Houston, we have a problem!
            # let's coerce to object if the dtypes aren't
            # categorical, otherwise coerce to the category
            # dtype. If we coerced categories to object,
            # then we would lose type information on some
            # columns, and end up trying to merge
            # incompatible dtypes. See GH 16900.
            if name in self.left.columns:
                typ = lk.categories.dtype if lk_is_cat else object
                self.left = self.left.assign(
                    **{name: self.left[name].astype(typ)})
            if name in self.right.columns:
                typ = rk.categories.dtype if rk_is_cat else object
                self.right = self.right.assign(
                    **{name: self.right[name].astype(typ)})
    def _validate_specification(self):
        # Hm, any way to make this logic less complicated??
        if self.on is None and self.left_on is None and self.right_on is None:

            if self.left_index and self.right_index:
                self.left_on, self.right_on = (), ()
            elif self.left_index:
                if self.right_on is None:
                    raise MergeError('Must pass right_on or right_index=True')
            elif self.right_index:
                if self.left_on is None:
                    raise MergeError('Must pass left_on or left_index=True')
            else:
                # use the common columns
                common_cols = self.left.columns.intersection(
                    self.right.columns)
                if len(common_cols) == 0:
                    raise MergeError(
                        'No common columns to perform merge on. '
                        'Merge options: left_on={lon}, right_on={ron}, '
                        'left_index={lidx}, right_index={ridx}'
                        .format(lon=self.left_on, ron=self.right_on,
                                lidx=self.left_index, ridx=self.right_index))
                if not common_cols.is_unique:
                    raise MergeError("Data columns not unique: {common!r}"
                                     .format(common=common_cols))
                self.left_on = self.right_on = common_cols
        elif self.on is not None:
            if self.left_on is not None or self.right_on is not None:
                raise MergeError('Can only pass argument "on" OR "left_on" '
                                 'and "right_on", not a combination of both.')
            self.left_on = self.right_on = self.on
        elif self.left_on is not None:
            n = len(self.left_on)
            if self.right_index:
                if len(self.left_on) != self.right.index.nlevels:
                    raise ValueError('len(left_on) must equal the number '
                                     'of levels in the index of "right"')
                self.right_on = [None] * n
        elif self.right_on is not None:
            n = len(self.right_on)
            if self.left_index:
                if len(self.right_on) != self.left.index.nlevels:
                    raise ValueError('len(right_on) must equal the number '
                                     'of levels in the index of "left"')
                self.left_on = [None] * n
        if len(self.right_on) != len(self.left_on):
            raise ValueError("len(right_on) must equal len(left_on)")
    def _validate(self, validate):

        # Check uniqueness of each
        if self.left_index:
            left_unique = self.orig_left.index.is_unique
        else:
            left_unique = MultiIndex.from_arrays(self.left_join_keys
                                                 ).is_unique

        if self.right_index:
            right_unique = self.orig_right.index.is_unique
        else:
            right_unique = MultiIndex.from_arrays(self.right_join_keys
                                                  ).is_unique

        # Check data integrity
        if validate in ["one_to_one", "1:1"]:
            if not left_unique and not right_unique:
                raise MergeError("Merge keys are not unique in either left"
                                 " or right dataset; not a one-to-one merge")
            elif not left_unique:
                raise MergeError("Merge keys are not unique in left dataset;"
                                 " not a one-to-one merge")
            elif not right_unique:
                raise MergeError("Merge keys are not unique in right dataset;"
                                 " not a one-to-one merge")

        elif validate in ["one_to_many", "1:m"]:
            if not left_unique:
                raise MergeError("Merge keys are not unique in left dataset;"
                                 " not a one-to-many merge")

        elif validate in ["many_to_one", "m:1"]:
            if not right_unique:
                raise MergeError("Merge keys are not unique in right dataset;"
                                 " not a many-to-one merge")

        elif validate in ['many_to_many', 'm:m']:
            pass

        else:
            raise ValueError("Not a valid argument for validate")
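
    # A minimal sketch of the ``validate`` checks (illustrative frames):
    # duplicated keys on the right make a one-to-one merge fail, while the
    # same frames pass a one-to-many check.
    #
    # >>> l = pd.DataFrame({'key': ['a', 'b'], 'x': [1, 2]})
    # >>> r = pd.DataFrame({'key': ['a', 'a'], 'y': [3, 4]})
    # >>> pd.merge(l, r, on='key', validate='one_to_one')
    # Traceback (most recent call last):
    #     ...
    # MergeError: Merge keys are not unique in right dataset; not a
    # one-to-one merge
    # >>> _ = pd.merge(l, r, on='key', validate='one_to_many')  # passes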


def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
                       **kwargs):
    """

    Parameters
    ----------
    left_keys: ndarray, Index, Series
    right_keys: ndarray, Index, Series
    sort: boolean, default False
    how: string {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    tuple of (left_indexer, right_indexer)
        indexers into the left_keys, right_keys
    """
    from functools import partial

    assert len(left_keys) == len(right_keys), \
        'left_key and right_keys must be the same length'

    # bind `sort` arg. of _factorize_keys
    fkeys = partial(_factorize_keys, sort=sort)

    # get left & right join labels and num. of levels at each location
    llab, rlab, shape = map(list, zip(* map(fkeys, left_keys, right_keys)))

    # get flat i8 keys from label lists
    lkey, rkey = _get_join_keys(llab, rlab, shape, sort)

    # factorize keys to a dense i8 space
    # `count` is the num. of unique keys
    # set(lkey) | set(rkey) == range(count)
    lkey, rkey, count = fkeys(lkey, rkey)

    # preserve left frame order if how == 'left' and sort == False
    kwargs = copy.copy(kwargs)
    if how == 'left':
        kwargs['sort'] = sort
    join_func = _join_functions[how]

    return join_func(lkey, rkey, count, **kwargs)
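
# A minimal sketch of the indexers this returns (internal helper, shown for
# illustration only): an inner join of ['a', 'b'] with ['b', 'c'] matches
# only 'b', at position 1 on the left and position 0 on the right.
#
# >>> _get_join_indexers([np.array(['a', 'b'], dtype=object)],
# ...                    [np.array(['b', 'c'], dtype=object)])
# (array([1]), array([0]))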


def _restore_dropped_levels_multijoin(left, right, dropped_level_names,
                                      join_index, lindexer, rindexer):
    """
    *this is an internal non-public method*

    Returns the levels, labels and names of a multi-index to multi-index join.
    Depending on the type of join, this method restores the appropriate
    dropped levels of the joined multi-index.
    The method relies on lindexer, rindexer which hold the index positions of
    left and right, where a join was feasible

    Parameters
    ----------
    left : MultiIndex
        left index
    right : MultiIndex
        right index
    dropped_level_names : str array
        list of non-common level names
    join_index : MultiIndex
        the index of the join between the
        common levels of left and right
    lindexer : intp array
        left indexer
    rindexer : intp array
        right indexer

    Returns
    -------
    levels : list of Index
        levels of combined multiindexes
    labels : intp array
        labels of combined multiindexes
    names : str array
        names of combined multiindexes
    """

    def _convert_to_multiindex(index):
        if isinstance(index, MultiIndex):
            return index
        else:
            return MultiIndex.from_arrays([index.values],
                                          names=[index.name])

    # For multi-multi joins with one overlapping level,
    # the returned index is of type Index
    # Assure that join_index is of type MultiIndex
    # so that dropped levels can be appended
    join_index = _convert_to_multiindex(join_index)

    join_levels = join_index.levels
    join_codes = join_index.codes
    join_names = join_index.names

    # lindexer and rindexer hold the indexes where the join occurred
    # for left and right respectively. If left/right is None then
    # the join occurred on all indices of left/right
    if lindexer is None:
        lindexer = range(left.size)

    if rindexer is None:
        rindexer = range(right.size)

    # Iterate through the levels that must be restored
    for dropped_level_name in dropped_level_names:
        if dropped_level_name in left.names:
            idx = left
            indexer = lindexer
        else:
            idx = right
            indexer = rindexer

        # The index of the level name to be restored
        name_idx = idx.names.index(dropped_level_name)

        restore_levels = idx.levels[name_idx]
        # Inject -1 in the codes list where a join was not possible
        # IOW indexer[i]=-1
        codes = idx.codes[name_idx]
        restore_codes = algos.take_nd(codes, indexer, fill_value=-1)

        join_levels = join_levels + [restore_levels]
        join_codes = join_codes + [restore_codes]
        join_names = join_names + [dropped_level_name]

    return join_levels, join_codes, join_names


class _OrderedMerge(_MergeOperation):
    _merge_type = 'ordered_merge'

    def __init__(self, left, right, on=None, left_on=None, right_on=None,
                 left_index=False, right_index=False, axis=1,
                 suffixes=('_x', '_y'), copy=True,
                 fill_method=None, how='outer'):

        self.fill_method = fill_method
        _MergeOperation.__init__(self, left, right, on=on, left_on=left_on,
                                 left_index=left_index,
                                 right_index=right_index,
                                 right_on=right_on, axis=axis,
                                 how=how, suffixes=suffixes,
                                 sort=True  # factorize sorts
                                 )

    def get_result(self):
        join_index, left_indexer, right_indexer = self._get_join_info()

        # this is a bit kludgy
        ldata, rdata = self.left._data, self.right._data
        lsuf, rsuf = self.suffixes

        llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf,
                                                     rdata.items, rsuf)

        if self.fill_method == 'ffill':
            left_join_indexer = libjoin.ffill_indexer(left_indexer)
            right_join_indexer = libjoin.ffill_indexer(right_indexer)
        else:
            left_join_indexer = left_indexer
            right_join_indexer = right_indexer

        lindexers = {
            1: left_join_indexer} if left_join_indexer is not None else {}
        rindexers = {
            1: right_join_indexer} if right_join_indexer is not None else {}

        result_data = concatenate_block_managers(
            [(ldata, lindexers), (rdata, rindexers)],
            axes=[llabels.append(rlabels), join_index],
            concat_axis=0, copy=self.copy)

        typ = self.left._constructor
        result = typ(result_data).__finalize__(self, method=self._merge_type)

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        return result


def _asof_function(direction):
    name = 'asof_join_{dir}'.format(dir=direction)
    return getattr(libjoin, name, None)


def _asof_by_function(direction):
    name = 'asof_join_{dir}_on_X_by_Y'.format(dir=direction)
    return getattr(libjoin, name, None)


_type_casters = {
    'int64_t': ensure_int64,
    'double': ensure_float64,
    'object': ensure_object,
}


def _get_cython_type_upcast(dtype):
    """ Upcast a dtype to 'int64_t', 'double', or 'object' """
    if is_integer_dtype(dtype):
        return 'int64_t'
    elif is_float_dtype(dtype):
        return 'double'
    else:
        return 'object'
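
# The upcast mapping above in practice (doctest-style, illustrative):
#
# >>> _get_cython_type_upcast(np.dtype('int8'))
# 'int64_t'
# >>> _get_cython_type_upcast(np.dtype('float32'))
# 'double'
# >>> _get_cython_type_upcast(np.dtype('object'))
# 'object'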
  1083. class _AsOfMerge(_OrderedMerge):
  1084. _merge_type = 'asof_merge'
  1085. def __init__(self, left, right, on=None, left_on=None, right_on=None,
  1086. left_index=False, right_index=False,
  1087. by=None, left_by=None, right_by=None,
  1088. axis=1, suffixes=('_x', '_y'), copy=True,
  1089. fill_method=None,
  1090. how='asof', tolerance=None,
  1091. allow_exact_matches=True,
  1092. direction='backward'):
  1093. self.by = by
  1094. self.left_by = left_by
  1095. self.right_by = right_by
  1096. self.tolerance = tolerance
  1097. self.allow_exact_matches = allow_exact_matches
  1098. self.direction = direction
  1099. _OrderedMerge.__init__(self, left, right, on=on, left_on=left_on,
  1100. right_on=right_on, left_index=left_index,
  1101. right_index=right_index, axis=axis,
  1102. how=how, suffixes=suffixes,
  1103. fill_method=fill_method)
  1104. def _validate_specification(self):
  1105. super(_AsOfMerge, self)._validate_specification()
  1106. # we only allow on to be a single item for on
  1107. if len(self.left_on) != 1 and not self.left_index:
  1108. raise MergeError("can only asof on a key for left")
  1109. if len(self.right_on) != 1 and not self.right_index:
  1110. raise MergeError("can only asof on a key for right")
  1111. if self.left_index and isinstance(self.left.index, MultiIndex):
  1112. raise MergeError("left can only have one index")
  1113. if self.right_index and isinstance(self.right.index, MultiIndex):
  1114. raise MergeError("right can only have one index")
  1115. # set 'by' columns
  1116. if self.by is not None:
  1117. if self.left_by is not None or self.right_by is not None:
  1118. raise MergeError('Can only pass by OR left_by '
  1119. 'and right_by')
  1120. self.left_by = self.right_by = self.by
  1121. if self.left_by is None and self.right_by is not None:
  1122. raise MergeError('missing left_by')
  1123. if self.left_by is not None and self.right_by is None:
  1124. raise MergeError('missing right_by')
  1125. # add 'by' to our key-list so we can have it in the
  1126. # output as a key
  1127. if self.left_by is not None:
  1128. if not is_list_like(self.left_by):
  1129. self.left_by = [self.left_by]
  1130. if not is_list_like(self.right_by):
  1131. self.right_by = [self.right_by]
  1132. if len(self.left_by) != len(self.right_by):
  1133. raise MergeError('left_by and right_by must be same length')
  1134. self.left_on = self.left_by + list(self.left_on)
  1135. self.right_on = self.right_by + list(self.right_on)
  1136. # check 'direction' is valid
  1137. if self.direction not in ['backward', 'forward', 'nearest']:
  1138. raise MergeError('direction invalid: {direction}'
  1139. .format(direction=self.direction))
  1140. @property
  1141. def _asof_key(self):
  1142. """ This is our asof key, the 'on' """
  1143. return self.left_on[-1]
  1144. def _get_merge_keys(self):
  1145. # note this function has side effects
  1146. (left_join_keys,
  1147. right_join_keys,
  1148. join_names) = super(_AsOfMerge, self)._get_merge_keys()
  1149. # validate index types are the same
  1150. for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
  1151. if not is_dtype_equal(lk.dtype, rk.dtype):
  1152. raise MergeError("incompatible merge keys [{i}] {lkdtype} and "
  1153. "{rkdtype}, must be the same type"
  1154. .format(i=i, lkdtype=lk.dtype,
  1155. rkdtype=rk.dtype))
  1156. # validate tolerance; must be a Timedelta if we have a DTI
  1157. if self.tolerance is not None:
  1158. if self.left_index:
  1159. lt = self.left.index
  1160. else:
  1161. lt = left_join_keys[-1]
  1162. msg = ("incompatible tolerance {tolerance}, must be compat "
  1163. "with type {lkdtype}".format(
  1164. tolerance=type(self.tolerance),
  1165. lkdtype=lt.dtype))
  1166. if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt):
  1167. if not isinstance(self.tolerance, Timedelta):
  1168. raise MergeError(msg)
  1169. if self.tolerance < Timedelta(0):
  1170. raise MergeError("tolerance must be positive")
  1171. elif is_int64_dtype(lt):
  1172. if not is_integer(self.tolerance):
  1173. raise MergeError(msg)
  1174. if self.tolerance < 0:
  1175. raise MergeError("tolerance must be positive")
  1176. elif is_float_dtype(lt):
  1177. if not is_number(self.tolerance):
  1178. raise MergeError(msg)
  1179. if self.tolerance < 0:
  1180. raise MergeError("tolerance must be positive")
  1181. else:
  1182. raise MergeError("key must be integer, timestamp or float")
  1183. # validate allow_exact_matches
  1184. if not is_bool(self.allow_exact_matches):
  1185. msg = "allow_exact_matches must be boolean, passed {passed}"
  1186. raise MergeError(msg.format(passed=self.allow_exact_matches))
  1187. return left_join_keys, right_join_keys, join_names
  1188. def _get_join_indexers(self):
  1189. """ return the join indexers """
  1190. def flip(xs):
  1191. """ unlike np.transpose, this returns an array of tuples """
  1192. labels = list(string.ascii_lowercase[:len(xs)])
  1193. dtypes = [x.dtype for x in xs]
  1194. labeled_dtypes = list(zip(labels, dtypes))
  1195. return np.array(lzip(*xs), labeled_dtypes)
  1196. # values to compare
  1197. left_values = (self.left.index.values if self.left_index else
  1198. self.left_join_keys[-1])
  1199. right_values = (self.right.index.values if self.right_index else
  1200. self.right_join_keys[-1])
  1201. tolerance = self.tolerance
  1202. # we require sortedness and non-null values in the join keys
  1203. msg_sorted = "{side} keys must be sorted"
  1204. msg_missings = "Merge keys contain null values on {side} side"
  1205. if not Index(left_values).is_monotonic:
  1206. if isnull(left_values).any():
  1207. raise ValueError(msg_missings.format(side='left'))
  1208. else:
  1209. raise ValueError(msg_sorted.format(side='left'))
  1210. if not Index(right_values).is_monotonic:
  1211. if isnull(right_values).any():
  1212. raise ValueError(msg_missings.format(side='right'))
  1213. else:
  1214. raise ValueError(msg_sorted.format(side='right'))

        # initial type conversion as needed
        if needs_i8_conversion(left_values):
            left_values = left_values.view('i8')
            right_values = right_values.view('i8')
            if tolerance is not None:
                tolerance = tolerance.value

        # a "by" parameter requires special handling
        if self.left_by is not None:
            # remove 'on' parameter from values if one existed
            if self.left_index and self.right_index:
                left_by_values = self.left_join_keys
                right_by_values = self.right_join_keys
            else:
                left_by_values = self.left_join_keys[0:-1]
                right_by_values = self.right_join_keys[0:-1]

            # get tuple representation of values if more than one
            if len(left_by_values) == 1:
                left_by_values = left_by_values[0]
                right_by_values = right_by_values[0]
            else:
                left_by_values = flip(left_by_values)
                right_by_values = flip(right_by_values)

            # upcast 'by' parameter because HashTable is limited
            by_type = _get_cython_type_upcast(left_by_values.dtype)
            by_type_caster = _type_casters[by_type]
            left_by_values = by_type_caster(left_by_values)
            right_by_values = by_type_caster(right_by_values)

            # choose appropriate function by type
            func = _asof_by_function(self.direction)
            return func(left_values,
                        right_values,
                        left_by_values,
                        right_by_values,
                        self.allow_exact_matches,
                        tolerance)
        else:
            # choose appropriate function by type
            func = _asof_function(self.direction)
            return func(left_values,
                        right_values,
                        self.allow_exact_matches,
                        tolerance)
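

# A minimal sketch, using only the public API, of the 'by' dispatch
# above: with by='ticker', matching happens within each ticker group via
# the group-aware cython routine. The frames and the _demo_* name are
# hypothetical, for illustration only.
def _demo_asof_by_dispatch():
    import pandas as pd
    quotes = pd.DataFrame({'time': [1, 2, 3, 4],
                           'ticker': ['A', 'B', 'A', 'B'],
                           'bid': [10.0, 20.0, 11.0, 21.0]})
    trades = pd.DataFrame({'time': [3, 4],
                           'ticker': ['A', 'B'],
                           'price': [10.5, 20.5]})
    # backward search within each ticker: trade A@3 -> quote A@3,
    # trade B@4 -> quote B@4
    return pd.merge_asof(trades, quotes, on='time', by='ticker',
                         direction='backward')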


def _get_multiindex_indexer(join_keys, index, sort):
    """
    Compute left/right indexers for a left join of `join_keys` against
    the levels of the MultiIndex `index`.
    """
    from functools import partial

    # bind `sort` argument
    fkeys = partial(_factorize_keys, sort=sort)

    # left & right join labels and num. of levels at each location
    rcodes, lcodes, shape = map(list, zip(*map(fkeys,
                                               index.levels,
                                               join_keys)))
    if sort:
        rcodes = list(map(np.take, rcodes, index.codes))
    else:
        i8copy = lambda a: a.astype('i8', subok=False, copy=True)
        rcodes = list(map(i8copy, index.codes))

    # fix right labels if there were any nulls
    for i in range(len(join_keys)):
        mask = index.codes[i] == -1
        if mask.any():
            # check whether the left keys already contain a null at this
            # level; if they do, it was factorized to `shape[i] - 1`
            a = join_keys[i][lcodes[i] == shape[i] - 1]
            if a.size == 0 or not a[0] != a[0]:
                # no null on the left side: allocate a new code for the
                # right-side nulls
                shape[i] += 1
                rcodes[i][mask] = shape[i] - 1

    # get flat i8 join keys
    lkey, rkey = _get_join_keys(lcodes, rcodes, shape, sort)

    # factorize keys to a dense i8 space
    lkey, rkey, count = fkeys(lkey, rkey)

    return libjoin.left_outer_join(lkey, rkey, count, sort=sort)


def _get_single_indexer(join_key, index, sort=False):
    left_key, right_key, count = _factorize_keys(join_key, index, sort=sort)

    left_indexer, right_indexer = libjoin.left_outer_join(
        ensure_int64(left_key),
        ensure_int64(right_key),
        count, sort=sort)

    return left_indexer, right_indexer
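

# A minimal sketch (hypothetical _demo_* helper) of the idea behind
# _get_single_indexer: factorize both key arrays into one shared integer
# space, then join positionally on the integer codes. pd.factorize on the
# concatenation stands in for _factorize_keys here.
def _demo_single_indexer_concept():
    import numpy as np
    import pandas as pd
    join_key = np.array(['b', 'a', 'b'], dtype=object)
    index_vals = np.array(['a', 'b'], dtype=object)
    codes, _ = pd.factorize(np.concatenate([join_key, index_vals]))
    left_key, right_key = codes[:len(join_key)], codes[len(join_key):]
    return left_key, right_key  # shared codes, ready for the cython join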


def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
    if len(join_keys) > 1:
        if not (isinstance(right_ax, MultiIndex) and
                len(join_keys) == right_ax.nlevels):
            raise AssertionError("If more than one join key is given then "
                                 "'right_ax' must be a MultiIndex and the "
                                 "number of join keys must equal the number "
                                 "of levels in right_ax")

        left_indexer, right_indexer = \
            _get_multiindex_indexer(join_keys, right_ax, sort=sort)
    else:
        jkey = join_keys[0]

        left_indexer, right_indexer = \
            _get_single_indexer(jkey, right_ax, sort=sort)

    if sort or len(left_ax) != len(left_indexer):
        # if asked to sort or there are 1-to-many matches
        join_index = left_ax.take(left_indexer)
        return join_index, left_indexer, right_indexer

    # left frame preserves order & length of its index
    return left_ax, None, right_indexer
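

# A minimal sketch, via the public API, of the fast path above: when the
# right keys are unique there are no 1-to-many matches, so a left join
# keeps the left index untouched (order and length preserved). The frames
# and the _demo_* name are hypothetical.
def _demo_left_join_preserves_order():
    import pandas as pd
    left = pd.DataFrame({'k': ['b', 'a', 'b']})
    right = pd.DataFrame({'v': [1, 2]}, index=['a', 'b'])
    return left.join(right, on='k')  # rows stay in left's original order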


def _right_outer_join(x, y, max_groups):
    # a right outer join is a left outer join with the operands swapped
    right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups)
    return left_indexer, right_indexer


_join_functions = {
    'inner': libjoin.inner_join,
    'left': libjoin.left_outer_join,
    'right': _right_outer_join,
    'outer': libjoin.full_outer_join,
}
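

# A minimal public-API analogue (hypothetical frames and _demo_* name) of
# the _right_outer_join trick above: a right join is equivalent to a left
# join with the operands swapped, modulo column order.
def _demo_right_join_as_flipped_left():
    import pandas as pd
    left = pd.DataFrame({'k': [1, 2], 'a': ['x', 'y']})
    right = pd.DataFrame({'k': [2, 3], 'b': ['p', 'q']})
    r1 = pd.merge(left, right, on='k', how='right')
    r2 = pd.merge(right, left, on='k', how='left')[r1.columns]
    return r1.equals(r2)  # True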


def _factorize_keys(lk, rk, sort=True):
    """
    Encode the left/right keys as integer labels in a shared space,
    returning (left_labels, right_labels, count); null values, if any,
    are assigned one extra group at the end.
    """
    # Some pre-processing for non-ndarray lk / rk
    if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
        lk = lk._data
        rk = rk._data

    elif (is_categorical_dtype(lk) and
            is_categorical_dtype(rk) and
            lk.is_dtype_equal(rk)):
        if lk.categories.equals(rk.categories):
            # if we exactly match in categories, allow us to factorize
            # on codes
            rk = rk.codes
        else:
            # Same categories in different orders -> recode
            rk = _recode_for_categories(rk.codes, rk.categories,
                                        lk.categories)

        lk = ensure_int64(lk.codes)
        rk = ensure_int64(rk)

    elif (is_extension_array_dtype(lk.dtype) and
          is_extension_array_dtype(rk.dtype) and
          lk.dtype == rk.dtype):
        lk, _ = lk._values_for_factorize()
        rk, _ = rk._values_for_factorize()

    if is_integer_dtype(lk) and is_integer_dtype(rk):
        # GH#23917 TODO: needs tests for case where lk is integer-dtype
        #  and rk is datetime-dtype
        klass = libhashtable.Int64Factorizer
        lk = ensure_int64(com.values_from_object(lk))
        rk = ensure_int64(com.values_from_object(rk))
    elif (issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and
          issubclass(rk.dtype.type, (np.timedelta64, np.datetime64))):
        # GH#23917 TODO: needs tests for non-matching dtypes
        klass = libhashtable.Int64Factorizer
        lk = ensure_int64(com.values_from_object(lk))
        rk = ensure_int64(com.values_from_object(rk))
    else:
        klass = libhashtable.Factorizer
        lk = ensure_object(lk)
        rk = ensure_object(rk)

    rizer = klass(max(len(lk), len(rk)))

    llab = rizer.factorize(lk)
    rlab = rizer.factorize(rk)

    count = rizer.get_count()

    if sort:
        uniques = rizer.uniques.to_array()
        llab, rlab = _sort_labels(uniques, llab, rlab)

    # NA group
    lmask = llab == -1
    lany = lmask.any()
    rmask = rlab == -1
    rany = rmask.any()

    if lany or rany:
        if lany:
            np.putmask(llab, lmask, count)
        if rany:
            np.putmask(rlab, rmask, count)
        count += 1

    return llab, rlab, count
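

# A minimal sketch (hypothetical _demo_* helper) of the NA-group handling
# above, using the public pd.factorize: missing values come back as the
# sentinel -1 and are then shifted into their own group id `count`.
def _demo_na_group():
    import numpy as np
    import pandas as pd
    codes, uniques = pd.factorize(np.array(['a', 'b', None], dtype=object))
    count = len(uniques)                   # 2
    np.putmask(codes, codes == -1, count)  # None -> its own group (2)
    return codes, count + 1                # array([0, 1, 2]), 3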


def _sort_labels(uniques, left, right):
    if not isinstance(uniques, np.ndarray):
        # tuplesafe
        uniques = Index(uniques).values

    llength = len(left)
    labels = np.concatenate([left, right])

    _, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1)
    new_labels = ensure_int64(new_labels)
    new_left, new_right = new_labels[:llength], new_labels[llength:]

    return new_left, new_right
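

# A minimal numpy-only sketch (hypothetical _demo_* helper) of what
# _sort_labels achieves: remap codes issued in first-seen order into
# codes that index the sorted uniques instead.
def _demo_sort_labels_concept():
    import numpy as np
    uniques = np.array(['b', 'a'], dtype=object)  # first-seen order
    labels = np.array([0, 1, 0])                  # codes into `uniques`
    order = np.argsort(uniques)
    remap = np.empty(len(uniques), dtype=np.int64)
    remap[order] = np.arange(len(uniques))
    return remap[labels]  # array([1, 0, 1]): codes into sorted ['a', 'b']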


def _get_join_keys(llab, rlab, shape, sort):
    # how many levels can be done without overflow
    pred = lambda i: not is_int64_overflow_possible(shape[:i])
    nlev = next(filter(pred, range(len(shape), 0, -1)))

    # get keys for the first `nlev` levels
    stride = np.prod(shape[1:nlev], dtype='i8')
    lkey = stride * llab[0].astype('i8', subok=False, copy=False)
    rkey = stride * rlab[0].astype('i8', subok=False, copy=False)

    for i in range(1, nlev):
        with np.errstate(divide='ignore'):
            stride //= shape[i]
        lkey += llab[i] * stride
        rkey += rlab[i] * stride

    if nlev == len(shape):  # all done!
        return lkey, rkey

    # densify current keys to avoid overflow
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    llab = [lkey] + llab[nlev:]
    rlab = [rkey] + rlab[nlev:]
    shape = [count] + shape[nlev:]

    return _get_join_keys(llab, rlab, shape, sort)
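

# A minimal worked example (hypothetical _demo_* helper) of the
# mixed-radix encoding performed above: with per-level cardinalities
# `shape`, the label pair (i, j) maps to i * shape[1] + j, which is
# unique and order-preserving as long as it fits in int64.
def _demo_mixed_radix_keys():
    import numpy as np
    shape = [3, 4]
    llab = [np.array([0, 1, 2]), np.array([3, 0, 1])]
    stride = np.prod(shape[1:], dtype='i8')        # 4
    key = stride * llab[0].astype('i8') + llab[1]  # array([3, 4, 9])
    return key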


def _should_fill(lname, rname):
    if (not isinstance(lname, compat.string_types) or
            not isinstance(rname, compat.string_types)):
        return True
    return lname == rname


def _any(x):
    return x is not None and com._any_not_none(*x)


def validate_operand(obj):
    if isinstance(obj, DataFrame):
        return obj
    elif isinstance(obj, Series):
        if obj.name is None:
            raise ValueError('Cannot merge a Series without a name')
        else:
            return obj.to_frame()
    else:
        raise TypeError('Can only merge Series or DataFrame objects, '
                        'a {obj} was passed'.format(obj=type(obj)))
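

# A minimal sketch, through the public API, of what validate_operand
# enforces: an unnamed Series is rejected, while a named one is promoted
# to a single-column DataFrame. The _demo_* name is hypothetical.
def _demo_validate_operand():
    import pandas as pd
    df = pd.DataFrame({'key': [1, 2, 3]})
    try:
        pd.merge(df, pd.Series([1, 2, 3]), left_on='key', right_index=True)
    except ValueError:
        pass  # 'Cannot merge a Series without a name'
    named = pd.Series([10, 20, 30], name='val')
    return pd.merge(df, named, left_on='key', right_index=True)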