  1. """
  2. concat routines
  3. """
  4. import numpy as np
  5. import pandas.core.dtypes.concat as _concat
  6. from pandas import DataFrame, Index, MultiIndex, Series, compat
  7. from pandas.core import common as com
  8. from pandas.core.arrays.categorical import (
  9. _factorize_from_iterable, _factorize_from_iterables)
  10. from pandas.core.generic import NDFrame
  11. from pandas.core.index import (
  12. _all_indexes_same, _get_consensus_names, _get_objs_combined_axis,
  13. ensure_index)
  14. import pandas.core.indexes.base as ibase
  15. from pandas.core.internals import concatenate_block_managers

# ---------------------------------------------------------------------
# Concatenate DataFrame objects


def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
           keys=None, levels=None, names=None, verify_integrity=False,
           sort=None, copy=True):
  21. """
  22. Concatenate pandas objects along a particular axis with optional set logic
  23. along the other axes.
  24. Can also add a layer of hierarchical indexing on the concatenation axis,
  25. which may be useful if the labels are the same (or overlapping) on
  26. the passed axis number.
  27. Parameters
  28. ----------
  29. objs : a sequence or mapping of Series, DataFrame, or Panel objects
  30. If a dict is passed, the sorted keys will be used as the `keys`
  31. argument, unless it is passed, in which case the values will be
  32. selected (see below). Any None objects will be dropped silently unless
  33. they are all None in which case a ValueError will be raised
  34. axis : {0/'index', 1/'columns'}, default 0
  35. The axis to concatenate along
  36. join : {'inner', 'outer'}, default 'outer'
  37. How to handle indexes on other axis(es)
  38. join_axes : list of Index objects
  39. Specific indexes to use for the other n - 1 axes instead of performing
  40. inner/outer set logic
  41. ignore_index : boolean, default False
  42. If True, do not use the index values along the concatenation axis. The
  43. resulting axis will be labeled 0, ..., n - 1. This is useful if you are
  44. concatenating objects where the concatenation axis does not have
  45. meaningful indexing information. Note the index values on the other
  46. axes are still respected in the join.
  47. keys : sequence, default None
  48. If multiple levels passed, should contain tuples. Construct
  49. hierarchical index using the passed keys as the outermost level
  50. levels : list of sequences, default None
  51. Specific levels (unique values) to use for constructing a
  52. MultiIndex. Otherwise they will be inferred from the keys
  53. names : list, default None
  54. Names for the levels in the resulting hierarchical index
  55. verify_integrity : boolean, default False
  56. Check whether the new concatenated axis contains duplicates. This can
  57. be very expensive relative to the actual data concatenation
  58. sort : boolean, default None
  59. Sort non-concatenation axis if it is not already aligned when `join`
  60. is 'outer'. The current default of sorting is deprecated and will
  61. change to not-sorting in a future version of pandas.
  62. Explicitly pass ``sort=True`` to silence the warning and sort.
  63. Explicitly pass ``sort=False`` to silence the warning and not sort.
  64. This has no effect when ``join='inner'``, which already preserves
  65. the order of the non-concatenation axis.
  66. .. versionadded:: 0.23.0
  67. copy : boolean, default True
  68. If False, do not copy data unnecessarily
  69. Returns
  70. -------
  71. concatenated : object, type of objs
  72. When concatenating all ``Series`` along the index (axis=0), a
  73. ``Series`` is returned. When ``objs`` contains at least one
  74. ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
  75. the columns (axis=1), a ``DataFrame`` is returned.
  76. See Also
  77. --------
  78. Series.append
  79. DataFrame.append
  80. DataFrame.join
  81. DataFrame.merge
  82. Notes
  83. -----
  84. The keys, levels, and names arguments are all optional.
  85. A walkthrough of how this method fits in with other tools for combining
  86. pandas objects can be found `here
  87. <http://pandas.pydata.org/pandas-docs/stable/merging.html>`__.
  88. Examples
  89. --------
  90. Combine two ``Series``.
  91. >>> s1 = pd.Series(['a', 'b'])
  92. >>> s2 = pd.Series(['c', 'd'])
  93. >>> pd.concat([s1, s2])
  94. 0 a
  95. 1 b
  96. 0 c
  97. 1 d
  98. dtype: object
  99. Clear the existing index and reset it in the result
  100. by setting the ``ignore_index`` option to ``True``.
  101. >>> pd.concat([s1, s2], ignore_index=True)
  102. 0 a
  103. 1 b
  104. 2 c
  105. 3 d
  106. dtype: object
  107. Add a hierarchical index at the outermost level of
  108. the data with the ``keys`` option.
  109. >>> pd.concat([s1, s2], keys=['s1', 's2',])
  110. s1 0 a
  111. 1 b
  112. s2 0 c
  113. 1 d
  114. dtype: object
  115. Label the index keys you create with the ``names`` option.
  116. >>> pd.concat([s1, s2], keys=['s1', 's2'],
  117. ... names=['Series name', 'Row ID'])
  118. Series name Row ID
  119. s1 0 a
  120. 1 b
  121. s2 0 c
  122. 1 d
  123. dtype: object
  124. Combine two ``DataFrame`` objects with identical columns.
  125. >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
  126. ... columns=['letter', 'number'])
  127. >>> df1
  128. letter number
  129. 0 a 1
  130. 1 b 2
  131. >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
  132. ... columns=['letter', 'number'])
  133. >>> df2
  134. letter number
  135. 0 c 3
  136. 1 d 4
  137. >>> pd.concat([df1, df2])
  138. letter number
  139. 0 a 1
  140. 1 b 2
  141. 0 c 3
  142. 1 d 4
  143. Combine ``DataFrame`` objects with overlapping columns
  144. and return everything. Columns outside the intersection will
  145. be filled with ``NaN`` values.
  146. >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
  147. ... columns=['letter', 'number', 'animal'])
  148. >>> df3
  149. letter number animal
  150. 0 c 3 cat
  151. 1 d 4 dog
  152. >>> pd.concat([df1, df3], sort=False)
  153. letter number animal
  154. 0 a 1 NaN
  155. 1 b 2 NaN
  156. 0 c 3 cat
  157. 1 d 4 dog
  158. Combine ``DataFrame`` objects with overlapping columns
  159. and return only those that are shared by passing ``inner`` to
  160. the ``join`` keyword argument.
  161. >>> pd.concat([df1, df3], join="inner")
  162. letter number
  163. 0 a 1
  164. 1 b 2
  165. 0 c 3
  166. 1 d 4
  167. Combine ``DataFrame`` objects horizontally along the x axis by
  168. passing in ``axis=1``.
  169. >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
  170. ... columns=['animal', 'name'])
  171. >>> pd.concat([df1, df4], axis=1)
  172. letter number animal name
  173. 0 a 1 bird polly
  174. 1 b 2 monkey george
  175. Prevent the result from including duplicate index values with the
  176. ``verify_integrity`` option.
  177. >>> df5 = pd.DataFrame([1], index=['a'])
  178. >>> df5
  179. 0
  180. a 1
  181. >>> df6 = pd.DataFrame([2], index=['a'])
  182. >>> df6
  183. 0
  184. a 2
  185. >>> pd.concat([df5, df6], verify_integrity=True)
  186. Traceback (most recent call last):
  187. ...
  188. ValueError: Indexes have overlapping values: ['a']
  189. """
    op = _Concatenator(objs, axis=axis, join_axes=join_axes,
                       ignore_index=ignore_index, join=join,
                       keys=keys, levels=levels, names=names,
                       verify_integrity=verify_integrity,
                       copy=copy, sort=sort)
    return op.get_result()
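

# A minimal sketch of the dict form of ``objs`` (illustrative only, assuming
# pandas is importable as ``pd``): the sorted dict keys become the outermost
# level of the result's index, exactly as if they had been passed via
# ``keys``.
#
#   >>> import pandas as pd
#   >>> pd.concat({'b': pd.Series([3, 4]), 'a': pd.Series([1, 2])})
#   a  0    1
#      1    2
#   b  0    3
#      1    4
#   dtype: int64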


class _Concatenator(object):
    """
    Orchestrates a concatenation operation for BlockManagers
    """

    def __init__(self, objs, axis=0, join='outer', join_axes=None,
                 keys=None, levels=None, names=None,
                 ignore_index=False, verify_integrity=False, copy=True,
                 sort=False):
        if isinstance(objs, (NDFrame, compat.string_types)):
            raise TypeError('first argument must be an iterable of pandas '
                            'objects, you passed an object of type '
                            '"{name}"'.format(name=type(objs).__name__))

        if join == 'outer':
            self.intersect = False
        elif join == 'inner':
            self.intersect = True
        else:  # pragma: no cover
            raise ValueError('Only can inner (intersect) or outer (union) '
                             'join the other axis')

        if isinstance(objs, dict):
            if keys is None:
                keys = sorted(objs)
            objs = [objs[k] for k in keys]
        else:
            objs = list(objs)

        if len(objs) == 0:
            raise ValueError('No objects to concatenate')

        if keys is None:
            objs = list(com._not_none(*objs))
        else:
            # GH#1649: drop None objects and their keys in tandem
            clean_keys = []
            clean_objs = []
            for k, v in zip(keys, objs):
                if v is None:
                    continue
                clean_keys.append(k)
                clean_objs.append(v)
            objs = clean_objs
            name = getattr(keys, 'name', None)
            keys = Index(clean_keys, name=name)

        if len(objs) == 0:
            raise ValueError('All objects passed were None')
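
        # A hedged illustration of the filtering above (not executed here):
        # ``pd.concat([s1, None, s2], keys=['x', 'y', 'z'])`` drops the None
        # together with its key 'y', leaving keys ['x', 'z'] paired with the
        # two surviving objects.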

        # consolidate data & figure out what our result ndim is going to be
        ndims = set()
        for obj in objs:
            if not isinstance(obj, NDFrame):
                msg = ('cannot concatenate object of type "{0}";'
                       ' only pd.Series, pd.DataFrame, and pd.Panel'
                       ' (deprecated) objs are valid'.format(type(obj)))
                raise TypeError(msg)

            # consolidate
            obj._consolidate(inplace=True)
            ndims.add(obj.ndim)

        # get the sample
        # want the highest ndim that we have, and must be non-empty
        # unless all objs are empty
        sample = None
        if len(ndims) > 1:
            max_ndim = max(ndims)
            for obj in objs:
                if obj.ndim == max_ndim and np.sum(obj.shape):
                    sample = obj
                    break
        else:
            # filter out the empties unless we have multi-index possibilities;
            # note: keep empty Series, as they affect the result's
            # columns / name
            non_empties = [obj for obj in objs
                           if sum(obj.shape) > 0 or isinstance(obj, Series)]

            if (len(non_empties) and (keys is None and names is None and
                                      levels is None and
                                      join_axes is None and
                                      not self.intersect)):
                objs = non_empties
                sample = objs[0]

        if sample is None:
            sample = objs[0]
        self.objs = objs

        # Standardize axis parameter to int
        if isinstance(sample, Series):
            axis = DataFrame._get_axis_number(axis)
        else:
            axis = sample._get_axis_number(axis)

        # Need to flip BlockManager axis in the DataFrame special case
        self._is_frame = isinstance(sample, DataFrame)
        if self._is_frame:
            axis = 1 if axis == 0 else 0

        self._is_series = isinstance(sample, Series)
        if not 0 <= axis <= sample.ndim:
            raise AssertionError("axis must be between 0 and {ndim}, input was"
                                 " {axis}".format(ndim=sample.ndim, axis=axis))

        # if we have mixed ndims, then convert to highest ndim
        # creating column numbers as needed
        if len(ndims) > 1:
            current_column = 0
            max_ndim = sample.ndim
            self.objs, objs = [], self.objs
            for obj in objs:
                ndim = obj.ndim
                if ndim == max_ndim:
                    pass
                elif ndim != max_ndim - 1:
                    raise ValueError("cannot concatenate unaligned mixed "
                                     "dimensional NDFrame objects")
                else:
                    name = getattr(obj, 'name', None)
                    if ignore_index or name is None:
                        name = current_column
                        current_column += 1

                    # doing a row-wise concatenation so need everything
                    # to line up
                    if self._is_frame and axis == 1:
                        name = 0
                    obj = sample._constructor({name: obj})

                self.objs.append(obj)

        # note: this is the BlockManager axis (since DataFrame is transposed)
        self.axis = axis
        self.join_axes = join_axes
        self.keys = keys
        self.names = names or getattr(keys, 'names', None)
        self.levels = levels
        self.sort = sort

        self.ignore_index = ignore_index
        self.verify_integrity = verify_integrity
        self.copy = copy

        self.new_axes = self._get_new_axes()
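
    # A sketch of the mixed-ndim promotion handled in ``__init__`` above
    # (illustrative, assuming ``pd`` is pandas): an unnamed Series
    # concatenated with a DataFrame along axis=1 is promoted to a one-column
    # DataFrame whose column label is the next free integer.
    #
    #   >>> pd.concat([pd.DataFrame({'a': [1, 2]}), pd.Series([3, 4])],
    #   ...           axis=1)
    #      a  0
    #   0  1  3
    #   1  2  4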

    def get_result(self):

        # series only
        if self._is_series:

            # stack blocks
            if self.axis == 0:
                name = com.consensus_name_attr(self.objs)

                mgr = self.objs[0]._data.concat([x._data for x in self.objs],
                                                self.new_axes)
                cons = _concat._get_series_result_type(mgr, self.objs)
                return cons(mgr, name=name).__finalize__(self,
                                                         method='concat')

            # combine as columns in a frame
            else:
                data = dict(zip(range(len(self.objs)), self.objs))
                cons = _concat._get_series_result_type(data)

                index, columns = self.new_axes
                df = cons(data, index=index)
                df.columns = columns
                return df.__finalize__(self, method='concat')

        # combine block managers
        else:
            mgrs_indexers = []
            for obj in self.objs:
                mgr = obj._data
                indexers = {}
                for ax, new_labels in enumerate(self.new_axes):
                    if ax == self.axis:
                        # Suppress reindexing on concat axis
                        continue

                    obj_labels = mgr.axes[ax]
                    if not new_labels.equals(obj_labels):
                        indexers[ax] = obj_labels.reindex(new_labels)[1]

                mgrs_indexers.append((obj._data, indexers))

            new_data = concatenate_block_managers(
                mgrs_indexers, self.new_axes, concat_axis=self.axis,
                copy=self.copy)
            if not self.copy:
                new_data._consolidate_inplace()

            cons = _concat._get_frame_result_type(new_data, self.objs)
            return (cons._from_axes(new_data, self.new_axes)
                    .__finalize__(self, method='concat'))
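
    # A small sketch of the Series-along-axis=1 branch of ``get_result``
    # (illustrative, assuming ``pd`` is pandas): named Series become the
    # column labels of the resulting DataFrame.
    #
    #   >>> pd.concat([pd.Series([1, 2], name='x'),
    #   ...            pd.Series([3, 4], name='y')], axis=1)
    #      x  y
    #   0  1  3
    #   1  2  4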

    def _get_result_dim(self):
        if self._is_series and self.axis == 1:
            return 2
        else:
            return self.objs[0].ndim

    def _get_new_axes(self):
        ndim = self._get_result_dim()
        new_axes = [None] * ndim

        if self.join_axes is None:
            for i in range(ndim):
                if i == self.axis:
                    continue
                new_axes[i] = self._get_comb_axis(i)
        else:
            if len(self.join_axes) != ndim - 1:
                raise AssertionError("length of join_axes must be equal "
                                     "to {length}".format(length=ndim - 1))

            # distribute the passed join_axes over the non-concatenation axes
            indices = compat.lrange(ndim)
            indices.remove(self.axis)
            for i, ax in zip(indices, self.join_axes):
                new_axes[i] = ax

        new_axes[self.axis] = self._get_concat_axis()
        return new_axes
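
    # A hedged sketch of ``join_axes`` (illustrative, assuming ``pd`` is
    # pandas): the passed Index replaces inner/outer set logic on the
    # non-concatenation axis, so the result's columns are taken verbatim
    # from it.
    #
    #   >>> df_a = pd.DataFrame({'a': [1], 'b': [2]})
    #   >>> df_b = pd.DataFrame({'b': [3], 'c': [4]})
    #   >>> pd.concat([df_a, df_b], join_axes=[df_a.columns])
    #        a  b
    #   0  1.0  2
    #   0  NaN  3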

    def _get_comb_axis(self, i):
        data_axis = self.objs[0]._get_block_manager_axis(i)
        try:
            return _get_objs_combined_axis(self.objs, axis=data_axis,
                                           intersect=self.intersect,
                                           sort=self.sort)
        except IndexError:
            types = [type(x).__name__ for x in self.objs]
            raise TypeError("Cannot concatenate list of {types}"
                            .format(types=types))

    def _get_concat_axis(self):
        """
        Return index to be used along concatenation axis.
        """
        if self._is_series:
            if self.axis == 0:
                indexes = [x.index for x in self.objs]
            elif self.ignore_index:
                idx = ibase.default_index(len(self.objs))
                return idx
            elif self.keys is None:
                names = [None] * len(self.objs)
                num = 0
                has_names = False
                for i, x in enumerate(self.objs):
                    if not isinstance(x, Series):
                        raise TypeError("Cannot concatenate type 'Series' "
                                        "with object of type {type!r}"
                                        .format(type=type(x).__name__))
                    if x.name is not None:
                        names[i] = x.name
                        has_names = True
                    else:
                        names[i] = num
                        num += 1
                if has_names:
                    return Index(names)
                else:
                    return ibase.default_index(len(self.objs))
            else:
                return ensure_index(self.keys).set_names(self.names)
        else:
            indexes = [x._data.axes[self.axis] for x in self.objs]

        if self.ignore_index:
            idx = ibase.default_index(sum(len(i) for i in indexes))
            return idx

        if self.keys is None:
            concat_axis = _concat_indexes(indexes)
        else:
            concat_axis = _make_concat_multiindex(indexes, self.keys,
                                                  self.levels, self.names)

        self._maybe_check_integrity(concat_axis)

        return concat_axis

    def _maybe_check_integrity(self, concat_index):
        if self.verify_integrity:
            if not concat_index.is_unique:
                overlap = concat_index[concat_index.duplicated()].unique()
                raise ValueError('Indexes have overlapping values: '
                                 '{overlap!s}'.format(overlap=overlap))


def _concat_indexes(indexes):
    return indexes[0].append(indexes[1:])
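

# ``_concat_indexes`` simply appends the remaining indexes onto the first
# one; a hedged sketch of the equivalent public behaviour (assuming ``pd``
# is pandas):
#
#   >>> pd.Index(['a', 'b']).append([pd.Index(['c']), pd.Index(['d'])])
#   Index(['a', 'b', 'c', 'd'], dtype='object')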


def _make_concat_multiindex(indexes, keys, levels=None, names=None):

    if ((levels is None and isinstance(keys[0], tuple)) or
            (levels is not None and len(levels) > 1)):
        zipped = compat.lzip(*keys)
        if names is None:
            names = [None] * len(zipped)

        if levels is None:
            _, levels = _factorize_from_iterables(zipped)
        else:
            levels = [ensure_index(x) for x in levels]
    else:
        zipped = [keys]
        if names is None:
            names = [None]

        if levels is None:
            levels = [ensure_index(keys)]
        else:
            levels = [ensure_index(x) for x in levels]

    if not _all_indexes_same(indexes):
        codes_list = []

        # things are potentially different sizes, so compute the exact codes
        # for each level and pass those to MultiIndex.from_arrays
        for hlevel, level in zip(zipped, levels):
            to_concat = []
            for key, index in zip(hlevel, indexes):
                try:
                    i = level.get_loc(key)
                except KeyError:
                    raise ValueError('Key {key!s} not in level {level!s}'
                                     .format(key=key, level=level))

                to_concat.append(np.repeat(i, len(index)))
            codes_list.append(np.concatenate(to_concat))

        concat_index = _concat_indexes(indexes)

        # these go at the end
        if isinstance(concat_index, MultiIndex):
            levels.extend(concat_index.levels)
            codes_list.extend(concat_index.codes)
        else:
            codes, categories = _factorize_from_iterable(concat_index)
            levels.append(categories)
            codes_list.append(codes)

        if len(names) == len(levels):
            names = list(names)
        else:
            # make sure that all of the passed indices have the same nlevels
            if not len({idx.nlevels for idx in indexes}) == 1:
                raise AssertionError("Cannot concat indices that do"
                                     " not have the same number of levels")

            # also copies
            names = names + _get_consensus_names(indexes)

        return MultiIndex(levels=levels, codes=codes_list, names=names,
                          verify_integrity=False)

    new_index = indexes[0]
    n = len(new_index)
    kpieces = len(indexes)

    # also copies
    new_names = list(names)
    new_levels = list(levels)

    # construct codes
    new_codes = []

    # all indexes are identical, so the codes can be built cheaply with
    # repeat/tile rather than per-element lookups
    for hlevel, level in zip(zipped, levels):
        hlevel = ensure_index(hlevel)
        mapped = level.get_indexer(hlevel)

        mask = mapped == -1
        if mask.any():
            raise ValueError('Values not found in passed level: {hlevel!s}'
                             .format(hlevel=hlevel[mask]))

        new_codes.append(np.repeat(mapped, n))

    if isinstance(new_index, MultiIndex):
        new_levels.extend(new_index.levels)
        new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
    else:
        new_levels.append(new_index)
        new_codes.append(np.tile(np.arange(n), kpieces))

    if len(new_names) < len(new_levels):
        new_names.extend(new_index.names)

    return MultiIndex(levels=new_levels, codes=new_codes, names=new_names,
                      verify_integrity=False)
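

# A hedged end-to-end sketch of ``_make_concat_multiindex`` via the public
# API (illustrative, assuming ``pd`` is pandas): passing ``keys`` yields a
# two-level MultiIndex whose outer codes repeat once per piece and whose
# inner codes tile the shared index.
#
#   >>> s = pd.Series([1, 2], index=['i', 'j'])
#   >>> idx = pd.concat([s, s], keys=['x', 'y']).index
#   >>> list(idx)
#   [('x', 'i'), ('x', 'j'), ('y', 'i'), ('y', 'j')]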