# test_join.py (5.6 KB)
# -*- coding: utf-8 -*-
import numpy as np
import pytest

from pandas import DataFrame, Index, period_range
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
  7. @pytest.fixture
  8. def frame_with_period_index():
  9. return DataFrame(
  10. data=np.arange(20).reshape(4, 5),
  11. columns=list('abcde'),
  12. index=period_range(start='2000', freq='A', periods=4))
  13. @pytest.fixture
  14. def frame():
  15. return TestData().frame
  16. @pytest.fixture
  17. def left():
  18. return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0])
  19. @pytest.fixture
  20. def right():
  21. return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2])
  22. @pytest.mark.parametrize(
  23. "how, sort, expected",
  24. [('inner', False, DataFrame({'a': [20, 10],
  25. 'b': [200, 100]},
  26. index=[2, 1])),
  27. ('inner', True, DataFrame({'a': [10, 20],
  28. 'b': [100, 200]},
  29. index=[1, 2])),
  30. ('left', False, DataFrame({'a': [20, 10, 0],
  31. 'b': [200, 100, np.nan]},
  32. index=[2, 1, 0])),
  33. ('left', True, DataFrame({'a': [0, 10, 20],
  34. 'b': [np.nan, 100, 200]},
  35. index=[0, 1, 2])),
  36. ('right', False, DataFrame({'a': [np.nan, 10, 20],
  37. 'b': [300, 100, 200]},
  38. index=[3, 1, 2])),
  39. ('right', True, DataFrame({'a': [10, 20, np.nan],
  40. 'b': [100, 200, 300]},
  41. index=[1, 2, 3])),
  42. ('outer', False, DataFrame({'a': [0, 10, 20, np.nan],
  43. 'b': [np.nan, 100, 200, 300]},
  44. index=[0, 1, 2, 3])),
  45. ('outer', True, DataFrame({'a': [0, 10, 20, np.nan],
  46. 'b': [np.nan, 100, 200, 300]},
  47. index=[0, 1, 2, 3]))])
  48. def test_join(left, right, how, sort, expected):
  49. result = left.join(right, how=how, sort=sort)
  50. tm.assert_frame_equal(result, expected)
  51. def test_join_index(frame):
  52. # left / right
  53. f = frame.loc[frame.index[:10], ['A', 'B']]
  54. f2 = frame.loc[frame.index[5:], ['C', 'D']].iloc[::-1]
  55. joined = f.join(f2)
  56. tm.assert_index_equal(f.index, joined.index)
  57. expected_columns = Index(['A', 'B', 'C', 'D'])
  58. tm.assert_index_equal(joined.columns, expected_columns)
  59. joined = f.join(f2, how='left')
  60. tm.assert_index_equal(joined.index, f.index)
  61. tm.assert_index_equal(joined.columns, expected_columns)
  62. joined = f.join(f2, how='right')
  63. tm.assert_index_equal(joined.index, f2.index)
  64. tm.assert_index_equal(joined.columns, expected_columns)
  65. # inner
  66. joined = f.join(f2, how='inner')
  67. tm.assert_index_equal(joined.index, f.index[5:10])
  68. tm.assert_index_equal(joined.columns, expected_columns)
  69. # outer
  70. joined = f.join(f2, how='outer')
  71. tm.assert_index_equal(joined.index, frame.index.sort_values())
  72. tm.assert_index_equal(joined.columns, expected_columns)
  73. with pytest.raises(ValueError, match='join method'):
  74. f.join(f2, how='foo')
  75. # corner case - overlapping columns
  76. msg = 'columns overlap but no suffix'
  77. for how in ('outer', 'left', 'inner'):
  78. with pytest.raises(ValueError, match=msg):
  79. frame.join(frame, how=how)
  80. def test_join_index_more(frame):
  81. af = frame.loc[:, ['A', 'B']]
  82. bf = frame.loc[::2, ['C', 'D']]
  83. expected = af.copy()
  84. expected['C'] = frame['C'][::2]
  85. expected['D'] = frame['D'][::2]
  86. result = af.join(bf)
  87. tm.assert_frame_equal(result, expected)
  88. result = af.join(bf, how='right')
  89. tm.assert_frame_equal(result, expected[::2])
  90. result = bf.join(af, how='right')
  91. tm.assert_frame_equal(result, expected.loc[:, result.columns])
  92. def test_join_index_series(frame):
  93. df = frame.copy()
  94. s = df.pop(frame.columns[-1])
  95. joined = df.join(s)
  96. # TODO should this check_names ?
  97. tm.assert_frame_equal(joined, frame, check_names=False)
  98. s.name = None
  99. with pytest.raises(ValueError, match='must have a name'):
  100. df.join(s)
  101. def test_join_overlap(frame):
  102. df1 = frame.loc[:, ['A', 'B', 'C']]
  103. df2 = frame.loc[:, ['B', 'C', 'D']]
  104. joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2')
  105. df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1')
  106. df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2')
  107. no_overlap = frame.loc[:, ['A', 'D']]
  108. expected = df1_suf.join(df2_suf).join(no_overlap)
  109. # column order not necessarily sorted
  110. tm.assert_frame_equal(joined, expected.loc[:, joined.columns])
  111. def test_join_period_index(frame_with_period_index):
  112. other = frame_with_period_index.rename(
  113. columns=lambda x: '{key}{key}'.format(key=x))
  114. joined_values = np.concatenate(
  115. [frame_with_period_index.values] * 2, axis=1)
  116. joined_cols = frame_with_period_index.columns.append(other.columns)
  117. joined = frame_with_period_index.join(other)
  118. expected = DataFrame(
  119. data=joined_values,
  120. columns=joined_cols,
  121. index=frame_with_period_index.index)
  122. tm.assert_frame_equal(joined, expected)
  123. def test_join_left_sequence_non_unique_index():
  124. # https://github.com/pandas-dev/pandas/issues/19607
  125. df1 = DataFrame({'a': [0, 10, 20]}, index=[1, 2, 3])
  126. df2 = DataFrame({'b': [100, 200, 300]}, index=[4, 3, 2])
  127. df3 = DataFrame({'c': [400, 500, 600]}, index=[2, 2, 4])
  128. joined = df1.join([df2, df3], how='left')
  129. expected = DataFrame({
  130. 'a': [0, 10, 10, 20],
  131. 'b': [np.nan, 300, 300, 200],
  132. 'c': [np.nan, 400, 500, np.nan]
  133. }, index=[1, 2, 2, 3])
  134. tm.assert_frame_equal(joined, expected)