# test_join.py 8.4 KB
# -*- coding: utf-8 -*-
import numpy as np

from pandas._libs import join as _join
from pandas import Categorical, DataFrame, Index, merge
import pandas.util.testing as tm
from pandas.util.testing import assert_almost_equal, assert_frame_equal
  7. class TestIndexer(object):
  8. def test_outer_join_indexer(self):
  9. typemap = [('int32', _join.outer_join_indexer_int32),
  10. ('int64', _join.outer_join_indexer_int64),
  11. ('float32', _join.outer_join_indexer_float32),
  12. ('float64', _join.outer_join_indexer_float64),
  13. ('object', _join.outer_join_indexer_object)]
  14. for dtype, indexer in typemap:
  15. left = np.arange(3, dtype=dtype)
  16. right = np.arange(2, 5, dtype=dtype)
  17. empty = np.array([], dtype=dtype)
  18. result, lindexer, rindexer = indexer(left, right)
  19. assert isinstance(result, np.ndarray)
  20. assert isinstance(lindexer, np.ndarray)
  21. assert isinstance(rindexer, np.ndarray)
  22. tm.assert_numpy_array_equal(result, np.arange(5, dtype=dtype))
  23. exp = np.array([0, 1, 2, -1, -1], dtype=np.int64)
  24. tm.assert_numpy_array_equal(lindexer, exp)
  25. exp = np.array([-1, -1, 0, 1, 2], dtype=np.int64)
  26. tm.assert_numpy_array_equal(rindexer, exp)
  27. result, lindexer, rindexer = indexer(empty, right)
  28. tm.assert_numpy_array_equal(result, right)
  29. exp = np.array([-1, -1, -1], dtype=np.int64)
  30. tm.assert_numpy_array_equal(lindexer, exp)
  31. exp = np.array([0, 1, 2], dtype=np.int64)
  32. tm.assert_numpy_array_equal(rindexer, exp)
  33. result, lindexer, rindexer = indexer(left, empty)
  34. tm.assert_numpy_array_equal(result, left)
  35. exp = np.array([0, 1, 2], dtype=np.int64)
  36. tm.assert_numpy_array_equal(lindexer, exp)
  37. exp = np.array([-1, -1, -1], dtype=np.int64)
  38. tm.assert_numpy_array_equal(rindexer, exp)
  39. def test_left_join_indexer_unique():
  40. a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
  41. b = np.array([2, 2, 3, 4, 4], dtype=np.int64)
  42. result = _join.left_join_indexer_unique_int64(b, a)
  43. expected = np.array([1, 1, 2, 3, 3], dtype=np.int64)
  44. tm.assert_numpy_array_equal(result, expected)
  45. def test_left_outer_join_bug():
  46. left = np.array([0, 1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3,
  47. 2, 1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1,
  48. 3, 0, 0, 1, 0, 3, 1, 0, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0,
  49. 3, 1, 2, 0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3,
  50. 2, 3, 3, 1, 3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0,
  51. 3, 1, 2, 0, 2], dtype=np.int64)
  52. right = np.array([3, 1], dtype=np.int64)
  53. max_groups = 4
  54. lidx, ridx = _join.left_outer_join(left, right, max_groups, sort=False)
  55. exp_lidx = np.arange(len(left), dtype=np.int64)
  56. exp_ridx = -np.ones(len(left), dtype=np.int64)
  57. exp_ridx[left == 1] = 1
  58. exp_ridx[left == 3] = 0
  59. tm.assert_numpy_array_equal(lidx, exp_lidx)
  60. tm.assert_numpy_array_equal(ridx, exp_ridx)
  61. def test_inner_join_indexer():
  62. a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
  63. b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
  64. index, ares, bres = _join.inner_join_indexer_int64(a, b)
  65. index_exp = np.array([3, 5], dtype=np.int64)
  66. assert_almost_equal(index, index_exp)
  67. aexp = np.array([2, 4], dtype=np.int64)
  68. bexp = np.array([1, 2], dtype=np.int64)
  69. assert_almost_equal(ares, aexp)
  70. assert_almost_equal(bres, bexp)
  71. a = np.array([5], dtype=np.int64)
  72. b = np.array([5], dtype=np.int64)
  73. index, ares, bres = _join.inner_join_indexer_int64(a, b)
  74. tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
  75. tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64))
  76. tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64))
  77. def test_outer_join_indexer():
  78. a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
  79. b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
  80. index, ares, bres = _join.outer_join_indexer_int64(a, b)
  81. index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64)
  82. assert_almost_equal(index, index_exp)
  83. aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.int64)
  84. bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.int64)
  85. assert_almost_equal(ares, aexp)
  86. assert_almost_equal(bres, bexp)
  87. a = np.array([5], dtype=np.int64)
  88. b = np.array([5], dtype=np.int64)
  89. index, ares, bres = _join.outer_join_indexer_int64(a, b)
  90. tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
  91. tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64))
  92. tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64))
  93. def test_left_join_indexer():
  94. a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
  95. b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
  96. index, ares, bres = _join.left_join_indexer_int64(a, b)
  97. assert_almost_equal(index, a)
  98. aexp = np.array([0, 1, 2, 3, 4], dtype=np.int64)
  99. bexp = np.array([-1, -1, 1, -1, 2], dtype=np.int64)
  100. assert_almost_equal(ares, aexp)
  101. assert_almost_equal(bres, bexp)
  102. a = np.array([5], dtype=np.int64)
  103. b = np.array([5], dtype=np.int64)
  104. index, ares, bres = _join.left_join_indexer_int64(a, b)
  105. tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
  106. tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64))
  107. tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64))
  108. def test_left_join_indexer2():
  109. idx = Index([1, 1, 2, 5])
  110. idx2 = Index([1, 2, 5, 7, 9])
  111. res, lidx, ridx = _join.left_join_indexer_int64(idx2.values, idx.values)
  112. exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64)
  113. assert_almost_equal(res, exp_res)
  114. exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64)
  115. assert_almost_equal(lidx, exp_lidx)
  116. exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64)
  117. assert_almost_equal(ridx, exp_ridx)
  118. def test_outer_join_indexer2():
  119. idx = Index([1, 1, 2, 5])
  120. idx2 = Index([1, 2, 5, 7, 9])
  121. res, lidx, ridx = _join.outer_join_indexer_int64(idx2.values, idx.values)
  122. exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64)
  123. assert_almost_equal(res, exp_res)
  124. exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64)
  125. assert_almost_equal(lidx, exp_lidx)
  126. exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64)
  127. assert_almost_equal(ridx, exp_ridx)
  128. def test_inner_join_indexer2():
  129. idx = Index([1, 1, 2, 5])
  130. idx2 = Index([1, 2, 5, 7, 9])
  131. res, lidx, ridx = _join.inner_join_indexer_int64(idx2.values, idx.values)
  132. exp_res = np.array([1, 1, 2, 5], dtype=np.int64)
  133. assert_almost_equal(res, exp_res)
  134. exp_lidx = np.array([0, 0, 1, 2], dtype=np.int64)
  135. assert_almost_equal(lidx, exp_lidx)
  136. exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64)
  137. assert_almost_equal(ridx, exp_ridx)
  138. def test_merge_join_categorical_multiindex():
  139. # From issue 16627
  140. a = {'Cat1': Categorical(['a', 'b', 'a', 'c', 'a', 'b'],
  141. ['a', 'b', 'c']),
  142. 'Int1': [0, 1, 0, 1, 0, 0]}
  143. a = DataFrame(a)
  144. b = {'Cat': Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
  145. ['a', 'b', 'c']),
  146. 'Int': [0, 0, 0, 1, 1, 1],
  147. 'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]}
  148. b = DataFrame(b).set_index(['Cat', 'Int'])['Factor']
  149. expected = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'],
  150. right_on=['Cat', 'Int'], how='left')
  151. result = a.join(b, on=['Cat1', 'Int1'])
  152. expected = expected.drop(['Cat', 'Int'], axis=1)
  153. assert_frame_equal(expected, result)
  154. # Same test, but with ordered categorical
  155. a = {'Cat1': Categorical(['a', 'b', 'a', 'c', 'a', 'b'],
  156. ['b', 'a', 'c'],
  157. ordered=True),
  158. 'Int1': [0, 1, 0, 1, 0, 0]}
  159. a = DataFrame(a)
  160. b = {'Cat': Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
  161. ['b', 'a', 'c'],
  162. ordered=True),
  163. 'Int': [0, 0, 0, 1, 1, 1],
  164. 'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]}
  165. b = DataFrame(b).set_index(['Cat', 'Int'])['Factor']
  166. expected = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'],
  167. right_on=['Cat', 'Int'], how='left')
  168. result = a.join(b, on=['Cat1', 'Int1'])
  169. expected = expected.drop(['Cat', 'Int'], axis=1)
  170. assert_frame_equal(expected, result)