# reshaping.py (10 KB)

  1. import itertools
  2. import numpy as np
  3. import pytest
  4. import pandas as pd
  5. from pandas.core.internals import ExtensionBlock
  6. from .base import BaseExtensionTests
  7. class BaseReshapingTests(BaseExtensionTests):
  8. """Tests for reshaping and concatenation."""
  9. @pytest.mark.parametrize('in_frame', [True, False])
  10. def test_concat(self, data, in_frame):
  11. wrapped = pd.Series(data)
  12. if in_frame:
  13. wrapped = pd.DataFrame(wrapped)
  14. result = pd.concat([wrapped, wrapped], ignore_index=True)
  15. assert len(result) == len(data) * 2
  16. if in_frame:
  17. dtype = result.dtypes[0]
  18. else:
  19. dtype = result.dtype
  20. assert dtype == data.dtype
  21. assert isinstance(result._data.blocks[0], ExtensionBlock)
  22. @pytest.mark.parametrize('in_frame', [True, False])
  23. def test_concat_all_na_block(self, data_missing, in_frame):
  24. valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1])
  25. na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3])
  26. if in_frame:
  27. valid_block = pd.DataFrame({"a": valid_block})
  28. na_block = pd.DataFrame({"a": na_block})
  29. result = pd.concat([valid_block, na_block])
  30. if in_frame:
  31. expected = pd.DataFrame({"a": data_missing.take([1, 1, 0, 0])})
  32. self.assert_frame_equal(result, expected)
  33. else:
  34. expected = pd.Series(data_missing.take([1, 1, 0, 0]))
  35. self.assert_series_equal(result, expected)
  36. def test_concat_mixed_dtypes(self, data):
  37. # https://github.com/pandas-dev/pandas/issues/20762
  38. df1 = pd.DataFrame({'A': data[:3]})
  39. df2 = pd.DataFrame({"A": [1, 2, 3]})
  40. df3 = pd.DataFrame({"A": ['a', 'b', 'c']}).astype('category')
  41. dfs = [df1, df2, df3]
  42. # dataframes
  43. result = pd.concat(dfs)
  44. expected = pd.concat([x.astype(object) for x in dfs])
  45. self.assert_frame_equal(result, expected)
  46. # series
  47. result = pd.concat([x['A'] for x in dfs])
  48. expected = pd.concat([x['A'].astype(object) for x in dfs])
  49. self.assert_series_equal(result, expected)
  50. # simple test for just EA and one other
  51. result = pd.concat([df1, df2])
  52. expected = pd.concat([df1.astype('object'), df2.astype('object')])
  53. self.assert_frame_equal(result, expected)
  54. result = pd.concat([df1['A'], df2['A']])
  55. expected = pd.concat([df1['A'].astype('object'),
  56. df2['A'].astype('object')])
  57. self.assert_series_equal(result, expected)
  58. def test_concat_columns(self, data, na_value):
  59. df1 = pd.DataFrame({'A': data[:3]})
  60. df2 = pd.DataFrame({'B': [1, 2, 3]})
  61. expected = pd.DataFrame({'A': data[:3], 'B': [1, 2, 3]})
  62. result = pd.concat([df1, df2], axis=1)
  63. self.assert_frame_equal(result, expected)
  64. result = pd.concat([df1['A'], df2['B']], axis=1)
  65. self.assert_frame_equal(result, expected)
  66. # non-aligned
  67. df2 = pd.DataFrame({'B': [1, 2, 3]}, index=[1, 2, 3])
  68. expected = pd.DataFrame({
  69. 'A': data._from_sequence(list(data[:3]) + [na_value],
  70. dtype=data.dtype),
  71. 'B': [np.nan, 1, 2, 3]})
  72. result = pd.concat([df1, df2], axis=1)
  73. self.assert_frame_equal(result, expected)
  74. result = pd.concat([df1['A'], df2['B']], axis=1)
  75. self.assert_frame_equal(result, expected)
  76. def test_align(self, data, na_value):
  77. a = data[:3]
  78. b = data[2:5]
  79. r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3]))
  80. # Assumes that the ctor can take a list of scalars of the type
  81. e1 = pd.Series(data._from_sequence(list(a) + [na_value],
  82. dtype=data.dtype))
  83. e2 = pd.Series(data._from_sequence([na_value] + list(b),
  84. dtype=data.dtype))
  85. self.assert_series_equal(r1, e1)
  86. self.assert_series_equal(r2, e2)
  87. def test_align_frame(self, data, na_value):
  88. a = data[:3]
  89. b = data[2:5]
  90. r1, r2 = pd.DataFrame({'A': a}).align(
  91. pd.DataFrame({'A': b}, index=[1, 2, 3])
  92. )
  93. # Assumes that the ctor can take a list of scalars of the type
  94. e1 = pd.DataFrame({'A': data._from_sequence(list(a) + [na_value],
  95. dtype=data.dtype)})
  96. e2 = pd.DataFrame({'A': data._from_sequence([na_value] + list(b),
  97. dtype=data.dtype)})
  98. self.assert_frame_equal(r1, e1)
  99. self.assert_frame_equal(r2, e2)
  100. def test_align_series_frame(self, data, na_value):
  101. # https://github.com/pandas-dev/pandas/issues/20576
  102. ser = pd.Series(data, name='a')
  103. df = pd.DataFrame({"col": np.arange(len(ser) + 1)})
  104. r1, r2 = ser.align(df)
  105. e1 = pd.Series(data._from_sequence(list(data) + [na_value],
  106. dtype=data.dtype),
  107. name=ser.name)
  108. self.assert_series_equal(r1, e1)
  109. self.assert_frame_equal(r2, df)
  110. def test_set_frame_expand_regular_with_extension(self, data):
  111. df = pd.DataFrame({"A": [1] * len(data)})
  112. df['B'] = data
  113. expected = pd.DataFrame({"A": [1] * len(data), "B": data})
  114. self.assert_frame_equal(df, expected)
  115. def test_set_frame_expand_extension_with_regular(self, data):
  116. df = pd.DataFrame({'A': data})
  117. df['B'] = [1] * len(data)
  118. expected = pd.DataFrame({"A": data, "B": [1] * len(data)})
  119. self.assert_frame_equal(df, expected)
  120. def test_set_frame_overwrite_object(self, data):
  121. # https://github.com/pandas-dev/pandas/issues/20555
  122. df = pd.DataFrame({"A": [1] * len(data)}, dtype=object)
  123. df['A'] = data
  124. assert df.dtypes['A'] == data.dtype
  125. def test_merge(self, data, na_value):
  126. # GH-20743
  127. df1 = pd.DataFrame({'ext': data[:3], 'int1': [1, 2, 3],
  128. 'key': [0, 1, 2]})
  129. df2 = pd.DataFrame({'int2': [1, 2, 3, 4], 'key': [0, 0, 1, 3]})
  130. res = pd.merge(df1, df2)
  131. exp = pd.DataFrame(
  132. {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1],
  133. 'ext': data._from_sequence([data[0], data[0], data[1]],
  134. dtype=data.dtype)})
  135. self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])
  136. res = pd.merge(df1, df2, how='outer')
  137. exp = pd.DataFrame(
  138. {'int1': [1, 1, 2, 3, np.nan], 'int2': [1, 2, 3, np.nan, 4],
  139. 'key': [0, 0, 1, 2, 3],
  140. 'ext': data._from_sequence(
  141. [data[0], data[0], data[1], data[2], na_value],
  142. dtype=data.dtype)})
  143. self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])
  144. def test_merge_on_extension_array(self, data):
  145. # GH 23020
  146. a, b = data[:2]
  147. key = type(data)._from_sequence([a, b], dtype=data.dtype)
  148. df = pd.DataFrame({"key": key, "val": [1, 2]})
  149. result = pd.merge(df, df, on='key')
  150. expected = pd.DataFrame({"key": key,
  151. "val_x": [1, 2],
  152. "val_y": [1, 2]})
  153. self.assert_frame_equal(result, expected)
  154. # order
  155. result = pd.merge(df.iloc[[1, 0]], df, on='key')
  156. expected = expected.iloc[[1, 0]].reset_index(drop=True)
  157. self.assert_frame_equal(result, expected)
  158. def test_merge_on_extension_array_duplicates(self, data):
  159. # GH 23020
  160. a, b = data[:2]
  161. key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
  162. df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
  163. df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
  164. result = pd.merge(df1, df2, on='key')
  165. expected = pd.DataFrame({
  166. "key": key.take([0, 0, 0, 0, 1]),
  167. "val_x": [1, 1, 3, 3, 2],
  168. "val_y": [1, 3, 1, 3, 2],
  169. })
  170. self.assert_frame_equal(result, expected)
  171. @pytest.mark.parametrize("columns", [
  172. ["A", "B"],
  173. pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b')],
  174. names=['outer', 'inner']),
  175. ])
  176. def test_stack(self, data, columns):
  177. df = pd.DataFrame({"A": data[:5], "B": data[:5]})
  178. df.columns = columns
  179. result = df.stack()
  180. expected = df.astype(object).stack()
  181. # we need a second astype(object), in case the constructor inferred
  182. # object -> specialized, as is done for period.
  183. expected = expected.astype(object)
  184. if isinstance(expected, pd.Series):
  185. assert result.dtype == df.iloc[:, 0].dtype
  186. else:
  187. assert all(result.dtypes == df.iloc[:, 0].dtype)
  188. result = result.astype(object)
  189. self.assert_equal(result, expected)
  190. @pytest.mark.parametrize("index", [
  191. # Two levels, uniform.
  192. pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']]),
  193. names=['a', 'b']),
  194. # non-uniform
  195. pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('B', 'b')]),
  196. # three levels, non-uniform
  197. pd.MultiIndex.from_product([('A', 'B'), ('a', 'b', 'c'), (0, 1, 2)]),
  198. pd.MultiIndex.from_tuples([
  199. ('A', 'a', 1),
  200. ('A', 'b', 0),
  201. ('A', 'a', 0),
  202. ('B', 'a', 0),
  203. ('B', 'c', 1),
  204. ]),
  205. ])
  206. @pytest.mark.parametrize("obj", ["series", "frame"])
  207. def test_unstack(self, data, index, obj):
  208. data = data[:len(index)]
  209. if obj == "series":
  210. ser = pd.Series(data, index=index)
  211. else:
  212. ser = pd.DataFrame({"A": data, "B": data}, index=index)
  213. n = index.nlevels
  214. levels = list(range(n))
  215. # [0, 1, 2]
  216. # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
  217. combinations = itertools.chain.from_iterable(
  218. itertools.permutations(levels, i) for i in range(1, n)
  219. )
  220. for level in combinations:
  221. result = ser.unstack(level=level)
  222. assert all(isinstance(result[col].array, type(data))
  223. for col in result.columns)
  224. expected = ser.astype(object).unstack(level=level)
  225. result = result.astype(object)
  226. self.assert_frame_equal(result, expected)