test_mutate_columns.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function
  3. import numpy as np
  4. import pytest
  5. from pandas.compat import PY36, lrange, range
  6. from pandas import DataFrame, Index, MultiIndex, Series
  7. from pandas.tests.frame.common import TestData
  8. import pandas.util.testing as tm
  9. from pandas.util.testing import assert_frame_equal
  10. # Column add, remove, delete.
  11. class TestDataFrameMutateColumns(TestData):
  12. def test_assign(self):
  13. df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
  14. original = df.copy()
  15. result = df.assign(C=df.B / df.A)
  16. expected = df.copy()
  17. expected['C'] = [4, 2.5, 2]
  18. assert_frame_equal(result, expected)
  19. # lambda syntax
  20. result = df.assign(C=lambda x: x.B / x.A)
  21. assert_frame_equal(result, expected)
  22. # original is unmodified
  23. assert_frame_equal(df, original)
  24. # Non-Series array-like
  25. result = df.assign(C=[4, 2.5, 2])
  26. assert_frame_equal(result, expected)
  27. # original is unmodified
  28. assert_frame_equal(df, original)
  29. result = df.assign(B=df.B / df.A)
  30. expected = expected.drop('B', axis=1).rename(columns={'C': 'B'})
  31. assert_frame_equal(result, expected)
  32. # overwrite
  33. result = df.assign(A=df.A + df.B)
  34. expected = df.copy()
  35. expected['A'] = [5, 7, 9]
  36. assert_frame_equal(result, expected)
  37. # lambda
  38. result = df.assign(A=lambda x: x.A + x.B)
  39. assert_frame_equal(result, expected)
  40. def test_assign_multiple(self):
  41. df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B'])
  42. result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
  43. expected = DataFrame([[1, 4, 7, 1, 4], [2, 5, 8, 2, 5],
  44. [3, 6, 9, 3, 6]], columns=list('ABCDE'))
  45. assert_frame_equal(result, expected)
  def test_assign_order(self):
      """Check the column order produced by ``assign`` keyword arguments."""
      # GH 9818
      base = DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
      alphabetical = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
                               columns=list('ABCD'))

      # Keywords passed out of alphabetical order: when PY36 is true the
      # call order (D, C) is expected, otherwise alphabetical order.
      out_of_order = base.assign(D=base.A + base.B, C=base.A - base.B)
      if PY36:
          wanted = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]],
                             columns=list('ABDC'))
      else:
          wanted = alphabetical
      assert_frame_equal(out_of_order, wanted)

      # Keywords already in alphabetical order give the same layout on
      # both branches.
      in_order = base.assign(C=base.A - base.B, D=base.A + base.B)
      assert_frame_equal(in_order, alphabetical)
  def test_assign_bad(self):
      """Invalid ``assign`` calls raise the expected exception types."""
      frame = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

      # non-keyword argument
      with pytest.raises(TypeError):
          frame.assign(lambda x: x.A)

      # ``frame.C`` is evaluated on the original frame, which only has
      # columns A and B.
      with pytest.raises(AttributeError):
          frame.assign(C=frame.A, D=frame.A + frame.C)
  @pytest.mark.skipif(
      PY36, reason="Issue #14207: valid for python 3.6 and above")
  def test_assign_dependent_old_python(self):
      """Where PY36 is false, ``assign`` callables cannot see sibling columns.

      Skipped when ``PY36`` is true. The skip reason is a single-line
      string so that it is reported without an embedded newline and
      continuation whitespace.
      """
      df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

      # Key C does not exist at definition time of df
      with pytest.raises(KeyError):
          df.assign(C=lambda df: df.A,
                    D=lambda df: df['A'] + df['C'])
      with pytest.raises(KeyError):
          df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
  @pytest.mark.skipif(
      not PY36, reason="Issue #14207: not valid for python 3.5 and below")
  def test_assign_dependent(self):
      """Where PY36 is true, a callable can read a column added earlier.

      Skipped when ``PY36`` is false. The skip reason is a single-line
      string so that it is reported without an embedded newline and
      continuation whitespace.
      """
      df = DataFrame({'A': [1, 2], 'B': [3, 4]})
      # Both spellings below must produce this same frame.
      expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
                           columns=list('ABCD'))

      # C given as a Series, D as a callable that reads C.
      result = df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
      assert_frame_equal(result, expected)

      # Both C and D given as callables.
      result = df.assign(C=lambda df: df.A,
                         D=lambda df: df['A'] + df['C'])
      assert_frame_equal(result, expected)
  def test_insert_error_msmgs(self):
      """Column assignment from a mismatched frame raises a clear error."""
      # GH 7432
      target = DataFrame({'foo': ['a', 'b', 'c'],
                          'bar': [1, 2, 3],
                          'baz': ['d', 'e', 'f']}).set_index('foo')
      # The assigned frame's index holds the label 'a' twice.
      dup_indexed = DataFrame({'foo': ['a', 'b', 'c', 'a'],
                               'fiz': ['g', 'h', 'i', 'j']}).set_index('foo')
      with pytest.raises(ValueError,
                         match='cannot reindex from a duplicate axis'):
          target['newcol'] = dup_indexed

      # GH 4107, more descriptive error message
      ints = DataFrame(np.random.randint(0, 2, (4, 4)),
                       columns=['a', 'b', 'c', 'd'])
      with pytest.raises(
              TypeError,
              match='incompatible index of inserted column with frame index'):
          ints['gr'] = ints.groupby(['b', 'c']).count()
  def test_insert_benchmark(self):
      """Adding columns one by one equals building the frame in one step."""
      # from the vb_suite/frame_methods/frame_insert_columns
      n_rows, n_cols = 10, 5

      built = DataFrame(index=lrange(n_rows))
      values = np.random.randn(n_rows)
      # Every column receives the same array of values.
      for label in range(n_cols):
          built[label] = values

      # Repeating each value n_cols times and reshaping gives one row per
      # value with n_cols identical entries.
      grid = np.repeat(values, n_cols).reshape(n_rows, n_cols)
      wanted = DataFrame(grid, index=lrange(n_rows))
      assert_frame_equal(built, wanted)
  def test_insert(self):
      """Exercise ``DataFrame.insert`` and dtype tracking on assignment.

      Covers insertion at a given position, dtype counts after adding or
      replacing columns with other dtypes, rejection of labels that are
      already present, retention of ``columns.name`` and GH 13522.
      """
      df = DataFrame(np.random.randn(5, 3), index=np.arange(5),
                     columns=['c', 'b', 'a'])

      df.insert(0, 'foo', df['a'])
      tm.assert_index_equal(df.columns, Index(['foo', 'c', 'b', 'a']))
      tm.assert_series_equal(df['a'], df['foo'], check_names=False)

      df.insert(2, 'bar', df['c'])
      tm.assert_index_equal(df.columns,
                            Index(['foo', 'c', 'bar', 'b', 'a']))
      tm.assert_almost_equal(df['c'], df['bar'], check_names=False)

      # diff dtype

      # new item
      df['x'] = df['a'].astype('float32')
      result = Series(dict(float32=1, float64=5))
      assert (df.get_dtype_counts().sort_index() == result).all()

      # replacing current (in different block)
      df['a'] = df['a'].astype('float32')
      result = Series(dict(float32=2, float64=4))
      assert (df.get_dtype_counts().sort_index() == result).all()

      df['y'] = df['a'].astype('int32')
      result = Series(dict(float32=2, float64=4, int32=1))
      assert (df.get_dtype_counts().sort_index() == result).all()

      # Both labels are already columns; each insert must be rejected with
      # the same message, checked the same way.
      msg = 'already exists'
      with pytest.raises(ValueError, match=msg):
          df.insert(1, 'a', df['b'])
      with pytest.raises(ValueError, match=msg):
          df.insert(1, 'c', df['b'])

      df.columns.name = 'some_name'
      # preserve columns name field
      df.insert(0, 'baz', df['c'])
      assert df.columns.name == 'some_name'

      # GH 13522
      df = DataFrame(index=['A', 'B', 'C'])
      df['X'] = df.index
      df['X'] = ['x', 'y', 'z']
      exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C'])
      assert_frame_equal(df, exp)
  152. def test_delitem(self):
  153. del self.frame['A']
  154. assert 'A' not in self.frame
  def test_delitem_multiindex(self):
      """Deleting a top-level key drops all of its MultiIndex columns."""
      columns = MultiIndex.from_product([['A', 'B'], [1, 2]])
      frame = DataFrame(np.random.randn(4, 4), columns=columns)
      assert len(frame.columns) == 4
      assert ('A', ) in frame.columns
      assert 'A' in frame.columns

      # Selecting the top-level key yields a frame of its sub-columns.
      selected = frame['A']
      assert isinstance(selected, DataFrame)
      del frame['A']

      assert len(frame.columns) == 2

      # A still in the levels, BUT get a KeyError if trying
      # to delete
      assert ('A', ) not in frame.columns
      with pytest.raises(KeyError):
          del frame[('A',)]

      # behavior of dropped/deleted MultiIndex levels changed from
      # GH 2770 to GH 19027: MultiIndex no longer '.__contains__'
      # levels which are dropped/deleted
      assert 'A' not in frame.columns
      with pytest.raises(KeyError):
          del frame['A']
  176. def test_pop(self):
  177. self.frame.columns.name = 'baz'
  178. self.frame.pop('A')
  179. assert 'A' not in self.frame
  180. self.frame['foo'] = 'bar'
  181. self.frame.pop('foo')
  182. assert 'foo' not in self.frame
  183. assert self.frame.columns.name == 'baz'
  184. # gh-10912: inplace ops cause caching issue
  185. a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[
  186. 'A', 'B', 'C'], index=['X', 'Y'])
  187. b = a.pop('B')
  188. b += 1
  189. # original frame
  190. expected = DataFrame([[1, 3], [4, 6]], columns=[
  191. 'A', 'C'], index=['X', 'Y'])
  192. tm.assert_frame_equal(a, expected)
  193. # result
  194. expected = Series([2, 5], index=['X', 'Y'], name='B') + 1
  195. tm.assert_series_equal(b, expected)
  196. def test_pop_non_unique_cols(self):
  197. df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]})
  198. df.columns = ["a", "b", "a"]
  199. res = df.pop("a")
  200. assert type(res) == DataFrame
  201. assert len(res) == 2
  202. assert len(df.columns) == 1
  203. assert "b" in df.columns
  204. assert "a" not in df.columns
  205. assert len(df.index) == 2
  206. def test_insert_column_bug_4032(self):
  207. # GH4032, inserting a column and renaming causing errors
  208. df = DataFrame({'b': [1.1, 2.2]})
  209. df = df.rename(columns={})
  210. df.insert(0, 'a', [1, 2])
  211. result = df.rename(columns={})
  212. str(result)
  213. expected = DataFrame([[1, 1.1], [2, 2.2]], columns=['a', 'b'])
  214. assert_frame_equal(result, expected)
  215. df.insert(0, 'c', [1.3, 2.3])
  216. result = df.rename(columns={})
  217. str(result)
  218. expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]],
  219. columns=['c', 'a', 'b'])
  220. assert_frame_equal(result, expected)