test_reshape.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621
  1. # -*- coding: utf-8 -*-
  2. # pylint: disable-msg=W0612,E1101
  3. from collections import OrderedDict
  4. import numpy as np
  5. from numpy import nan
  6. import pytest
  7. from pandas.compat import u
  8. from pandas.core.dtypes.common import is_integer_dtype
  9. import pandas as pd
  10. from pandas import Categorical, DataFrame, Index, Series, get_dummies
  11. from pandas.core.sparse.api import SparseArray, SparseDtype
  12. import pandas.util.testing as tm
  13. from pandas.util.testing import assert_frame_equal
  14. class TestGetDummies(object):
  15. @pytest.fixture
  16. def df(self):
  17. return DataFrame({'A': ['a', 'b', 'a'],
  18. 'B': ['b', 'b', 'c'],
  19. 'C': [1, 2, 3]})
  20. @pytest.fixture(params=['uint8', 'i8', np.float64, bool, None])
  21. def dtype(self, request):
  22. return np.dtype(request.param)
  23. @pytest.fixture(params=['dense', 'sparse'])
  24. def sparse(self, request):
  25. # params are strings to simplify reading test results,
  26. # e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True]
  27. return request.param == 'sparse'
  28. def effective_dtype(self, dtype):
  29. if dtype is None:
  30. return np.uint8
  31. return dtype
  32. def test_raises_on_dtype_object(self, df):
  33. with pytest.raises(ValueError):
  34. get_dummies(df, dtype='object')
  35. def test_basic(self, sparse, dtype):
  36. s_list = list('abc')
  37. s_series = Series(s_list)
  38. s_series_index = Series(s_list, list('ABC'))
  39. expected = DataFrame({'a': [1, 0, 0],
  40. 'b': [0, 1, 0],
  41. 'c': [0, 0, 1]},
  42. dtype=self.effective_dtype(dtype))
  43. if sparse:
  44. expected = expected.apply(pd.SparseArray, fill_value=0.0)
  45. result = get_dummies(s_list, sparse=sparse, dtype=dtype)
  46. assert_frame_equal(result, expected)
  47. result = get_dummies(s_series, sparse=sparse, dtype=dtype)
  48. assert_frame_equal(result, expected)
  49. expected.index = list('ABC')
  50. result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
  51. assert_frame_equal(result, expected)
  52. def test_basic_types(self, sparse, dtype):
  53. # GH 10531
  54. s_list = list('abc')
  55. s_series = Series(s_list)
  56. s_df = DataFrame({'a': [0, 1, 0, 1, 2],
  57. 'b': ['A', 'A', 'B', 'C', 'C'],
  58. 'c': [2, 3, 3, 3, 2]})
  59. expected = DataFrame({'a': [1, 0, 0],
  60. 'b': [0, 1, 0],
  61. 'c': [0, 0, 1]},
  62. dtype=self.effective_dtype(dtype),
  63. columns=list('abc'))
  64. if sparse:
  65. if is_integer_dtype(dtype):
  66. fill_value = 0
  67. elif dtype == bool:
  68. fill_value = False
  69. else:
  70. fill_value = 0.0
  71. expected = expected.apply(SparseArray, fill_value=fill_value)
  72. result = get_dummies(s_list, sparse=sparse, dtype=dtype)
  73. tm.assert_frame_equal(result, expected)
  74. result = get_dummies(s_series, sparse=sparse, dtype=dtype)
  75. tm.assert_frame_equal(result, expected)
  76. result = get_dummies(s_df, columns=s_df.columns,
  77. sparse=sparse, dtype=dtype)
  78. if sparse:
  79. dtype_name = 'Sparse[{}, {}]'.format(
  80. self.effective_dtype(dtype).name,
  81. fill_value
  82. )
  83. else:
  84. dtype_name = self.effective_dtype(dtype).name
  85. expected = Series({dtype_name: 8})
  86. tm.assert_series_equal(result.get_dtype_counts(), expected)
  87. result = get_dummies(s_df, columns=['a'], sparse=sparse, dtype=dtype)
  88. expected_counts = {'int64': 1, 'object': 1}
  89. expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0)
  90. expected = Series(expected_counts).sort_index()
  91. tm.assert_series_equal(result.get_dtype_counts().sort_index(),
  92. expected)
  93. def test_just_na(self, sparse):
  94. just_na_list = [np.nan]
  95. just_na_series = Series(just_na_list)
  96. just_na_series_index = Series(just_na_list, index=['A'])
  97. res_list = get_dummies(just_na_list, sparse=sparse)
  98. res_series = get_dummies(just_na_series, sparse=sparse)
  99. res_series_index = get_dummies(just_na_series_index, sparse=sparse)
  100. assert res_list.empty
  101. assert res_series.empty
  102. assert res_series_index.empty
  103. assert res_list.index.tolist() == [0]
  104. assert res_series.index.tolist() == [0]
  105. assert res_series_index.index.tolist() == ['A']
  106. def test_include_na(self, sparse, dtype):
  107. s = ['a', 'b', np.nan]
  108. res = get_dummies(s, sparse=sparse, dtype=dtype)
  109. exp = DataFrame({'a': [1, 0, 0],
  110. 'b': [0, 1, 0]},
  111. dtype=self.effective_dtype(dtype))
  112. if sparse:
  113. exp = exp.apply(pd.SparseArray, fill_value=0.0)
  114. assert_frame_equal(res, exp)
  115. # Sparse dataframes do not allow nan labelled columns, see #GH8822
  116. res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
  117. exp_na = DataFrame({nan: [0, 0, 1],
  118. 'a': [1, 0, 0],
  119. 'b': [0, 1, 0]},
  120. dtype=self.effective_dtype(dtype))
  121. exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
  122. # hack (NaN handling in assert_index_equal)
  123. exp_na.columns = res_na.columns
  124. if sparse:
  125. exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0)
  126. assert_frame_equal(res_na, exp_na)
  127. res_just_na = get_dummies([nan], dummy_na=True,
  128. sparse=sparse, dtype=dtype)
  129. exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
  130. dtype=self.effective_dtype(dtype))
  131. tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
  132. def test_unicode(self, sparse):
  133. # See GH 6885 - get_dummies chokes on unicode values
  134. import unicodedata
  135. e = 'e'
  136. eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
  137. s = [e, eacute, eacute]
  138. res = get_dummies(s, prefix='letter', sparse=sparse)
  139. exp = DataFrame({'letter_e': [1, 0, 0],
  140. u('letter_%s') % eacute: [0, 1, 1]},
  141. dtype=np.uint8)
  142. if sparse:
  143. exp = exp.apply(pd.SparseArray, fill_value=0)
  144. assert_frame_equal(res, exp)
  145. def test_dataframe_dummies_all_obj(self, df, sparse):
  146. df = df[['A', 'B']]
  147. result = get_dummies(df, sparse=sparse)
  148. expected = DataFrame({'A_a': [1, 0, 1],
  149. 'A_b': [0, 1, 0],
  150. 'B_b': [1, 1, 0],
  151. 'B_c': [0, 0, 1]},
  152. dtype=np.uint8)
  153. if sparse:
  154. expected = pd.DataFrame({
  155. "A_a": pd.SparseArray([1, 0, 1], dtype='uint8'),
  156. "A_b": pd.SparseArray([0, 1, 0], dtype='uint8'),
  157. "B_b": pd.SparseArray([1, 1, 0], dtype='uint8'),
  158. "B_c": pd.SparseArray([0, 0, 1], dtype='uint8'),
  159. })
  160. assert_frame_equal(result, expected)
  161. def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
  162. result = get_dummies(df, sparse=sparse, dtype=dtype)
  163. if sparse:
  164. arr = SparseArray
  165. typ = SparseDtype(dtype, 0)
  166. else:
  167. arr = np.array
  168. typ = dtype
  169. expected = DataFrame({'C': [1, 2, 3],
  170. 'A_a': arr([1, 0, 1], dtype=typ),
  171. 'A_b': arr([0, 1, 0], dtype=typ),
  172. 'B_b': arr([1, 1, 0], dtype=typ),
  173. 'B_c': arr([0, 0, 1], dtype=typ)})
  174. expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
  175. assert_frame_equal(result, expected)
  176. def test_dataframe_dummies_prefix_list(self, df, sparse):
  177. prefixes = ['from_A', 'from_B']
  178. result = get_dummies(df, prefix=prefixes, sparse=sparse)
  179. expected = DataFrame({'C': [1, 2, 3],
  180. 'from_A_a': [1, 0, 1],
  181. 'from_A_b': [0, 1, 0],
  182. 'from_B_b': [1, 1, 0],
  183. 'from_B_c': [0, 0, 1]},
  184. dtype=np.uint8)
  185. expected[['C']] = df[['C']]
  186. cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
  187. expected = expected[['C'] + cols]
  188. typ = pd.SparseArray if sparse else pd.Series
  189. expected[cols] = expected[cols].apply(lambda x: typ(x))
  190. assert_frame_equal(result, expected)
  191. def test_dataframe_dummies_prefix_str(self, df, sparse):
  192. # not that you should do this...
  193. result = get_dummies(df, prefix='bad', sparse=sparse)
  194. bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c']
  195. expected = DataFrame([[1, 1, 0, 1, 0],
  196. [2, 0, 1, 1, 0],
  197. [3, 1, 0, 0, 1]],
  198. columns=['C'] + bad_columns,
  199. dtype=np.uint8)
  200. expected = expected.astype({"C": np.int64})
  201. if sparse:
  202. # work around astyping & assigning with duplicate columns
  203. # https://github.com/pandas-dev/pandas/issues/14427
  204. expected = pd.concat([
  205. pd.Series([1, 2, 3], name='C'),
  206. pd.Series([1, 0, 1], name='bad_a', dtype='Sparse[uint8]'),
  207. pd.Series([0, 1, 0], name='bad_b', dtype='Sparse[uint8]'),
  208. pd.Series([1, 1, 0], name='bad_b', dtype='Sparse[uint8]'),
  209. pd.Series([0, 0, 1], name='bad_c', dtype='Sparse[uint8]'),
  210. ], axis=1)
  211. assert_frame_equal(result, expected)
  212. def test_dataframe_dummies_subset(self, df, sparse):
  213. result = get_dummies(df, prefix=['from_A'], columns=['A'],
  214. sparse=sparse)
  215. expected = DataFrame({'B': ['b', 'b', 'c'],
  216. 'C': [1, 2, 3],
  217. 'from_A_a': [1, 0, 1],
  218. 'from_A_b': [0, 1, 0]}, dtype=np.uint8)
  219. expected[['C']] = df[['C']]
  220. if sparse:
  221. cols = ['from_A_a', 'from_A_b']
  222. expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x))
  223. assert_frame_equal(result, expected)
  224. def test_dataframe_dummies_prefix_sep(self, df, sparse):
  225. result = get_dummies(df, prefix_sep='..', sparse=sparse)
  226. expected = DataFrame({'C': [1, 2, 3],
  227. 'A..a': [1, 0, 1],
  228. 'A..b': [0, 1, 0],
  229. 'B..b': [1, 1, 0],
  230. 'B..c': [0, 0, 1]},
  231. dtype=np.uint8)
  232. expected[['C']] = df[['C']]
  233. expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
  234. if sparse:
  235. cols = ['A..a', 'A..b', 'B..b', 'B..c']
  236. expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x))
  237. assert_frame_equal(result, expected)
  238. result = get_dummies(df, prefix_sep=['..', '__'], sparse=sparse)
  239. expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
  240. assert_frame_equal(result, expected)
  241. result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'},
  242. sparse=sparse)
  243. assert_frame_equal(result, expected)
  244. def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
  245. with pytest.raises(ValueError):
  246. get_dummies(df, prefix=['too few'], sparse=sparse)
  247. def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
  248. with pytest.raises(ValueError):
  249. get_dummies(df, prefix_sep=['bad'], sparse=sparse)
  250. def test_dataframe_dummies_prefix_dict(self, sparse):
  251. prefixes = {'A': 'from_A', 'B': 'from_B'}
  252. df = DataFrame({'C': [1, 2, 3],
  253. 'A': ['a', 'b', 'a'],
  254. 'B': ['b', 'b', 'c']})
  255. result = get_dummies(df, prefix=prefixes, sparse=sparse)
  256. expected = DataFrame({'C': [1, 2, 3],
  257. 'from_A_a': [1, 0, 1],
  258. 'from_A_b': [0, 1, 0],
  259. 'from_B_b': [1, 1, 0],
  260. 'from_B_c': [0, 0, 1]})
  261. columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
  262. expected[columns] = expected[columns].astype(np.uint8)
  263. if sparse:
  264. expected[columns] = expected[columns].apply(
  265. lambda x: pd.SparseSeries(x)
  266. )
  267. assert_frame_equal(result, expected)
  268. def test_dataframe_dummies_with_na(self, df, sparse, dtype):
  269. df.loc[3, :] = [np.nan, np.nan, np.nan]
  270. result = get_dummies(df, dummy_na=True,
  271. sparse=sparse, dtype=dtype).sort_index(axis=1)
  272. if sparse:
  273. arr = SparseArray
  274. typ = SparseDtype(dtype, 0)
  275. else:
  276. arr = np.array
  277. typ = dtype
  278. expected = DataFrame({'C': [1, 2, 3, np.nan],
  279. 'A_a': arr([1, 0, 1, 0], dtype=typ),
  280. 'A_b': arr([0, 1, 0, 0], dtype=typ),
  281. 'A_nan': arr([0, 0, 0, 1], dtype=typ),
  282. 'B_b': arr([1, 1, 0, 0], dtype=typ),
  283. 'B_c': arr([0, 0, 1, 0], dtype=typ),
  284. 'B_nan': arr([0, 0, 0, 1], dtype=typ)
  285. }).sort_index(axis=1)
  286. assert_frame_equal(result, expected)
  287. result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
  288. expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
  289. assert_frame_equal(result, expected)
  290. def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
  291. df['cat'] = pd.Categorical(['x', 'y', 'y'])
  292. result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
  293. if sparse:
  294. arr = SparseArray
  295. typ = SparseDtype(dtype, 0)
  296. else:
  297. arr = np.array
  298. typ = dtype
  299. expected = DataFrame({'C': [1, 2, 3],
  300. 'A_a': arr([1, 0, 1], dtype=typ),
  301. 'A_b': arr([0, 1, 0], dtype=typ),
  302. 'B_b': arr([1, 1, 0], dtype=typ),
  303. 'B_c': arr([0, 0, 1], dtype=typ),
  304. 'cat_x': arr([1, 0, 0], dtype=typ),
  305. 'cat_y': arr([0, 1, 1], dtype=typ)
  306. }).sort_index(axis=1)
  307. assert_frame_equal(result, expected)
  308. @pytest.mark.parametrize('get_dummies_kwargs,expected', [
  309. ({'data': pd.DataFrame(({u'ä': ['a']}))},
  310. pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)),
  311. ({'data': pd.DataFrame({'x': [u'ä']})},
  312. pd.DataFrame({u'x_ä': [1]}, dtype=np.uint8)),
  313. ({'data': pd.DataFrame({'x': [u'a']}), 'prefix':u'ä'},
  314. pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)),
  315. ({'data': pd.DataFrame({'x': [u'a']}), 'prefix_sep':u'ä'},
  316. pd.DataFrame({u'xäa': [1]}, dtype=np.uint8))])
  317. def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
  318. # GH22084 pd.get_dummies incorrectly encodes unicode characters
  319. # in dataframe column names
  320. result = get_dummies(**get_dummies_kwargs)
  321. assert_frame_equal(result, expected)
  322. def test_basic_drop_first(self, sparse):
  323. # GH12402 Add a new parameter `drop_first` to avoid collinearity
  324. # Basic case
  325. s_list = list('abc')
  326. s_series = Series(s_list)
  327. s_series_index = Series(s_list, list('ABC'))
  328. expected = DataFrame({'b': [0, 1, 0],
  329. 'c': [0, 0, 1]},
  330. dtype=np.uint8)
  331. result = get_dummies(s_list, drop_first=True, sparse=sparse)
  332. if sparse:
  333. expected = expected.apply(pd.SparseArray, fill_value=0)
  334. assert_frame_equal(result, expected)
  335. result = get_dummies(s_series, drop_first=True, sparse=sparse)
  336. assert_frame_equal(result, expected)
  337. expected.index = list('ABC')
  338. result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
  339. assert_frame_equal(result, expected)
  340. def test_basic_drop_first_one_level(self, sparse):
  341. # Test the case that categorical variable only has one level.
  342. s_list = list('aaa')
  343. s_series = Series(s_list)
  344. s_series_index = Series(s_list, list('ABC'))
  345. expected = DataFrame(index=np.arange(3))
  346. result = get_dummies(s_list, drop_first=True, sparse=sparse)
  347. assert_frame_equal(result, expected)
  348. result = get_dummies(s_series, drop_first=True, sparse=sparse)
  349. assert_frame_equal(result, expected)
  350. expected = DataFrame(index=list('ABC'))
  351. result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
  352. assert_frame_equal(result, expected)
  353. def test_basic_drop_first_NA(self, sparse):
  354. # Test NA handling together with drop_first
  355. s_NA = ['a', 'b', np.nan]
  356. res = get_dummies(s_NA, drop_first=True, sparse=sparse)
  357. exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
  358. if sparse:
  359. exp = exp.apply(pd.SparseArray, fill_value=0)
  360. assert_frame_equal(res, exp)
  361. res_na = get_dummies(s_NA, dummy_na=True, drop_first=True,
  362. sparse=sparse)
  363. exp_na = DataFrame(
  364. {'b': [0, 1, 0],
  365. nan: [0, 0, 1]},
  366. dtype=np.uint8).reindex(['b', nan], axis=1)
  367. if sparse:
  368. exp_na = exp_na.apply(pd.SparseArray, fill_value=0)
  369. assert_frame_equal(res_na, exp_na)
  370. res_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
  371. sparse=sparse)
  372. exp_just_na = DataFrame(index=np.arange(1))
  373. assert_frame_equal(res_just_na, exp_just_na)
  374. def test_dataframe_dummies_drop_first(self, df, sparse):
  375. df = df[['A', 'B']]
  376. result = get_dummies(df, drop_first=True, sparse=sparse)
  377. expected = DataFrame({'A_b': [0, 1, 0],
  378. 'B_c': [0, 0, 1]},
  379. dtype=np.uint8)
  380. if sparse:
  381. expected = expected.apply(pd.SparseArray, fill_value=0)
  382. assert_frame_equal(result, expected)
  383. def test_dataframe_dummies_drop_first_with_categorical(
  384. self, df, sparse, dtype):
  385. df['cat'] = pd.Categorical(['x', 'y', 'y'])
  386. result = get_dummies(df, drop_first=True, sparse=sparse)
  387. expected = DataFrame({'C': [1, 2, 3],
  388. 'A_b': [0, 1, 0],
  389. 'B_c': [0, 0, 1],
  390. 'cat_y': [0, 1, 1]})
  391. cols = ['A_b', 'B_c', 'cat_y']
  392. expected[cols] = expected[cols].astype(np.uint8)
  393. expected = expected[['C', 'A_b', 'B_c', 'cat_y']]
  394. if sparse:
  395. for col in cols:
  396. expected[col] = pd.SparseSeries(expected[col])
  397. assert_frame_equal(result, expected)
  398. def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
  399. df.loc[3, :] = [np.nan, np.nan, np.nan]
  400. result = get_dummies(df, dummy_na=True, drop_first=True,
  401. sparse=sparse).sort_index(axis=1)
  402. expected = DataFrame({'C': [1, 2, 3, np.nan],
  403. 'A_b': [0, 1, 0, 0],
  404. 'A_nan': [0, 0, 0, 1],
  405. 'B_c': [0, 0, 1, 0],
  406. 'B_nan': [0, 0, 0, 1]})
  407. cols = ['A_b', 'A_nan', 'B_c', 'B_nan']
  408. expected[cols] = expected[cols].astype(np.uint8)
  409. expected = expected.sort_index(axis=1)
  410. if sparse:
  411. for col in cols:
  412. expected[col] = pd.SparseSeries(expected[col])
  413. assert_frame_equal(result, expected)
  414. result = get_dummies(df, dummy_na=False, drop_first=True,
  415. sparse=sparse)
  416. expected = expected[['C', 'A_b', 'B_c']]
  417. assert_frame_equal(result, expected)
  418. def test_int_int(self):
  419. data = Series([1, 2, 1])
  420. result = pd.get_dummies(data)
  421. expected = DataFrame([[1, 0],
  422. [0, 1],
  423. [1, 0]],
  424. columns=[1, 2],
  425. dtype=np.uint8)
  426. tm.assert_frame_equal(result, expected)
  427. data = Series(pd.Categorical(['a', 'b', 'a']))
  428. result = pd.get_dummies(data)
  429. expected = DataFrame([[1, 0],
  430. [0, 1],
  431. [1, 0]],
  432. columns=pd.Categorical(['a', 'b']),
  433. dtype=np.uint8)
  434. tm.assert_frame_equal(result, expected)
  435. def test_int_df(self, dtype):
  436. data = DataFrame(
  437. {'A': [1, 2, 1],
  438. 'B': pd.Categorical(['a', 'b', 'a']),
  439. 'C': [1, 2, 1],
  440. 'D': [1., 2., 1.]
  441. }
  442. )
  443. columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b']
  444. expected = DataFrame([
  445. [1, 1., 1, 0, 1, 0],
  446. [2, 2., 0, 1, 0, 1],
  447. [1, 1., 1, 0, 1, 0]
  448. ], columns=columns)
  449. expected[columns[2:]] = expected[columns[2:]].astype(dtype)
  450. result = pd.get_dummies(data, columns=['A', 'B'], dtype=dtype)
  451. tm.assert_frame_equal(result, expected)
  452. def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
  453. # GH13854
  454. for ordered in [False, True]:
  455. cat = pd.Categorical(list("xy"), categories=list("xyz"),
  456. ordered=ordered)
  457. result = get_dummies(cat, dtype=dtype)
  458. data = np.array([[1, 0, 0], [0, 1, 0]],
  459. dtype=self.effective_dtype(dtype))
  460. cols = pd.CategoricalIndex(cat.categories,
  461. categories=cat.categories,
  462. ordered=ordered)
  463. expected = DataFrame(data, columns=cols,
  464. dtype=self.effective_dtype(dtype))
  465. tm.assert_frame_equal(result, expected)
  466. @pytest.mark.parametrize('sparse', [True, False])
  467. def test_get_dummies_dont_sparsify_all_columns(self, sparse):
  468. # GH18914
  469. df = DataFrame.from_dict(OrderedDict([('GDP', [1, 2]),
  470. ('Nation', ['AB', 'CD'])]))
  471. df = get_dummies(df, columns=['Nation'], sparse=sparse)
  472. df2 = df.reindex(columns=['GDP'])
  473. tm.assert_frame_equal(df[['GDP']], df2)
  474. def test_get_dummies_duplicate_columns(self, df):
  475. # GH20839
  476. df.columns = ["A", "A", "A"]
  477. result = get_dummies(df).sort_index(axis=1)
  478. expected = DataFrame([[1, 1, 0, 1, 0],
  479. [2, 0, 1, 1, 0],
  480. [3, 1, 0, 0, 1]],
  481. columns=['A', 'A_a', 'A_b', 'A_b', 'A_c'],
  482. dtype=np.uint8).sort_index(axis=1)
  483. expected = expected.astype({"A": np.int64})
  484. tm.assert_frame_equal(result, expected)
  485. class TestCategoricalReshape(object):
  486. @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
  487. def test_reshaping_panel_categorical(self):
  488. p = tm.makePanel()
  489. p['str'] = 'foo'
  490. df = p.to_frame()
  491. df['category'] = df['str'].astype('category')
  492. result = df['category'].unstack()
  493. c = Categorical(['foo'] * len(p.major_axis))
  494. expected = DataFrame({'A': c.copy(),
  495. 'B': c.copy(),
  496. 'C': c.copy(),
  497. 'D': c.copy()},
  498. columns=Index(list('ABCD'), name='minor'),
  499. index=p.major_axis.set_names('major'))
  500. tm.assert_frame_equal(result, expected)
  501. class TestMakeAxisDummies(object):
  502. def test_preserve_categorical_dtype(self):
  503. # GH13854
  504. for ordered in [False, True]:
  505. cidx = pd.CategoricalIndex(list("xyz"), ordered=ordered)
  506. midx = pd.MultiIndex(levels=[['a'], cidx],
  507. codes=[[0, 0], [0, 1]])
  508. df = DataFrame([[10, 11]], index=midx)
  509. expected = DataFrame([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
  510. index=midx, columns=cidx)
  511. from pandas.core.reshape.reshape import make_axis_dummies
  512. result = make_axis_dummies(df)
  513. tm.assert_frame_equal(result, expected)
  514. result = make_axis_dummies(df, transform=lambda x: x)
  515. tm.assert_frame_equal(result, expected)