test_reshape.py 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968
  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function
  3. from datetime import datetime
  4. import itertools
  5. from warnings import catch_warnings, simplefilter
  6. import numpy as np
  7. import pytest
  8. from pandas.compat import u
  9. import pandas as pd
  10. from pandas import (
  11. DataFrame, Index, MultiIndex, Period, Series, Timedelta, date_range)
  12. from pandas.tests.frame.common import TestData
  13. import pandas.util.testing as tm
  14. from pandas.util.testing import assert_frame_equal, assert_series_equal
  15. class TestDataFrameReshape(TestData):
  def test_pivot(self):
      """Pivot a long-format frame and check values plus axis names."""
      long_form = DataFrame({
          'index': ['A', 'B', 'C', 'C', 'B', 'A'],
          'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
          'values': [1., 2., 3., 3., 2., 1.]
      })
      wide = long_form.pivot(
          index='index', columns='columns', values='values')

      expected = DataFrame({
          'One': {'A': 1., 'B': 2., 'C': 3.},
          'Two': {'A': 1., 'B': 2., 'C': 3.}
      })
      expected.index.name = 'index'
      expected.columns.name = 'columns'
      tm.assert_frame_equal(wide, expected)

      # The axis names must follow the columns that were pivoted on.
      assert wide.index.name == 'index'
      assert wide.columns.name == 'columns'

      # Without ``values`` the remaining column becomes an outer level.
      wide = long_form.pivot(index='index', columns='columns')
      assert wide.index.name == 'index'
      assert wide.columns.names == (None, 'columns')

      with catch_warnings(record=True):
          # Pivot multiple columns at once, using a Panel-derived frame.
          simplefilter("ignore", FutureWarning)
          panel = tm.makePanel()
          long_panel = panel.to_frame()
          flat = long_panel.reset_index()
          tm.assert_frame_equal(flat.pivot('major', 'minor'),
                                long_panel.unstack())
  def test_pivot_duplicates(self):
      """Pivoting must raise when an (index, column) pair is repeated."""
      frame = DataFrame({
          'a': ['bar', 'bar', 'foo', 'foo', 'foo'],
          'b': ['one', 'two', 'one', 'one', 'two'],
          'c': [1., 2., 3., 3., 4.],
      })
      # ('foo', 'one') appears twice, so the reshape is ambiguous.
      with pytest.raises(ValueError, match='duplicate entries'):
          frame.pivot('a', 'b', 'c')
  def test_pivot_empty(self):
      """Pivoting a frame with columns but no rows gives an empty frame."""
      empty = DataFrame({}, columns=list('abc'))
      pivoted = empty.pivot('a', 'b', 'c')
      # Axis names are deliberately not compared.
      tm.assert_frame_equal(pivoted, DataFrame({}), check_names=False)
  def test_pivot_integer_bug(self):
      """Pivot a frame whose column labels are the default integers."""
      records = [("A", "1", "A1"), ("B", "2", "B2")]
      frame = DataFrame(data=records)

      pivoted = frame.pivot(index=1, columns=0, values=2)
      # Rendering the result must not raise.
      repr(pivoted)

      expected_columns = Index(['A', 'B'], name=0)
      tm.assert_index_equal(pivoted.columns, expected_columns)
  def test_pivot_index_none(self):
      """Pivot with ``index`` omitted, reusing the existing index (gh-3962)."""
      raw = {
          'index': ['A', 'B', 'C', 'C', 'B', 'A'],
          'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
          'values': [1., 2., 3., 3., 2., 1.]
      }
      indexed = DataFrame(raw).set_index('index')

      expected = DataFrame({
          'One': {'A': 1., 'B': 2., 'C': 3.},
          'Two': {'A': 1., 'B': 2., 'C': 3.}
      })
      expected.index.name = 'index'
      expected.columns.name = 'columns'

      pivoted = indexed.pivot(columns='columns', values='values')
      assert_frame_equal(pivoted, expected)

      # Omitting ``values`` adds an outer column level for the value column.
      pivoted = indexed.pivot(columns='columns')
      two_level = pd.MultiIndex.from_tuples(
          [('values', 'One'), ('values', 'Two')],
          names=[None, 'columns'])
      expected.columns = two_level
      expected.index.name = 'index'
      tm.assert_frame_equal(pivoted, expected, check_names=False)
      assert pivoted.index.name == 'index'
      assert pivoted.columns.names == (None, 'columns')

      # Dropping the outer level brings us back to the explicit-values case.
      expected.columns = expected.columns.droplevel(0)
      pivoted = indexed.pivot(columns='columns', values='values')
      expected.columns.name = 'columns'
      tm.assert_frame_equal(pivoted, expected)
  def test_stack_unstack(self):
      """``stack`` followed by ``unstack`` must reproduce the frame."""
      base = self.frame.copy()
      # Overwrite the fixture values with a running counter.
      base[:] = np.arange(np.prod(base.shape)).reshape(base.shape)

      as_series = base.stack()
      as_frame = DataFrame({'foo': as_series, 'bar': as_series})

      # Unstack the innermost level (default).
      assert_frame_equal(as_series.unstack(), base)
      assert_frame_equal(as_frame.unstack()['bar'], base)

      # Unstack the outermost level; the result is the transpose.
      outer_series = as_series.unstack(0)
      outer_frame = as_frame.unstack(0)
      assert_frame_equal(outer_series.T, base)
      assert_frame_equal(outer_frame['bar'].T, base)
  def test_stack_mixed_level(self):
      """Stack columns whose labels mix ints and strings (GH 18310)."""
      rows, mixed, inner = range(3), [3, 'a', 'b'], [1, 2]

      # Flat columns.
      flat = DataFrame(1, index=rows, columns=mixed)
      stacked = flat.stack()
      expected = Series(1, index=MultiIndex.from_product([rows, mixed]))
      assert_series_equal(stacked, expected)

      # MultiIndex columns.
      nested = DataFrame(1, index=rows,
                         columns=MultiIndex.from_product([mixed, inner]))
      stacked = nested.stack(1)
      expected = DataFrame(1,
                           index=MultiIndex.from_product([rows, inner]),
                           columns=mixed)
      assert_frame_equal(stacked, expected)

      # Same as above, but the labels actually used in the level are of
      # homogeneous type.
      stacked = nested[['a', 'b']].stack(1)
      expected = expected[['a', 'b']]
      assert_frame_equal(stacked, expected)
  def test_unstack_fill(self):
      """Check the ``fill_value`` keyword of ``unstack``.

      Covers GH #9746 (the keyword on Series/DataFrame ``unstack``) and
      GH #13971 (``fill_value`` when several levels are unstacked at once).
      """
      # GH #9746: fill_value keyword argument for Series
      # and DataFrame unstack

      # From a series
      data = Series([1, 2, 4, 5], dtype=np.int16)
      data.index = MultiIndex.from_tuples(
          [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

      result = data.unstack(fill_value=-1)
      expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]},
                           index=['x', 'y', 'z'], dtype=np.int16)
      assert_frame_equal(result, expected)

      # From a series with incorrect data type for fill_value
      result = data.unstack(fill_value=0.5)
      # ``np.float64`` is spelled out because the ``np.float`` alias was
      # deprecated and later removed from NumPy; the dtype is the same.
      expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]},
                           index=['x', 'y', 'z'], dtype=np.float64)
      assert_frame_equal(result, expected)

      # GH #13971: fill_value when unstacking multiple levels:
      df = DataFrame({'x': ['a', 'a', 'b'],
                      'y': ['j', 'k', 'j'],
                      'z': [0, 1, 2],
                      'w': [0, 1, 2]}).set_index(['x', 'y', 'z'])
      unstacked = df.unstack(['x', 'y'], fill_value=0)
      key = ('w', 'b', 'j')
      # ``result`` is what pandas produced; ``expected`` is hand-built.
      result = unstacked[key]
      expected = pd.Series([0, 0, 2], index=unstacked.index, name=key)
      assert_series_equal(result, expected)

      # Round-trip: stacking the same levels back must recover ``df``.
      stacked = unstacked.stack(['x', 'y'])
      stacked.index = stacked.index.reorder_levels(df.index.names)
      # Workaround for GH #17886 (unnecessarily casts to float):
      stacked = stacked.astype(np.int64)
      result = stacked.loc[df.index]
      assert_frame_equal(result, df)

      # From a series
      s = df['w']
      result = s.unstack(['x', 'y'], fill_value=0)
      expected = unstacked['w']
      assert_frame_equal(result, expected)
  def test_unstack_fill_frame(self):
      """Check ``DataFrame.unstack(fill_value=...)`` across column dtypes."""
      # From a dataframe
      rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
      df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
      df.index = MultiIndex.from_tuples(
          [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

      result = df.unstack(fill_value=-1)

      rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
      expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
      expected.columns = MultiIndex.from_tuples(
          [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
      assert_frame_equal(result, expected)

      # From a mixed type dataframe
      df['A'] = df['A'].astype(np.int16)
      df['B'] = df['B'].astype(np.float64)

      result = df.unstack(fill_value=-1)
      expected['A'] = expected['A'].astype(np.int16)
      expected['B'] = expected['B'].astype(np.float64)
      assert_frame_equal(result, expected)

      # From a dataframe with incorrect data type for fill_value
      result = df.unstack(fill_value=0.5)

      rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
      # ``np.float64`` is spelled out because the ``np.float`` alias was
      # deprecated and later removed from NumPy; the dtype is the same.
      expected = DataFrame(rows, index=list('xyz'), dtype=np.float64)
      expected.columns = MultiIndex.from_tuples(
          [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
      assert_frame_equal(result, expected)
  def test_unstack_fill_frame_datetime(self):
      """Unstack datetime values, with and without a ``fill_value``."""
      stamps = pd.date_range('2012-01-01', periods=4).values
      series = Series(stamps)
      series.index = MultiIndex.from_tuples(
          [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
      row_labels = ['x', 'y', 'z']

      # Default: the holes are filled with NaT.
      expected = DataFrame({'a': [stamps[0], pd.NaT, stamps[3]],
                            'b': [stamps[1], stamps[2], pd.NaT]},
                           index=row_labels)
      assert_frame_equal(series.unstack(), expected)

      # Explicit fill value: the holes take the first timestamp.
      filler = stamps[0]
      expected = DataFrame({'a': [stamps[0], filler, stamps[3]],
                            'b': [stamps[1], stamps[2], filler]},
                           index=row_labels)
      assert_frame_equal(series.unstack(fill_value=filler), expected)
  def test_unstack_fill_frame_timedelta(self):
      """Unstack timedelta values, with and without a ``fill_value``."""
      deltas = [Timedelta(days=n) for n in range(4)]
      series = Series(deltas)
      series.index = MultiIndex.from_tuples(
          [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
      row_labels = ['x', 'y', 'z']

      # Default: the holes are filled with NaT.
      expected = DataFrame({'a': [deltas[0], pd.NaT, deltas[3]],
                            'b': [deltas[1], deltas[2], pd.NaT]},
                           index=row_labels)
      assert_frame_equal(series.unstack(), expected)

      # Explicit fill value: the holes take the one-day timedelta.
      filler = deltas[1]
      expected = DataFrame({'a': [deltas[0], filler, deltas[3]],
                            'b': [deltas[1], deltas[2], filler]},
                           index=row_labels)
      assert_frame_equal(series.unstack(fill_value=filler), expected)
  def test_unstack_fill_frame_period(self):
      """Unstack Period values, with and without a ``fill_value``."""
      months = [Period('2012-01'), Period('2012-02'),
                Period('2012-03'), Period('2012-04')]
      series = Series(months)
      series.index = MultiIndex.from_tuples(
          [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
      row_labels = ['x', 'y', 'z']

      # Default: the holes are represented by None.
      expected = DataFrame({'a': [months[0], None, months[3]],
                            'b': [months[1], months[2], None]},
                           index=row_labels)
      assert_frame_equal(series.unstack(), expected)

      # Explicit fill value: the holes take the second period.
      filler = months[1]
      expected = DataFrame({'a': [months[0], filler, months[3]],
                            'b': [months[1], months[2], filler]},
                           index=row_labels)
      assert_frame_equal(series.unstack(fill_value=filler), expected)
  def test_unstack_fill_frame_categorical(self):
      """Unstack categorical values, with and without a ``fill_value``."""
      cats = list('abc')
      row_labels = list('xyz')
      series = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
      series.index = pd.MultiIndex.from_tuples(
          [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')],
      )

      # By default missing values will be NaN ('x' is not a category).
      expected = DataFrame(
          {'a': pd.Categorical(list('axa'), categories=cats),
           'b': pd.Categorical(list('bcx'), categories=cats)},
          index=row_labels)
      assert_frame_equal(series.unstack(), expected)

      # Fill with non-category results in a TypeError
      msg = r"'fill_value' \('d'\) is not in"
      with pytest.raises(TypeError, match=msg):
          series.unstack(fill_value='d')

      # Fill with category value replaces missing values as expected
      expected = DataFrame(
          {'a': pd.Categorical(list('aca'), categories=cats),
           'b': pd.Categorical(list('bcc'), categories=cats)},
          index=row_labels)
      assert_frame_equal(series.unstack(fill_value='c'), expected)
  def test_unstack_preserve_dtypes(self):
      """``unstack([name])`` and ``unstack(name)`` must agree (#11847)."""
      df = pd.DataFrame({
          'state': ['IL', 'MI', 'NC'],
          'index': ['a', 'b', 'c'],
          'some_categories': pd.Series(['a', 'b', 'c']).astype('category'),
          'A': np.random.rand(3),
          'B': 1,
          'C': 'foo',
          'D': pd.Timestamp('20010102'),
          'E': pd.Series([1.0, 50.0, 100.0]).astype('float32'),
          'F': pd.Series([3.0, 4.0, 5.0]).astype('float64'),
          'G': False,
          'H': pd.Series([1, 200, 923442], dtype='int8'),
      })

      def _check(obj, level_name):
          # List-of-one and scalar level specs must give the same frame.
          via_list = obj.unstack([level_name])
          via_scalar = obj.unstack(level_name)
          assert_frame_equal(via_list, via_scalar)

      # (columns to index on, level to unstack) for a range of dtypes.
      scenarios = [
          (['state', 'index'], 'index'),
          (['state', 'some_categories'], 'some_categories'),
          (['F', 'C'], 'F'),
          (['G', 'B', 'state'], 'B'),
          (['E', 'A'], 'E'),
      ]
      for keys, level_name in scenarios:
          _check(df.set_index(keys), level_name)

      # The same must hold for a single Series taken from the frame.
      by_state = df.set_index(['state', 'index'])
      _check(by_state['A'], 'index')
  def test_stack_ints(self):
      """Stack several levels at once when they are given as integers."""
      triples = list(itertools.product(range(3), repeat=3))
      df = DataFrame(np.random.randn(30, 27),
                     columns=MultiIndex.from_tuples(triples))

      # Positive and negative level positions must match stacking the
      # levels one at a time.
      for levels in ([1, 2], [-2, -1]):
          assert_frame_equal(df.stack(level=levels),
                             df.stack(level=1).stack(level=1))

      # Same check when the level *names* are themselves integers.
      df_named = df.copy()
      df_named.columns.set_names(range(3), inplace=True)
      assert_frame_equal(df_named.stack(level=[1, 2]),
                         df_named.stack(level=1).stack(level=1))
  def test_stack_mixed_levels(self):
      """Stack with level lists that mix string names and integers.

      Covers GH #8584: an integer that is both a level name and a valid
      level position, and a level name of 0.
      """
      columns = MultiIndex.from_tuples(
          [('A', 'cat', 'long'), ('B', 'cat', 'long'),
           ('A', 'dog', 'short'), ('B', 'dog', 'short')],
          names=['exp', 'animal', 'hair_length']
      )
      df = DataFrame(np.random.randn(4, 4), columns=columns)

      animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
      exp_hair_stacked = df.stack(level=['exp', 'hair_length'])

      # GH #8584: Need to check that stacking works when a number
      # is passed that is both a level name and in the range of
      # the level numbers
      df2 = df.copy()
      df2.columns.names = ['exp', 'animal', 1]
      assert_frame_equal(df2.stack(level=['animal', 1]),
                         animal_hair_stacked, check_names=False)
      assert_frame_equal(df2.stack(level=['exp', 1]),
                         exp_hair_stacked, check_names=False)

      # When mixed types are passed and the ints are not level
      # names, raise.  The context-manager form matches the other
      # ``pytest.raises`` uses in this file.
      with pytest.raises(ValueError):
          df2.stack(level=['animal', 0])

      # GH #8584: Having 0 in the level names could raise a
      # strange error about lexsort depth
      df3 = df.copy()
      df3.columns.names = ['exp', 'animal', 0]
      assert_frame_equal(df3.stack(level=['animal', 0]),
                         animal_hair_stacked, check_names=False)
  def test_stack_int_level_names(self):
      """Stack by integer level *names*, in order and out of order."""
      columns = MultiIndex.from_tuples(
          [('A', 'cat', 'long'), ('B', 'cat', 'long'),
           ('A', 'dog', 'short'), ('B', 'dog', 'short')],
          names=['exp', 'animal', 'hair_length']
      )
      df = DataFrame(np.random.randn(4, 4), columns=columns)

      # Reference results, obtained through the string level names.
      exp_animal_stacked = df.stack(level=['exp', 'animal'])
      animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
      exp_hair_stacked = df.stack(level=['exp', 'hair_length'])

      # Integer names that coincide with the level positions.
      in_order = df.copy()
      in_order.columns.names = [0, 1, 2]
      in_order_cases = [
          ([1, 2], animal_hair_stacked),
          ([0, 1], exp_animal_stacked),
          ([0, 2], exp_hair_stacked),
      ]
      for levels, reference in in_order_cases:
          assert_frame_equal(in_order.stack(level=levels), reference,
                             check_names=False)

      # Out-of-order int column names
      shuffled = df.copy()
      shuffled.columns.names = [2, 0, 1]
      shuffled_cases = [
          ([0, 1], animal_hair_stacked),
          ([2, 0], exp_animal_stacked),
          ([2, 1], exp_hair_stacked),
      ]
      for levels, reference in shuffled_cases:
          assert_frame_equal(shuffled.stack(level=levels), reference,
                             check_names=False)
  def test_unstack_bool(self):
      """Unstack a boolean column that ends up with missing cells."""
      row_index = MultiIndex.from_arrays([['a', 'b'], ['c', 'l']])
      frame = DataFrame([False, False], index=row_index, columns=['col'])

      unstacked = frame.unstack()

      # The holes are NaN, so the expected values are held as objects.
      cells = np.array([[False, np.nan], [np.nan, False]], dtype=object)
      col_index = MultiIndex.from_arrays([['col', 'col'], ['c', 'l']])
      expected = DataFrame(cells, index=['a', 'b'], columns=col_index)
      assert_frame_equal(unstacked, expected)
  def test_unstack_level_binding(self):
      """Unstack two levels, then stack one of them back (GH9856)."""
      source_index = pd.MultiIndex(
          levels=[[u('foo'), u('bar')],
                  [u('one'), u('two')],
                  [u('a'), u('b')]],
          codes=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]],
          names=[u('first'), u('second'), u('third')])
      series = pd.Series(0, index=source_index)

      round_trip = series.unstack([1, 2]).stack(0)

      expected_index = pd.MultiIndex(
          levels=[['foo', 'bar'], ['one', 'two']],
          codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
          names=['first', 'second'])
      cells = np.array([[np.nan, 0],
                        [0, np.nan],
                        [np.nan, 0],
                        [0, np.nan]],
                       dtype=np.float64)
      expected = pd.DataFrame(cells,
                              index=expected_index,
                              columns=pd.Index(['a', 'b'], name='third'))
      assert_frame_equal(round_trip, expected)
  def test_unstack_to_series(self):
      """Unstack a flat-indexed frame into a Series and back again."""
      # check reversibility
      data = self.frame.unstack()

      assert isinstance(data, Series)
      undo = data.unstack().T
      assert_frame_equal(undo, self.frame)

      # check NA handling (``np.nan`` is used because the ``np.NaN``
      # alias was removed from NumPy; both name the same value)
      data = DataFrame({'x': [1, 2, np.nan], 'y': [3.0, 4, np.nan]})
      data.index = Index(['a', 'b', 'c'])
      result = data.unstack()

      midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']],
                        codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
      expected = Series([1, 2, np.nan, 3, 4, np.nan], index=midx)

      assert_series_equal(result, expected)

      # check composability of unstack: four applications in a row
      # must bring the original frame back
      old_data = data.copy()
      for _ in range(4):
          data = data.unstack()
      assert_frame_equal(old_data, data)
  def test_unstack_dtypes(self):
      """Check the column dtypes that ``unstack`` produces (GH 2929)."""
      rows = [[1, 1, 3, 4],
              [1, 2, 3, 4],
              [2, 1, 3, 4],
              [2, 2, 3, 4]]
      base = DataFrame(rows, columns=list('ABCD'))
      assert_series_equal(base.get_dtype_counts(),
                          Series({'int64': 4}))

      # single dtype
      indexed = base.set_index(['A', 'B'])
      wide = indexed.unstack('B')
      assert_series_equal(wide.get_dtype_counts(),
                          Series({'int64': 4}))

      # mixed
      indexed = base.set_index(['A', 'B'])
      indexed['C'] = 3.
      wide = indexed.unstack('B')
      assert_series_equal(wide.get_dtype_counts(),
                          Series({'int64': 2, 'float64': 2}))

      indexed['D'] = 'foo'
      wide = indexed.unstack('B')
      assert_series_equal(wide.get_dtype_counts(),
                          Series({'float64': 2, 'object': 2}))

      # GH7405
      column_pairs = (
          (np.zeros(5), np.zeros(5)),
          (np.arange(5, dtype='f8'), np.arange(5, 10, dtype='f8')),
      )
      for c_vals, d_vals in column_pairs:
          frame = DataFrame({'A': ['a'] * 5, 'C': c_vals, 'D': d_vals,
                             'B': pd.date_range('2012-01-01', periods=5)})

          # Keep an untouched copy of the first rows before re-indexing.
          right = frame.iloc[:3].copy(deep=True)

          frame = frame.set_index(['A', 'B'])
          frame['D'] = frame['D'].astype('int64')

          left = frame.iloc[:3].unstack(0)
          right = right.set_index(['A', 'B']).unstack(0)
          right[('D', 'a')] = right[('D', 'a')].astype('int64')

          assert left.shape == (3, 2)
          tm.assert_frame_equal(left, right)
  def test_unstack_non_unique_index_names(self):
      """A duplicated level name is rejected by both reshaping methods."""
      dup_index = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
                                         names=['c1', 'c1'])
      frame = DataFrame([1, 2], index=dup_index)

      # 'c1' names both levels, so it cannot select one of them.
      with pytest.raises(ValueError):
          frame.unstack('c1')

      with pytest.raises(ValueError):
          frame.T.stack('c1')
  def test_unstack_unused_levels(self):
      """Unstack indexes that carry unused level entries (GH 17845)."""
      # GH 17845: unused codes in index make unstack() cast int to float
      sliced = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1]
      frame = pd.DataFrame([[1, 0]] * 3, index=sliced)

      unstacked = frame.unstack()
      expected_cols = pd.MultiIndex.from_product([[0, 1], ['A', 'B', 'C']])
      expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=['a'],
                              columns=expected_cols)
      tm.assert_frame_equal(unstacked, expected)
      assert (unstacked.columns.levels[1] == sliced.levels[1]).all()

      # Unused items on both levels
      levels = [[0, 1, 7], [0, 1, 2, 3]]
      codes = [[0, 0, 1, 1], [0, 2, 0, 2]]
      sparse = pd.MultiIndex(levels, codes)
      block = np.arange(4).reshape(2, 2)
      frame = pd.DataFrame(np.concatenate([block, block + 4]), index=sparse)
      unstacked = frame.unstack()
      side_by_side = np.concatenate([block * 2, block * 2 + 1], axis=1)
      expected = pd.DataFrame(side_by_side, columns=sparse)
      tm.assert_frame_equal(unstacked, expected)
      assert (unstacked.columns.levels[1] == sparse.levels[1]).all()

      # With mixed dtype and NaN
      levels = [['a', 2, 'c'], [1, 3, 5, 7]]
      codes = [[0, -1, 1, 1], [0, 2, -1, 2]]
      mixed = pd.MultiIndex(levels, codes)
      data = np.arange(8)
      frame = pd.DataFrame(data.reshape(4, 2), index=mixed)

      # (level to unstack, flat positions that receive ``data``,
      #  expected inner column labels, expected row labels)
      cases = ((0, [13, 16, 6, 9, 2, 5, 8, 11],
                [np.nan, 'a', 2], [np.nan, 5, 1]),
               (1, [8, 11, 1, 4, 12, 15, 13, 16],
                [np.nan, 5, 1], [np.nan, 'a', 2]))
      for level, positions, col_labels, row_labels in cases:
          unstacked = frame.unstack(level=level)
          flat = np.zeros(18) * np.nan
          flat[positions] = data
          cols = pd.MultiIndex.from_product([[0, 1], col_labels])
          expected = pd.DataFrame(flat.reshape(3, 6),
                                  index=row_labels, columns=cols)
          tm.assert_frame_equal(unstacked, expected)
  @pytest.mark.parametrize("cols", [['A', 'C'], slice(None)])
  def test_unstack_unused_level(self, cols):
      """Unstack a selection that leaves unused codes behind (GH 18562)."""
      # GH 18562 : unused codes on the unstacked level
      frame = pd.DataFrame([[2010, 'a', 'I'],
                            [2011, 'b', 'II']],
                           columns=['A', 'B', 'C'])
      # Keep the columns as data as well as index levels.
      indexed = frame.set_index(['A', 'B', 'C'], drop=False)

      # Only rows with C == 'I' survive; 'II' stays behind in the level.
      picked = indexed.loc[(slice(None), slice(None), 'I'), cols]
      unstacked = picked.unstack()

      expected = indexed.iloc[[0]][cols]
      expected.columns = MultiIndex.from_product(
          [expected.columns, ['I']], names=[None, 'C'])
      expected.index = expected.index.droplevel('C')
      tm.assert_frame_equal(unstacked, expected)
  def test_unstack_nan_index(self):  # GH7466
      """Unstack frames whose index levels contain NaN / NaT.

      Covers GH7466, GH7403, GH7401, GH4862 and GH9497.
      """
      def cast(val):
          # ``val != val`` is true only for NaN-like values; those are
          # rendered as a single blank so they match the ' ' used in the
          # dotted cell strings below.
          return '{0:1}'.format('' if val != val else val)

      def mk_list(a):
          # Normalise a scalar label or a tuple label to a list.
          return list(a) if isinstance(a, tuple) else [a]

      def verify(df):
          # Every non-null cell holds a '.'-joined string; its parts must
          # match the cell's row and column labels.
          rows, cols = df.notna().values.nonzero()
          for i, j in zip(rows, cols):
              left = sorted(df.iloc[i, j].split('.'))
              right = mk_list(df.index[i]) + mk_list(df.columns[j])
              right = sorted(list(map(cast, right)))
              assert left == right

      df = DataFrame({'jim': ['a', 'b', np.nan, 'd'],
                      'joe': ['w', 'x', 'y', 'z'],
                      'jolie': ['a.w', 'b.x', ' .y', 'd.z']})

      left = df.set_index(['jim', 'joe']).unstack()['jolie']
      right = df.set_index(['joe', 'jim']).unstack()['jolie'].T
      assert_frame_equal(left, right)

      for idx in itertools.permutations(df.columns[:2]):
          mi = df.set_index(list(idx))
          for lev in range(2):
              udf = mi.unstack(level=lev)
              assert udf.notna().values.sum() == len(df)
              verify(udf['jolie'])

      df = DataFrame({'1st': ['d'] * 3 + [np.nan] * 5 + ['a'] * 2 +
                      ['c'] * 3 + ['e'] * 2 + ['b'] * 5,
                      '2nd': ['y'] * 2 + ['w'] * 3 + [np.nan] * 3 +
                      ['z'] * 4 + [np.nan] * 3 + ['x'] * 3 + [np.nan] * 2,
                      '3rd': [67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59,
                              50, 62, 59, 76, 52, 14, 53, 60, 51]})

      # Both strings are computed before either column is assigned, so
      # each one is built from the three original columns only.
      forward = df.apply(lambda r: '.'.join(map(cast, r)), axis=1)
      backward = df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])),
                          axis=1)
      df['4th'], df['5th'] = forward, backward

      for idx in itertools.permutations(['1st', '2nd', '3rd']):
          mi = df.set_index(list(idx))
          for lev in range(3):
              udf = mi.unstack(level=lev)
              assert udf.notna().values.sum() == 2 * len(df)
              for col in ['4th', '5th']:
                  verify(udf[col])

      # GH7403
      df = pd.DataFrame(
          {'A': list('aaaabbbb'), 'B': range(8), 'C': range(8)})
      df.iloc[3, 1] = np.nan
      left = df.set_index(['A', 'B']).unstack(0)

      vals = [[3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan],
              [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7]]
      # Transpose the two per-column lists into rows.
      vals = list(map(list, zip(*vals)))
      idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name='B')
      cols = MultiIndex(levels=[['C'], ['a', 'b']],
                        codes=[[0, 0], [0, 1]],
                        names=[None, 'A'])

      right = DataFrame(vals, columns=cols, index=idx)
      assert_frame_equal(left, right)

      df = DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
                      'C': range(8)})
      df.iloc[2, 1] = np.nan
      left = df.set_index(['A', 'B']).unstack(0)

      vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]]
      cols = MultiIndex(levels=[['C'], ['a', 'b']],
                        codes=[[0, 0], [0, 1]],
                        names=[None, 'A'])
      idx = Index([np.nan, 0, 1, 2, 3], name='B')
      right = DataFrame(vals, columns=cols, index=idx)
      assert_frame_equal(left, right)

      df = pd.DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
                         'C': range(8)})
      df.iloc[3, 1] = np.nan
      left = df.set_index(['A', 'B']).unstack(0)

      vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]]
      cols = MultiIndex(levels=[['C'], ['a', 'b']],
                        codes=[[0, 0], [0, 1]],
                        names=[None, 'A'])
      idx = Index([np.nan, 0, 1, 2, 3], name='B')
      right = DataFrame(vals, columns=cols, index=idx)
      assert_frame_equal(left, right)

      # GH7401
      df = pd.DataFrame({'A': list('aaaaabbbbb'),
                         'B': (date_range('2012-01-01', periods=5)
                               .tolist() * 2),
                         'C': np.arange(10)})

      df.iloc[3, 1] = np.nan
      left = df.set_index(['A', 'B']).unstack()

      vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]])
      idx = Index(['a', 'b'], name='A')
      # Code -1 marks the missing date in the expected column index.
      cols = MultiIndex(levels=[['C'], date_range('2012-01-01', periods=5)],
                        codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
                        names=[None, 'B'])

      right = DataFrame(vals, columns=cols, index=idx)
      assert_frame_equal(left, right)

      # GH4862
      vals = [['Hg', np.nan, np.nan, 680585148],
              ['U', 0.0, np.nan, 680585148],
              ['Pb', 7.07e-06, np.nan, 680585148],
              ['Sn', 2.3614e-05, 0.0133, 680607017],
              ['Ag', 0.0, 0.0133, 680607017],
              ['Hg', -0.00015, 0.0133, 680607017]]
      df = DataFrame(vals, columns=['agent', 'change', 'dosage', 's_id'],
                     index=[17263, 17264, 17265, 17266, 17267, 17268])

      left = df.copy().set_index(['s_id', 'dosage', 'agent']).unstack()

      vals = [[np.nan, np.nan, 7.07e-06, np.nan, 0.0],
              [0.0, -0.00015, np.nan, 2.3614e-05, np.nan]]

      idx = MultiIndex(levels=[[680585148, 680607017], [0.0133]],
                       codes=[[0, 1], [-1, 0]],
                       names=['s_id', 'dosage'])

      cols = MultiIndex(levels=[['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']],
                        codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
                        names=[None, 'agent'])

      right = DataFrame(vals, columns=cols, index=idx)
      assert_frame_equal(left, right)

      # Dropping the first row (label 17263) must give the same frame.
      left = df.loc[17264:].copy().set_index(['s_id', 'dosage', 'agent'])
      assert_frame_equal(left.unstack(), right)

      # GH9497 - multiple unstack with nulls
      df = DataFrame({'1st': [1, 2, 1, 2, 1, 2],
                      '2nd': pd.date_range('2014-02-01', periods=6,
                                           freq='D'),
                      'jim': 100 + np.arange(6),
                      'joe': (np.random.randn(6) * 10).round(2)})

      df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02')
      df.loc[1, '2nd'] = df.loc[3, '2nd'] = np.nan
      df.loc[1, '3rd'] = df.loc[4, '3rd'] = np.nan

      left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd'])
      assert left.notna().values.sum() == 2 * len(df)

      # Every original value must be found at its unstacked position.
      for col in ['jim', 'joe']:
          for _, r in df.iterrows():
              key = r['1st'], (col, r['2nd'], r['3rd'])
              assert r[col] == left.loc[key]
  def test_stack_datetime_column_multiIndex(self):
      """Stack a column MultiIndex whose outer label is a datetime."""
      # GH 8039
      stamp = datetime(2014, 1, 1)
      values = [1, 2, 3, 4]
      frame = DataFrame(
          values, columns=MultiIndex.from_tuples([(stamp, 'A', 'B')]))

      stacked = frame.stack()

      # The innermost label ('B') moves from the columns into the index.
      expected_index = MultiIndex.from_product([(0, 1, 2, 3), ('B',)])
      expected_columns = MultiIndex.from_tuples([(stamp, 'A')])
      expected = DataFrame(values, index=expected_index,
                           columns=expected_columns)
      assert_frame_equal(stacked, expected)
  def test_stack_partial_multiIndex(self):
      """Stack frames whose columns are a subset of a MultiIndex (GH 8844)."""
      def _assert_same(actual, wanted):
          # ``stack`` may give a Series or a DataFrame; pick the right
          # comparison based on the expected object.
          if isinstance(wanted, Series):
              assert_series_equal(actual, wanted)
          else:
              assert_frame_equal(actual, wanted)

      def _check_stack(multiindex):
          width = len(multiindex)
          frame = DataFrame(np.arange(3 * width).reshape(3, width),
                            columns=multiindex)
          for level in (-1, 0, 1, [0, 1], [1, 0]):
              stacked = frame.stack(level=level, dropna=False)

              if isinstance(level, int):
                  # Stacking a single level should not make any all-NaN
                  # rows, so dropna=False and dropna=True should agree.
                  _assert_same(stacked,
                               frame.stack(level=level, dropna=True))

              # Rebuild the columns from their tuples and compare against
              # stacking the rebuilt frame.
              frame.columns = MultiIndex.from_tuples(
                  frame.columns.get_values(), names=frame.columns.names)
              _assert_same(stacked,
                           frame.stack(level=level, dropna=False))

      full_multiindex = MultiIndex.from_tuples([('B', 'x'), ('B', 'z'),
                                                ('A', 'y'),
                                                ('C', 'x'), ('C', 'u')],
                                               names=['Upper', 'Lower'])
      subsets = ([0, 1, 2, 3, 4],
                 [0, 1, 2, 3], [0, 1, 2, 4],
                 [0, 1, 2], [1, 2, 3], [2, 3, 4],
                 [0, 1], [0, 2], [0, 3],
                 [0], [2], [4])
      for positions in subsets:
          _check_stack(full_multiindex[positions])
          if len(positions) > 1:
              # Repeat with the same columns in reverse order.
              positions.reverse()
              _check_stack(full_multiindex[positions])

      frame = DataFrame(np.arange(6).reshape(2, 3),
                        columns=full_multiindex[[0, 1, 3]])
      stacked = frame.stack(dropna=False)
      expected_index = MultiIndex(levels=[[0, 1], ['u', 'x', 'y', 'z']],
                                  codes=[[0, 0, 1, 1],
                                         [1, 3, 1, 3]],
                                  names=[None, 'Lower'])
      expected = DataFrame([[0, 2], [1, np.nan], [3, 5], [4, np.nan]],
                           index=expected_index,
                           columns=Index(['B', 'C'], name='Upper'),
                           dtype=frame.dtypes[0])
      assert_frame_equal(stacked, expected)
  697. @pytest.mark.parametrize('ordered', [False, True])
  698. @pytest.mark.parametrize('labels', [list("yxz"), list("yxy")])
  699. def test_stack_preserve_categorical_dtype(self, ordered, labels):
  700. # GH13854
  701. cidx = pd.CategoricalIndex(labels, categories=list("xyz"),
  702. ordered=ordered)
  703. df = DataFrame([[10, 11, 12]], columns=cidx)
  704. result = df.stack()
  705. # `MutliIndex.from_product` preserves categorical dtype -
  706. # it's tested elsewhere.
  707. midx = pd.MultiIndex.from_product([df.index, cidx])
  708. expected = Series([10, 11, 12], index=midx)
  709. tm.assert_series_equal(result, expected)
  710. def test_stack_preserve_categorical_dtype_values(self):
  711. # GH-23077
  712. cat = pd.Categorical(['a', 'a', 'b', 'c'])
  713. df = pd.DataFrame({"A": cat, "B": cat})
  714. result = df.stack()
  715. index = pd.MultiIndex.from_product([[0, 1, 2, 3], ['A', 'B']])
  716. expected = pd.Series(pd.Categorical(['a', 'a', 'a', 'a',
  717. 'b', 'b', 'c', 'c']),
  718. index=index)
  719. tm.assert_series_equal(result, expected)
  720. @pytest.mark.parametrize('level', [0, 1])
  721. def test_unstack_mixed_extension_types(self, level):
  722. index = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 1)],
  723. names=['a', 'b'])
  724. df = pd.DataFrame({"A": pd.core.arrays.integer_array([0, 1, None]),
  725. "B": pd.Categorical(['a', 'a', 'b'])}, index=index)
  726. result = df.unstack(level=level)
  727. expected = df.astype(object).unstack(level=level)
  728. expected_dtypes = pd.Series([df.A.dtype] * 2 + [df.B.dtype] * 2,
  729. index=result.columns)
  730. tm.assert_series_equal(result.dtypes, expected_dtypes)
  731. tm.assert_frame_equal(result.astype(object), expected)
  732. @pytest.mark.parametrize("level", [0, 'baz'])
  733. def test_unstack_swaplevel_sortlevel(self, level):
  734. # GH 20994
  735. mi = pd.MultiIndex.from_product([[0], ['d', 'c']],
  736. names=['bar', 'baz'])
  737. df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=['B', 'A'])
  738. df.columns.name = 'foo'
  739. expected = pd.DataFrame([
  740. [3, 1, 2, 0]], columns=pd.MultiIndex.from_tuples([
  741. ('c', 'A'), ('c', 'B'), ('d', 'A'), ('d', 'B')], names=[
  742. 'baz', 'foo']))
  743. expected.index.name = 'bar'
  744. result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level)
  745. tm.assert_frame_equal(result, expected)
  746. def test_unstack_fill_frame_object():
  747. # GH12815 Test unstacking with object.
  748. data = pd.Series(['a', 'b', 'c', 'a'], dtype='object')
  749. data.index = pd.MultiIndex.from_tuples(
  750. [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
  751. # By default missing values will be NaN
  752. result = data.unstack()
  753. expected = pd.DataFrame(
  754. {'a': ['a', np.nan, 'a'], 'b': ['b', 'c', np.nan]},
  755. index=list('xyz')
  756. )
  757. assert_frame_equal(result, expected)
  758. # Fill with any value replaces missing values as expected
  759. result = data.unstack(fill_value='d')
  760. expected = pd.DataFrame(
  761. {'a': ['a', 'd', 'a'], 'b': ['b', 'c', 'd']},
  762. index=list('xyz')
  763. )
  764. assert_frame_equal(result, expected)
  765. def test_unstack_timezone_aware_values():
  766. # GH 18338
  767. df = pd.DataFrame({
  768. 'timestamp': [
  769. pd.Timestamp('2017-08-27 01:00:00.709949+0000', tz='UTC')],
  770. 'a': ['a'],
  771. 'b': ['b'],
  772. 'c': ['c'],
  773. }, columns=['timestamp', 'a', 'b', 'c'])
  774. result = df.set_index(['a', 'b']).unstack()
  775. expected = pd.DataFrame([[pd.Timestamp('2017-08-27 01:00:00.709949+0000',
  776. tz='UTC'),
  777. 'c']],
  778. index=pd.Index(['a'], name='a'),
  779. columns=pd.MultiIndex(
  780. levels=[['timestamp', 'c'], ['b']],
  781. codes=[[0, 1], [0, 0]],
  782. names=[None, 'b']))
  783. assert_frame_equal(result, expected)
  784. def test_stack_timezone_aware_values():
  785. # GH 19420
  786. ts = pd.date_range(freq="D", start="20180101", end="20180103",
  787. tz="America/New_York")
  788. df = pd.DataFrame({"A": ts}, index=["a", "b", "c"])
  789. result = df.stack()
  790. expected = pd.Series(ts,
  791. index=pd.MultiIndex(levels=[['a', 'b', 'c'], ['A']],
  792. codes=[[0, 1, 2], [0, 0, 0]]))
  793. assert_series_equal(result, expected)