test_melt.py 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718
  1. # -*- coding: utf-8 -*-
  2. # pylint: disable-msg=W0612,E1101
  3. import numpy as np
  4. from numpy import nan
  5. import pytest
  6. from pandas.compat import range
  7. import pandas as pd
  8. from pandas import DataFrame, lreshape, melt, wide_to_long
  9. import pandas.util.testing as tm
  10. class TestMelt(object):
  11. def setup_method(self, method):
  12. self.df = tm.makeTimeDataFrame()[:10]
  13. self.df['id1'] = (self.df['A'] > 0).astype(np.int64)
  14. self.df['id2'] = (self.df['B'] > 0).astype(np.int64)
  15. self.var_name = 'var'
  16. self.value_name = 'val'
  17. self.df1 = pd.DataFrame([[1.067683, -1.110463, 0.20867
  18. ], [-1.321405, 0.368915, -1.055342],
  19. [-0.807333, 0.08298, -0.873361]])
  20. self.df1.columns = [list('ABC'), list('abc')]
  21. self.df1.columns.names = ['CAP', 'low']
  22. def test_top_level_method(self):
  23. result = melt(self.df)
  24. assert result.columns.tolist() == ['variable', 'value']
  25. def test_method_signatures(self):
  26. tm.assert_frame_equal(self.df.melt(),
  27. melt(self.df))
  28. tm.assert_frame_equal(self.df.melt(id_vars=['id1', 'id2'],
  29. value_vars=['A', 'B']),
  30. melt(self.df,
  31. id_vars=['id1', 'id2'],
  32. value_vars=['A', 'B']))
  33. tm.assert_frame_equal(self.df.melt(var_name=self.var_name,
  34. value_name=self.value_name),
  35. melt(self.df,
  36. var_name=self.var_name,
  37. value_name=self.value_name))
  38. tm.assert_frame_equal(self.df1.melt(col_level=0),
  39. melt(self.df1, col_level=0))
  40. def test_default_col_names(self):
  41. result = self.df.melt()
  42. assert result.columns.tolist() == ['variable', 'value']
  43. result1 = self.df.melt(id_vars=['id1'])
  44. assert result1.columns.tolist() == ['id1', 'variable', 'value']
  45. result2 = self.df.melt(id_vars=['id1', 'id2'])
  46. assert result2.columns.tolist() == ['id1', 'id2', 'variable', 'value']
  47. def test_value_vars(self):
  48. result3 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A')
  49. assert len(result3) == 10
  50. result4 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'])
  51. expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2,
  52. 'id2': self.df['id2'].tolist() * 2,
  53. 'variable': ['A'] * 10 + ['B'] * 10,
  54. 'value': (self.df['A'].tolist() +
  55. self.df['B'].tolist())},
  56. columns=['id1', 'id2', 'variable', 'value'])
  57. tm.assert_frame_equal(result4, expected4)
  58. def test_value_vars_types(self):
  59. # GH 15348
  60. expected = DataFrame({'id1': self.df['id1'].tolist() * 2,
  61. 'id2': self.df['id2'].tolist() * 2,
  62. 'variable': ['A'] * 10 + ['B'] * 10,
  63. 'value': (self.df['A'].tolist() +
  64. self.df['B'].tolist())},
  65. columns=['id1', 'id2', 'variable', 'value'])
  66. for type_ in (tuple, list, np.array):
  67. result = self.df.melt(id_vars=['id1', 'id2'],
  68. value_vars=type_(('A', 'B')))
  69. tm.assert_frame_equal(result, expected)
  70. def test_vars_work_with_multiindex(self):
  71. expected = DataFrame({
  72. ('A', 'a'): self.df1[('A', 'a')],
  73. 'CAP': ['B'] * len(self.df1),
  74. 'low': ['b'] * len(self.df1),
  75. 'value': self.df1[('B', 'b')],
  76. }, columns=[('A', 'a'), 'CAP', 'low', 'value'])
  77. result = self.df1.melt(id_vars=[('A', 'a')], value_vars=[('B', 'b')])
  78. tm.assert_frame_equal(result, expected)
  79. def test_single_vars_work_with_multiindex(self):
  80. expected = DataFrame({
  81. 'A': {0: 1.067683, 1: -1.321405, 2: -0.807333},
  82. 'CAP': {0: 'B', 1: 'B', 2: 'B'},
  83. 'value': {0: -1.110463, 1: 0.368915, 2: 0.08298}})
  84. result = self.df1.melt(['A'], ['B'], col_level=0)
  85. tm.assert_frame_equal(result, expected)
  86. def test_tuple_vars_fail_with_multiindex(self):
  87. # melt should fail with an informative error message if
  88. # the columns have a MultiIndex and a tuple is passed
  89. # for id_vars or value_vars.
  90. tuple_a = ('A', 'a')
  91. list_a = [tuple_a]
  92. tuple_b = ('B', 'b')
  93. list_b = [tuple_b]
  94. msg = (r"(id|value)_vars must be a list of tuples when columns are"
  95. " a MultiIndex")
  96. for id_vars, value_vars in ((tuple_a, list_b), (list_a, tuple_b),
  97. (tuple_a, tuple_b)):
  98. with pytest.raises(ValueError, match=msg):
  99. self.df1.melt(id_vars=id_vars, value_vars=value_vars)
  100. def test_custom_var_name(self):
  101. result5 = self.df.melt(var_name=self.var_name)
  102. assert result5.columns.tolist() == ['var', 'value']
  103. result6 = self.df.melt(id_vars=['id1'], var_name=self.var_name)
  104. assert result6.columns.tolist() == ['id1', 'var', 'value']
  105. result7 = self.df.melt(id_vars=['id1', 'id2'], var_name=self.var_name)
  106. assert result7.columns.tolist() == ['id1', 'id2', 'var', 'value']
  107. result8 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
  108. var_name=self.var_name)
  109. assert result8.columns.tolist() == ['id1', 'id2', 'var', 'value']
  110. result9 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
  111. var_name=self.var_name)
  112. expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2,
  113. 'id2': self.df['id2'].tolist() * 2,
  114. self.var_name: ['A'] * 10 + ['B'] * 10,
  115. 'value': (self.df['A'].tolist() +
  116. self.df['B'].tolist())},
  117. columns=['id1', 'id2', self.var_name, 'value'])
  118. tm.assert_frame_equal(result9, expected9)
  119. def test_custom_value_name(self):
  120. result10 = self.df.melt(value_name=self.value_name)
  121. assert result10.columns.tolist() == ['variable', 'val']
  122. result11 = self.df.melt(id_vars=['id1'], value_name=self.value_name)
  123. assert result11.columns.tolist() == ['id1', 'variable', 'val']
  124. result12 = self.df.melt(id_vars=['id1', 'id2'],
  125. value_name=self.value_name)
  126. assert result12.columns.tolist() == ['id1', 'id2', 'variable', 'val']
  127. result13 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
  128. value_name=self.value_name)
  129. assert result13.columns.tolist() == ['id1', 'id2', 'variable', 'val']
  130. result14 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
  131. value_name=self.value_name)
  132. expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2,
  133. 'id2': self.df['id2'].tolist() * 2,
  134. 'variable': ['A'] * 10 + ['B'] * 10,
  135. self.value_name: (self.df['A'].tolist() +
  136. self.df['B'].tolist())},
  137. columns=['id1', 'id2', 'variable',
  138. self.value_name])
  139. tm.assert_frame_equal(result14, expected14)
  140. def test_custom_var_and_value_name(self):
  141. result15 = self.df.melt(var_name=self.var_name,
  142. value_name=self.value_name)
  143. assert result15.columns.tolist() == ['var', 'val']
  144. result16 = self.df.melt(id_vars=['id1'], var_name=self.var_name,
  145. value_name=self.value_name)
  146. assert result16.columns.tolist() == ['id1', 'var', 'val']
  147. result17 = self.df.melt(id_vars=['id1', 'id2'],
  148. var_name=self.var_name,
  149. value_name=self.value_name)
  150. assert result17.columns.tolist() == ['id1', 'id2', 'var', 'val']
  151. result18 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
  152. var_name=self.var_name,
  153. value_name=self.value_name)
  154. assert result18.columns.tolist() == ['id1', 'id2', 'var', 'val']
  155. result19 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
  156. var_name=self.var_name,
  157. value_name=self.value_name)
  158. expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2,
  159. 'id2': self.df['id2'].tolist() * 2,
  160. self.var_name: ['A'] * 10 + ['B'] * 10,
  161. self.value_name: (self.df['A'].tolist() +
  162. self.df['B'].tolist())},
  163. columns=['id1', 'id2', self.var_name,
  164. self.value_name])
  165. tm.assert_frame_equal(result19, expected19)
  166. df20 = self.df.copy()
  167. df20.columns.name = 'foo'
  168. result20 = df20.melt()
  169. assert result20.columns.tolist() == ['foo', 'value']
  170. def test_col_level(self):
  171. res1 = self.df1.melt(col_level=0)
  172. res2 = self.df1.melt(col_level='CAP')
  173. assert res1.columns.tolist() == ['CAP', 'value']
  174. assert res2.columns.tolist() == ['CAP', 'value']
  175. def test_multiindex(self):
  176. res = self.df1.melt()
  177. assert res.columns.tolist() == ['CAP', 'low', 'value']
  178. @pytest.mark.parametrize("col", [
  179. pd.Series(pd.date_range('2010', periods=5, tz='US/Pacific')),
  180. pd.Series(["a", "b", "c", "a", "d"], dtype="category"),
  181. pd.Series([0, 1, 0, 0, 0])])
  182. def test_pandas_dtypes(self, col):
  183. # GH 15785
  184. df = DataFrame({'klass': range(5),
  185. 'col': col,
  186. 'attr1': [1, 0, 0, 0, 0],
  187. 'attr2': col})
  188. expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col],
  189. ignore_index=True)
  190. result = melt(df, id_vars=['klass', 'col'], var_name='attribute',
  191. value_name='value')
  192. expected = DataFrame({0: list(range(5)) * 2,
  193. 1: pd.concat([col] * 2, ignore_index=True),
  194. 2: ['attr1'] * 5 + ['attr2'] * 5,
  195. 3: expected_value})
  196. expected.columns = ['klass', 'col', 'attribute', 'value']
  197. tm.assert_frame_equal(result, expected)
  198. def test_melt_missing_columns_raises(self):
  199. # GH-23575
  200. # This test is to ensure that pandas raises an error if melting is
  201. # attempted with column names absent from the dataframe
  202. # Generate data
  203. df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd'))
  204. # Try to melt with missing `value_vars` column name
  205. msg = "The following '{Var}' are not present in the DataFrame: {Col}"
  206. with pytest.raises(
  207. KeyError,
  208. match=msg.format(Var='value_vars', Col="\\['C'\\]")):
  209. df.melt(['a', 'b'], ['C', 'd'])
  210. # Try to melt with missing `id_vars` column name
  211. with pytest.raises(
  212. KeyError,
  213. match=msg.format(Var='id_vars', Col="\\['A'\\]")):
  214. df.melt(['A', 'b'], ['c', 'd'])
  215. # Multiple missing
  216. with pytest.raises(
  217. KeyError,
  218. match=msg.format(Var='id_vars',
  219. Col="\\['not_here', 'or_there'\\]")):
  220. df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd'])
  221. # Multiindex melt fails if column is missing from multilevel melt
  222. multi = df.copy()
  223. multi.columns = [list('ABCD'), list('abcd')]
  224. with pytest.raises(
  225. KeyError,
  226. match=msg.format(Var='id_vars',
  227. Col="\\['E'\\]")):
  228. multi.melt([('E', 'a')], [('B', 'b')])
  229. # Multiindex fails if column is missing from single level melt
  230. with pytest.raises(
  231. KeyError,
  232. match=msg.format(Var='value_vars',
  233. Col="\\['F'\\]")):
  234. multi.melt(['A'], ['F'], col_level=0)
  235. class TestLreshape(object):
  236. def test_pairs(self):
  237. data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
  238. '11jan2009'],
  239. 'birthwt': [1766, 3301, 1454, 3139, 4133],
  240. 'id': [101, 102, 103, 104, 105],
  241. 'sex': ['Male', 'Female', 'Female', 'Female', 'Female'],
  242. 'visitdt1': ['11jan2009', '22dec2008', '04jan2009',
  243. '29dec2008', '20jan2009'],
  244. 'visitdt2':
  245. ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'],
  246. 'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'],
  247. 'wt1': [1823, 3338, 1549, 3298, 4306],
  248. 'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0],
  249. 'wt3': [2293.0, nan, nan, 3377.0, 4805.0]}
  250. df = DataFrame(data)
  251. spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)],
  252. 'wt': ['wt%d' % i for i in range(1, 4)]}
  253. result = lreshape(df, spec)
  254. exp_data = {'birthdt':
  255. ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
  256. '11jan2009', '08jan2009', '30dec2008', '21dec2008',
  257. '11jan2009', '08jan2009', '21dec2008', '11jan2009'],
  258. 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139,
  259. 4133, 1766, 3139, 4133],
  260. 'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101,
  261. 104, 105],
  262. 'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
  263. 'Male', 'Female', 'Female', 'Female', 'Male',
  264. 'Female', 'Female'],
  265. 'visitdt': ['11jan2009', '22dec2008', '04jan2009',
  266. '29dec2008', '20jan2009', '21jan2009',
  267. '22jan2009', '31dec2008', '03feb2009',
  268. '05feb2009', '02jan2009', '15feb2009'],
  269. 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0,
  270. 1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]}
  271. exp = DataFrame(exp_data, columns=result.columns)
  272. tm.assert_frame_equal(result, exp)
  273. result = lreshape(df, spec, dropna=False)
  274. exp_data = {'birthdt':
  275. ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
  276. '11jan2009', '08jan2009', '20dec2008', '30dec2008',
  277. '21dec2008', '11jan2009', '08jan2009', '20dec2008',
  278. '30dec2008', '21dec2008', '11jan2009'],
  279. 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454,
  280. 3139, 4133, 1766, 3301, 1454, 3139, 4133],
  281. 'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105,
  282. 101, 102, 103, 104, 105],
  283. 'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
  284. 'Male', 'Female', 'Female', 'Female', 'Female',
  285. 'Male', 'Female', 'Female', 'Female', 'Female'],
  286. 'visitdt': ['11jan2009', '22dec2008', '04jan2009',
  287. '29dec2008', '20jan2009', '21jan2009', nan,
  288. '22jan2009', '31dec2008', '03feb2009',
  289. '05feb2009', nan, nan, '02jan2009',
  290. '15feb2009'],
  291. 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan,
  292. 1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0,
  293. 4805.0]}
  294. exp = DataFrame(exp_data, columns=result.columns)
  295. tm.assert_frame_equal(result, exp)
  296. spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)],
  297. 'wt': ['wt%d' % i for i in range(1, 4)]}
  298. msg = "All column lists must be same length"
  299. with pytest.raises(ValueError, match=msg):
  300. lreshape(df, spec)
  301. class TestWideToLong(object):
  302. def test_simple(self):
  303. np.random.seed(123)
  304. x = np.random.randn(3)
  305. df = pd.DataFrame({"A1970": {0: "a",
  306. 1: "b",
  307. 2: "c"},
  308. "A1980": {0: "d",
  309. 1: "e",
  310. 2: "f"},
  311. "B1970": {0: 2.5,
  312. 1: 1.2,
  313. 2: .7},
  314. "B1980": {0: 3.2,
  315. 1: 1.3,
  316. 2: .1},
  317. "X": dict(zip(
  318. range(3), x))})
  319. df["id"] = df.index
  320. exp_data = {"X": x.tolist() + x.tolist(),
  321. "A": ['a', 'b', 'c', 'd', 'e', 'f'],
  322. "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
  323. "year": [1970, 1970, 1970, 1980, 1980, 1980],
  324. "id": [0, 1, 2, 0, 1, 2]}
  325. expected = DataFrame(exp_data)
  326. expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
  327. result = wide_to_long(df, ["A", "B"], i="id", j="year")
  328. tm.assert_frame_equal(result, expected)
  329. def test_stubs(self):
  330. # GH9204
  331. df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
  332. df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2']
  333. stubs = ['inc', 'edu']
  334. # TODO: unused?
  335. df_long = pd.wide_to_long(df, stubs, i='id', j='age') # noqa
  336. assert stubs == ['inc', 'edu']
  337. def test_separating_character(self):
  338. # GH14779
  339. np.random.seed(123)
  340. x = np.random.randn(3)
  341. df = pd.DataFrame({"A.1970": {0: "a",
  342. 1: "b",
  343. 2: "c"},
  344. "A.1980": {0: "d",
  345. 1: "e",
  346. 2: "f"},
  347. "B.1970": {0: 2.5,
  348. 1: 1.2,
  349. 2: .7},
  350. "B.1980": {0: 3.2,
  351. 1: 1.3,
  352. 2: .1},
  353. "X": dict(zip(
  354. range(3), x))})
  355. df["id"] = df.index
  356. exp_data = {"X": x.tolist() + x.tolist(),
  357. "A": ['a', 'b', 'c', 'd', 'e', 'f'],
  358. "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
  359. "year": [1970, 1970, 1970, 1980, 1980, 1980],
  360. "id": [0, 1, 2, 0, 1, 2]}
  361. expected = DataFrame(exp_data)
  362. expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
  363. result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
  364. tm.assert_frame_equal(result, expected)
  365. def test_escapable_characters(self):
  366. np.random.seed(123)
  367. x = np.random.randn(3)
  368. df = pd.DataFrame({"A(quarterly)1970": {0: "a",
  369. 1: "b",
  370. 2: "c"},
  371. "A(quarterly)1980": {0: "d",
  372. 1: "e",
  373. 2: "f"},
  374. "B(quarterly)1970": {0: 2.5,
  375. 1: 1.2,
  376. 2: .7},
  377. "B(quarterly)1980": {0: 3.2,
  378. 1: 1.3,
  379. 2: .1},
  380. "X": dict(zip(
  381. range(3), x))})
  382. df["id"] = df.index
  383. exp_data = {"X": x.tolist() + x.tolist(),
  384. "A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'],
  385. "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
  386. "year": [1970, 1970, 1970, 1980, 1980, 1980],
  387. "id": [0, 1, 2, 0, 1, 2]}
  388. expected = DataFrame(exp_data)
  389. expected = expected.set_index(
  390. ['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]]
  391. result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"],
  392. i="id", j="year")
  393. tm.assert_frame_equal(result, expected)
  394. def test_unbalanced(self):
  395. # test that we can have a varying amount of time variables
  396. df = pd.DataFrame({'A2010': [1.0, 2.0],
  397. 'A2011': [3.0, 4.0],
  398. 'B2010': [5.0, 6.0],
  399. 'X': ['X1', 'X2']})
  400. df['id'] = df.index
  401. exp_data = {'X': ['X1', 'X1', 'X2', 'X2'],
  402. 'A': [1.0, 3.0, 2.0, 4.0],
  403. 'B': [5.0, np.nan, 6.0, np.nan],
  404. 'id': [0, 0, 1, 1],
  405. 'year': [2010, 2011, 2010, 2011]}
  406. expected = pd.DataFrame(exp_data)
  407. expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
  408. result = wide_to_long(df, ['A', 'B'], i='id', j='year')
  409. tm.assert_frame_equal(result, expected)
  410. def test_character_overlap(self):
  411. # Test we handle overlapping characters in both id_vars and value_vars
  412. df = pd.DataFrame({
  413. 'A11': ['a11', 'a22', 'a33'],
  414. 'A12': ['a21', 'a22', 'a23'],
  415. 'B11': ['b11', 'b12', 'b13'],
  416. 'B12': ['b21', 'b22', 'b23'],
  417. 'BB11': [1, 2, 3],
  418. 'BB12': [4, 5, 6],
  419. 'BBBX': [91, 92, 93],
  420. 'BBBZ': [91, 92, 93]
  421. })
  422. df['id'] = df.index
  423. expected = pd.DataFrame({
  424. 'BBBX': [91, 92, 93, 91, 92, 93],
  425. 'BBBZ': [91, 92, 93, 91, 92, 93],
  426. 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
  427. 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
  428. 'BB': [1, 2, 3, 4, 5, 6],
  429. 'id': [0, 1, 2, 0, 1, 2],
  430. 'year': [11, 11, 11, 12, 12, 12]})
  431. expected = expected.set_index(['id', 'year'])[
  432. ['BBBX', 'BBBZ', 'A', 'B', 'BB']]
  433. result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
  434. tm.assert_frame_equal(result.sort_index(axis=1),
  435. expected.sort_index(axis=1))
  436. def test_invalid_separator(self):
  437. # if an invalid separator is supplied a empty data frame is returned
  438. sep = 'nope!'
  439. df = pd.DataFrame({'A2010': [1.0, 2.0],
  440. 'A2011': [3.0, 4.0],
  441. 'B2010': [5.0, 6.0],
  442. 'X': ['X1', 'X2']})
  443. df['id'] = df.index
  444. exp_data = {'X': '',
  445. 'A2010': [],
  446. 'A2011': [],
  447. 'B2010': [],
  448. 'id': [],
  449. 'year': [],
  450. 'A': [],
  451. 'B': []}
  452. expected = pd.DataFrame(exp_data).astype({'year': 'int'})
  453. expected = expected.set_index(['id', 'year'])[[
  454. 'X', 'A2010', 'A2011', 'B2010', 'A', 'B']]
  455. expected.index.set_levels([0, 1], level=0, inplace=True)
  456. result = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep)
  457. tm.assert_frame_equal(result.sort_index(axis=1),
  458. expected.sort_index(axis=1))
  459. def test_num_string_disambiguation(self):
  460. # Test that we can disambiguate number value_vars from
  461. # string value_vars
  462. df = pd.DataFrame({
  463. 'A11': ['a11', 'a22', 'a33'],
  464. 'A12': ['a21', 'a22', 'a23'],
  465. 'B11': ['b11', 'b12', 'b13'],
  466. 'B12': ['b21', 'b22', 'b23'],
  467. 'BB11': [1, 2, 3],
  468. 'BB12': [4, 5, 6],
  469. 'Arating': [91, 92, 93],
  470. 'Arating_old': [91, 92, 93]
  471. })
  472. df['id'] = df.index
  473. expected = pd.DataFrame({
  474. 'Arating': [91, 92, 93, 91, 92, 93],
  475. 'Arating_old': [91, 92, 93, 91, 92, 93],
  476. 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
  477. 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
  478. 'BB': [1, 2, 3, 4, 5, 6],
  479. 'id': [0, 1, 2, 0, 1, 2],
  480. 'year': [11, 11, 11, 12, 12, 12]})
  481. expected = expected.set_index(['id', 'year'])[
  482. ['Arating', 'Arating_old', 'A', 'B', 'BB']]
  483. result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
  484. tm.assert_frame_equal(result.sort_index(axis=1),
  485. expected.sort_index(axis=1))
  486. def test_invalid_suffixtype(self):
  487. # If all stubs names end with a string, but a numeric suffix is
  488. # assumed, an empty data frame is returned
  489. df = pd.DataFrame({'Aone': [1.0, 2.0],
  490. 'Atwo': [3.0, 4.0],
  491. 'Bone': [5.0, 6.0],
  492. 'X': ['X1', 'X2']})
  493. df['id'] = df.index
  494. exp_data = {'X': '',
  495. 'Aone': [],
  496. 'Atwo': [],
  497. 'Bone': [],
  498. 'id': [],
  499. 'year': [],
  500. 'A': [],
  501. 'B': []}
  502. expected = pd.DataFrame(exp_data).astype({'year': 'int'})
  503. expected = expected.set_index(['id', 'year'])
  504. expected.index.set_levels([0, 1], level=0, inplace=True)
  505. result = wide_to_long(df, ['A', 'B'], i='id', j='year')
  506. tm.assert_frame_equal(result.sort_index(axis=1),
  507. expected.sort_index(axis=1))
  508. def test_multiple_id_columns(self):
  509. # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
  510. df = pd.DataFrame({
  511. 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
  512. 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
  513. 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
  514. 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
  515. })
  516. expected = pd.DataFrame({
  517. 'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8,
  518. 2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9],
  519. 'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
  520. 'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
  521. 'age': [1, 2, 1, 2, 1, 2, 1, 2, 1,
  522. 2, 1, 2, 1, 2, 1, 2, 1, 2]
  523. })
  524. expected = expected.set_index(['famid', 'birth', 'age'])[['ht']]
  525. result = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age')
  526. tm.assert_frame_equal(result, expected)
  527. def test_non_unique_idvars(self):
  528. # GH16382
  529. # Raise an error message if non unique id vars (i) are passed
  530. df = pd.DataFrame({
  531. 'A_A1': [1, 2, 3, 4, 5],
  532. 'B_B1': [1, 2, 3, 4, 5],
  533. 'x': [1, 1, 1, 1, 1]
  534. })
  535. msg = "the id variables need to uniquely identify each row"
  536. with pytest.raises(ValueError, match=msg):
  537. wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname')
  538. def test_cast_j_int(self):
  539. df = pd.DataFrame({
  540. 'actor_1': ['CCH Pounder', 'Johnny Depp', 'Christoph Waltz'],
  541. 'actor_2': ['Joel David Moore', 'Orlando Bloom', 'Rory Kinnear'],
  542. 'actor_fb_likes_1': [1000.0, 40000.0, 11000.0],
  543. 'actor_fb_likes_2': [936.0, 5000.0, 393.0],
  544. 'title': ['Avatar', "Pirates of the Caribbean", 'Spectre']})
  545. expected = pd.DataFrame({
  546. 'actor': ['CCH Pounder',
  547. 'Johnny Depp',
  548. 'Christoph Waltz',
  549. 'Joel David Moore',
  550. 'Orlando Bloom',
  551. 'Rory Kinnear'],
  552. 'actor_fb_likes': [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
  553. 'num': [1, 1, 1, 2, 2, 2],
  554. 'title': ['Avatar',
  555. 'Pirates of the Caribbean',
  556. 'Spectre',
  557. 'Avatar',
  558. 'Pirates of the Caribbean',
  559. 'Spectre']}).set_index(['title', 'num'])
  560. result = wide_to_long(df, ['actor', 'actor_fb_likes'],
  561. i='title', j='num', sep='_')
  562. tm.assert_frame_equal(result, expected)
  563. def test_identical_stubnames(self):
  564. df = pd.DataFrame({'A2010': [1.0, 2.0],
  565. 'A2011': [3.0, 4.0],
  566. 'B2010': [5.0, 6.0],
  567. 'A': ['X1', 'X2']})
  568. msg = "stubname can't be identical to a column name"
  569. with pytest.raises(ValueError, match=msg):
  570. wide_to_long(df, ['A', 'B'], i='A', j='colname')
  571. def test_nonnumeric_suffix(self):
  572. df = pd.DataFrame({'treatment_placebo': [1.0, 2.0],
  573. 'treatment_test': [3.0, 4.0],
  574. 'result_placebo': [5.0, 6.0],
  575. 'A': ['X1', 'X2']})
  576. expected = pd.DataFrame({
  577. 'A': ['X1', 'X1', 'X2', 'X2'],
  578. 'colname': ['placebo', 'test', 'placebo', 'test'],
  579. 'result': [5.0, np.nan, 6.0, np.nan],
  580. 'treatment': [1.0, 3.0, 2.0, 4.0]})
  581. expected = expected.set_index(['A', 'colname'])
  582. result = wide_to_long(df, ['result', 'treatment'],
  583. i='A', j='colname', suffix='[a-z]+', sep='_')
  584. tm.assert_frame_equal(result, expected)
  585. def test_mixed_type_suffix(self):
  586. df = pd.DataFrame({
  587. 'A': ['X1', 'X2'],
  588. 'result_1': [0, 9],
  589. 'result_foo': [5.0, 6.0],
  590. 'treatment_1': [1.0, 2.0],
  591. 'treatment_foo': [3.0, 4.0]})
  592. expected = pd.DataFrame({
  593. 'A': ['X1', 'X2', 'X1', 'X2'],
  594. 'colname': ['1', '1', 'foo', 'foo'],
  595. 'result': [0.0, 9.0, 5.0, 6.0],
  596. 'treatment': [1.0, 2.0, 3.0, 4.0]}).set_index(['A', 'colname'])
  597. result = wide_to_long(df, ['result', 'treatment'],
  598. i='A', j='colname', suffix='.+', sep='_')
  599. tm.assert_frame_equal(result, expected)
  600. def test_float_suffix(self):
  601. df = pd.DataFrame({
  602. 'treatment_1.1': [1.0, 2.0],
  603. 'treatment_2.1': [3.0, 4.0],
  604. 'result_1.2': [5.0, 6.0],
  605. 'result_1': [0, 9],
  606. 'A': ['X1', 'X2']})
  607. expected = pd.DataFrame({
  608. 'A': ['X1', 'X1', 'X1', 'X1', 'X2', 'X2', 'X2', 'X2'],
  609. 'colname': [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1],
  610. 'result': [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan],
  611. 'treatment': [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0]})
  612. expected = expected.set_index(['A', 'colname'])
  613. result = wide_to_long(df, ['result', 'treatment'],
  614. i='A', j='colname', suffix='[0-9.]+', sep='_')
  615. tm.assert_frame_equal(result, expected)
  616. def test_col_substring_of_stubname(self):
  617. # GH22468
  618. # Don't raise ValueError when a column name is a substring
  619. # of a stubname that's been passed as a string
  620. wide_data = {'node_id': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
  621. 'A': {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81},
  622. 'PA0': {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6},
  623. 'PA1': {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67},
  624. 'PA3': {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67}
  625. }
  626. wide_df = pd.DataFrame.from_dict(wide_data)
  627. expected = pd.wide_to_long(wide_df,
  628. stubnames=['PA'],
  629. i=['node_id', 'A'],
  630. j='time')
  631. result = pd.wide_to_long(wide_df,
  632. stubnames='PA',
  633. i=['node_id', 'A'],
  634. j='time')
  635. tm.assert_frame_equal(result, expected)