melt.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461
  1. # pylint: disable=E1101,E1103
  2. # pylint: disable=W0703,W0622,W0613,W0201
  3. import re
  4. import numpy as np
  5. from pandas.util._decorators import Appender
  6. from pandas.core.dtypes.common import is_extension_type, is_list_like
  7. from pandas.core.dtypes.generic import ABCMultiIndex
  8. from pandas.core.dtypes.missing import notna
  9. from pandas import compat
  10. from pandas.core.arrays import Categorical
  11. from pandas.core.frame import _shared_docs
  12. from pandas.core.indexes.base import Index
  13. from pandas.core.reshape.concat import concat
  14. from pandas.core.tools.numeric import to_numeric
@Appender(_shared_docs['melt'] %
          dict(caller='pd.melt(df, ',
               versionadded="",
               other='DataFrame.melt'))
def melt(frame, id_vars=None, value_vars=None, var_name=None,
         value_name='value', col_level=None):
    # Unpivot `frame` from wide to long format: the `id_vars` columns are
    # repeated for every molten column, while the remaining (or the given
    # `value_vars`) columns are stacked into a single `value_name` column
    # labelled by `var_name`.
    # TODO: what about the existing index?

    # If multiindex, gather names of columns on all levels for checking
    # presence of `id_vars` and `value_vars`
    if isinstance(frame.columns, ABCMultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        elif (isinstance(frame.columns, ABCMultiIndex) and
              not isinstance(id_vars, list)):
            # With MultiIndex columns, ids must be spelled as a list of tuples
            raise ValueError('id_vars must be a list of tuples when columns'
                             ' are a MultiIndex')
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(np.ravel(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError("The following 'id_vars' are not present"
                               " in the DataFrame: {missing}"
                               "".format(missing=list(missing)))
    else:
        id_vars = []

    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif (isinstance(frame.columns, ABCMultiIndex) and
              not isinstance(value_vars, list)):
            raise ValueError('value_vars must be a list of tuples when'
                             ' columns are a MultiIndex')
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(np.ravel(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError("The following 'value_vars' are not present in"
                               " the DataFrame: {missing}"
                               "".format(missing=list(missing)))
        # Restrict to the columns that actually take part in the melt
        frame = frame.loc[:, id_vars + value_vars]
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, ABCMultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                # level names are unique -> use them directly as var columns
                var_name = frame.columns.names
            else:
                # duplicate level names -> synthesize variable_0, variable_1, ...
                var_name = ['variable_{i}'.format(i=i)
                            for i in range(len(frame.columns.names))]
        else:
            var_name = [frame.columns.name if frame.columns.name is not None
                        else 'variable']
    if isinstance(var_name, compat.string_types):
        var_name = [var_name]

    # N rows, K columns to be melted (id columns excluded below)
    N, K = frame.shape
    K -= len(id_vars)

    mdata = {}
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_type(id_data):
            # repeat via concat to keep the extension dtype; the np.tile
            # branch below goes through .values and would coerce to ndarray
            id_data = concat([id_data] * K, ignore_index=True)
        else:
            id_data = np.tile(id_data.values, K)
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    # column-major ravel stacks the molten columns one after another
    mdata[value_name] = frame.values.ravel('F')
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(frame.columns
                                   ._get_level_values(i)).repeat(N)

    return frame._constructor(mdata, columns=mcolumns)
  95. def lreshape(data, groups, dropna=True, label=None):
  96. """
  97. Reshape long-format data to wide. Generalized inverse of DataFrame.pivot
  98. Parameters
  99. ----------
  100. data : DataFrame
  101. groups : dict
  102. {new_name : list_of_columns}
  103. dropna : boolean, default True
  104. Examples
  105. --------
  106. >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
  107. ... 'team': ['Red Sox', 'Yankees'],
  108. ... 'year1': [2007, 2007], 'year2': [2008, 2008]})
  109. >>> data
  110. hr1 hr2 team year1 year2
  111. 0 514 545 Red Sox 2007 2008
  112. 1 573 526 Yankees 2007 2008
  113. >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
  114. team year hr
  115. 0 Red Sox 2007 514
  116. 1 Yankees 2007 573
  117. 2 Red Sox 2008 545
  118. 3 Yankees 2008 526
  119. Returns
  120. -------
  121. reshaped : DataFrame
  122. """
  123. if isinstance(groups, dict):
  124. keys = list(groups.keys())
  125. values = list(groups.values())
  126. else:
  127. keys, values = zip(*groups)
  128. all_cols = list(set.union(*[set(x) for x in values]))
  129. id_cols = list(data.columns.difference(all_cols))
  130. K = len(values[0])
  131. for seq in values:
  132. if len(seq) != K:
  133. raise ValueError('All column lists must be same length')
  134. mdata = {}
  135. pivot_cols = []
  136. for target, names in zip(keys, values):
  137. to_concat = [data[col].values for col in names]
  138. import pandas.core.dtypes.concat as _concat
  139. mdata[target] = _concat._concat_compat(to_concat)
  140. pivot_cols.append(target)
  141. for col in id_cols:
  142. mdata[col] = np.tile(data[col].values, K)
  143. if dropna:
  144. mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
  145. for c in pivot_cols:
  146. mask &= notna(mdata[c])
  147. if not mask.all():
  148. mdata = {k: v[mask] for k, v in compat.iteritems(mdata)}
  149. return data._constructor(mdata, columns=id_cols + pivot_cols)
def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
    r"""
    Wide panel to long format. Less flexible but more user-friendly than melt.

    With stubnames ['A', 'B'], this function expects to find one or more
    group of columns with format
    A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
    You specify what you want to call this suffix in the resulting long format
    with `j` (for example `j='year'`)

    Each row of these wide variables are assumed to be uniquely identified by
    `i` (can be a single column name or a list of column names)

    All remaining variables in the data frame are left intact.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame
    stubnames : str or list-like
        The stub name(s). The wide format variables are assumed to
        start with the stub names.
    i : str or list-like
        Column(s) to use as id variable(s)
    j : str
        The name of the sub-observation variable. What you wish to name your
        suffix in the long format.
    sep : str, default ""
        A character indicating the separation of the variable names
        in the wide format, to be stripped from the names in the long format.
        For example, if your column names are A-suffix1, A-suffix2, you
        can strip the hyphen by specifying `sep='-'`

        .. versionadded:: 0.20.0

    suffix : str, default '\\d+'
        A regular expression capturing the wanted suffixes. '\\d+' captures
        numeric suffixes. Suffixes with no numbers could be specified with the
        negated character class '\\D+'. You can also further disambiguate
        suffixes, for example, if your wide variables are of the form
        A-one, B-two,.., and you have an unrelated column A-rating, you can
        ignore the last one by specifying `suffix='(!?one|two)'`

        .. versionadded:: 0.20.0

        .. versionchanged:: 0.23.0
            When all suffixes are numeric, they are cast to int64/float64.

    Returns
    -------
    DataFrame
        A DataFrame that contains each stub name as a variable, with new index
        (i, j)

    Notes
    -----
    All extra variables are left untouched. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.

    Examples
    --------
    >>> np.random.seed(123)
    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
    ...                    "X" : dict(zip(range(3), np.random.randn(3)))
    ...                   })
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
    ... # doctest: +NORMALIZE_WHITESPACE
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    With multiple id columns

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       birth  famid  ht1  ht2
    0      1      1  2.8  3.4
    1      2      1  2.9  3.8
    2      3      1  2.2  2.9
    3      1      2  2.0  3.2
    4      2      2  1.8  2.8
    5      3      2  1.9  2.4
    6      1      3  2.2  3.3
    7      2      3  2.3  3.4
    8      3      3  2.1  2.9
    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     1    2.8
                2    3.4
          2     1    2.9
                2    3.8
          3     1    2.2
                2    2.9
    2     1     1    2.0
                2    3.2
          2     1    1.8
                2    2.8
          3     1    1.9
                2    2.4
    3     1     1    2.2
                2    3.3
          2     1    2.3
                2    3.4
          3     1    2.1
                2    2.9

    Going from long back to wide just takes some creative use of `unstack`

    >>> w = l.unstack()
    >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
    >>> w.reset_index()
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9

    Less wieldy column names are also handled

    >>> np.random.seed(0)
    >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
    ...                    'A(quarterly)-2011': np.random.rand(3),
    ...                    'B(quarterly)-2010': np.random.rand(3),
    ...                    'B(quarterly)-2011': np.random.rand(3),
    ...                    'X' : np.random.randint(3, size=3)})
    >>> df['id'] = df.index
    >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
       A(quarterly)-2010  A(quarterly)-2011  B(quarterly)-2010 ...
    0           0.548814           0.544883           0.437587 ...
    1           0.715189           0.423655           0.891773 ...
    2           0.602763           0.645894           0.963663 ...
       X  id
    0  0   0
    1  1   1
    2  1   2

    >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], i='id',
    ...                 j='year', sep='-')
    ... # doctest: +NORMALIZE_WHITESPACE
             X  A(quarterly)  B(quarterly)
    id year
    0  2010  0      0.548814      0.437587
    1  2010  1      0.715189      0.891773
    2  2010  1      0.602763      0.963663
    0  2011  0      0.544883      0.383442
    1  2011  1      0.423655      0.791725
    2  2011  1      0.645894      0.528895

    If we have many columns, we could also use a regex to find our
    stubnames and pass that list on to wide_to_long

    >>> stubnames = sorted(
    ...     set([match[0] for match in df.columns.str.findall(
    ...         r'[A-B]\(.*\)').values if match != [] ])
    ... )
    >>> list(stubnames)
    ['A(quarterly)', 'B(quarterly)']

    All of the above examples have integers as suffixes. It is possible to
    have non-integers as suffixes.

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       birth  famid  ht_one  ht_two
    0      1      1     2.8     3.4
    1      2      1     2.9     3.8
    2      3      1     2.2     2.9
    3      1      2     2.0     3.2
    4      2      2     1.8     2.8
    5      3      2     1.9     2.4
    6      1      3     2.2     3.3
    7      2      3     2.3     3.4
    8      3      3     2.1     2.9

    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
    sep='_', suffix='\w')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     one  2.8
                two  3.4
          2     one  2.9
                two  3.8
          3     one  2.2
                two  2.9
    2     1     one  2.0
                two  3.2
          2     one  1.8
                two  2.8
          3     one  1.9
                two  2.4
    3     1     one  2.2
                two  3.3
          2     one  2.3
                two  3.4
          3     one  2.1
                two  2.9
    """
    def get_var_names(df, stub, sep, suffix):
        # Columns matching <stub><sep><suffix> exactly; `stub` and `sep` are
        # taken literally (re.escape) while `suffix` is treated as a regex.
        regex = r'^{stub}{sep}{suffix}$'.format(
            stub=re.escape(stub), sep=re.escape(sep), suffix=suffix)
        pattern = re.compile(regex)
        return [col for col in df.columns if pattern.match(col)]

    def melt_stub(df, stub, i, j, value_vars, sep):
        # Melt one stub's group of columns, then strip the "<stub><sep>"
        # prefix from the variable labels so only the suffix remains in `j`.
        # NOTE(review): `stub.rstrip(sep)` strips any trailing characters
        # drawn from `sep`'s character set, not the literal suffix — fine
        # for single-char separators; verify for multi-char `sep`.
        newdf = melt(df, id_vars=i, value_vars=value_vars,
                     value_name=stub.rstrip(sep), var_name=j)
        newdf[j] = Categorical(newdf[j])
        newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")

        # GH17627 Cast numerics suffixes to int/float
        newdf[j] = to_numeric(newdf[j], errors='ignore')

        return newdf.set_index(i + [j])

    # Normalize `stubnames` to a list
    if not is_list_like(stubnames):
        stubnames = [stubnames]
    else:
        stubnames = list(stubnames)

    if any(col in stubnames for col in df.columns):
        raise ValueError("stubname can't be identical to a column name")

    # Normalize `i` (the id column(s)) to a list
    if not is_list_like(i):
        i = [i]
    else:
        i = list(i)

    if df[i].duplicated().any():
        raise ValueError("the id variables need to uniquely identify each row")

    # One list of matching wide columns per stub
    value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]

    value_vars_flattened = [e for sublist in value_vars for e in sublist]
    id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))

    # Melt each stub separately, then outer-join the per-stub frames on (i, j)
    melted = [melt_stub(df, s, i, j, v, sep)
              for s, v in zip(stubnames, value_vars)]
    melted = melted[0].join(melted[1:], how='outer')

    if len(i) == 1:
        # Single id column: a plain index join suffices
        new = df[id_vars].set_index(i).join(melted)
        return new

    # Multiple id columns: merge on them, then rebuild the (i, j) index
    new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])

    return new