reshape.py

# pylint: disable=E1101,E1103
# pylint: disable=W0703,W0622,W0613,W0201

from functools import partial
import itertools

import numpy as np

from pandas._libs import algos as _algos, reshape as _reshape
from pandas._libs.sparse import IntIndex

from pandas.compat import PY2, range, text_type, u, zip

from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.common import (
    ensure_platform_int, is_bool_dtype, is_extension_array_dtype,
    is_integer_dtype, is_list_like, is_object_dtype, needs_i8_conversion)
from pandas.core.dtypes.missing import notna

from pandas import compat
import pandas.core.algorithms as algos
from pandas.core.arrays import SparseArray
from pandas.core.arrays.categorical import _factorize_from_iterable
from pandas.core.frame import DataFrame
from pandas.core.index import Index, MultiIndex
from pandas.core.internals.arrays import extract_array
from pandas.core.series import Series
from pandas.core.sorting import (
    compress_group_index, decons_obs_group_ids, get_compressed_ids,
    get_group_index)


class _Unstacker(object):
    """
    Helper class to unstack data / pivot with multi-level index

    Parameters
    ----------
    values : ndarray
        Values of DataFrame to "Unstack"
    index : object
        Pandas ``Index``
    level : int or str, default last level
        Level to "unstack". Accepts a name for the level.
    value_columns : Index, optional
        Pandas ``Index`` or ``MultiIndex`` object if unstacking a DataFrame
    fill_value : scalar, optional
        Default value to fill in missing values if subgroups do not have the
        same set of labels. By default, missing values will be replaced with
        the default fill value for that data type, NaN for float, NaT for
        datetimelike, etc. For integer types, by default data will be
        converted to float and missing values will be set to NaN.
    constructor : object
        Pandas ``DataFrame`` or subclass used to create unstacked
        response. If None, DataFrame or SparseDataFrame will be used.

    Examples
    --------
    >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
    ...                                    ('two', 'a'), ('two', 'b')])
    >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
    >>> s
    one  a    1
         b    2
    two  a    3
         b    4
    dtype: int64

    >>> s.unstack(level=-1)
         a  b
    one  1  2
    two  3  4

    >>> s.unstack(level=0)
       one  two
    a    1    3
    b    2    4

    Returns
    -------
    unstacked : DataFrame
    """

    def __init__(self, values, index, level=-1, value_columns=None,
                 fill_value=None, constructor=None):

        if values.ndim == 1:
            values = values[:, np.newaxis]
        self.values = values
        self.value_columns = value_columns
        self.fill_value = fill_value

        if constructor is None:
            constructor = DataFrame
        self.constructor = constructor

        if value_columns is None and values.shape[1] != 1:  # pragma: no cover
            raise ValueError('must pass column labels for multi-column data')

        self.index = index.remove_unused_levels()

        self.level = self.index._get_level_number(level)

        # when index includes `nan`, need to lift levels/strides by 1
        self.lift = 1 if -1 in self.index.codes[self.level] else 0

        self.new_index_levels = list(self.index.levels)
        self.new_index_names = list(self.index.names)

        self.removed_name = self.new_index_names.pop(self.level)
        self.removed_level = self.new_index_levels.pop(self.level)
        self.removed_level_full = index.levels[self.level]

        # Bug fix GH 20601
        # If the data frame is too big, the number of unique index
        # combinations can cause int32 overflow on Windows environments.
        # We want to check and raise an error before this happens.
        num_rows = np.max([index_level.size for index_level
                           in self.new_index_levels])
        num_columns = self.removed_level.size

        # GH20601: This forces an overflow if the number of cells is too high.
        num_cells = np.multiply(num_rows, num_columns, dtype=np.int32)

        if num_rows > 0 and num_columns > 0 and num_cells <= 0:
            raise ValueError('Unstacked DataFrame is too big, '
                             'causing int32 overflow')

        self._make_sorted_values_labels()
        self._make_selectors()

    def _make_sorted_values_labels(self):
        v = self.level

        codes = list(self.index.codes)
        levs = list(self.index.levels)
        to_sort = codes[:v] + codes[v + 1:] + [codes[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = _algos.groupsort_indexer(comp_index, ngroups)[0]
        indexer = ensure_platform_int(indexer)

        self.sorted_values = algos.take_nd(self.values, indexer, axis=0)
        self.sorted_labels = [l.take(indexer) for l in to_sort]

    def _make_selectors(self):
        new_levels = self.new_index_levels

        # make the mask
        remaining_labels = self.sorted_labels[:-1]
        level_sizes = [len(x) for x in new_levels]

        comp_index, obs_ids = get_compressed_ids(remaining_labels,
                                                 level_sizes)
        ngroups = len(obs_ids)

        comp_index = ensure_platform_int(comp_index)
        stride = self.index.levshape[self.level] + self.lift
        self.full_shape = ngroups, stride

        selector = self.sorted_labels[-1] + stride * comp_index + self.lift
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        if mask.sum() < len(self.index):
            raise ValueError('Index contains duplicate entries, '
                             'cannot reshape')

        self.group_index = comp_index
        self.mask = mask
        self.unique_groups = obs_ids
        self.compressor = comp_index.searchsorted(np.arange(ngroups))

    def get_result(self):
        values, _ = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        return self.constructor(values, index=index, columns=columns)

    def get_new_values(self):
        values = self.values

        # place the values
        length, width = self.full_shape
        stride = values.shape[1]
        result_width = width * stride
        result_shape = (length, result_width)
        mask = self.mask
        mask_all = mask.all()

        # we can simply reshape if we don't have a mask
        if mask_all and len(values):
            new_values = (self.sorted_values
                              .reshape(length, width, stride)
                              .swapaxes(1, 2)
                              .reshape(result_shape)
                          )
            new_mask = np.ones(result_shape, dtype=bool)
            return new_values, new_mask

        # if our mask is all True, then we can use our existing dtype
        if mask_all:
            dtype = values.dtype
            new_values = np.empty(result_shape, dtype=dtype)
        else:
            dtype, fill_value = maybe_promote(values.dtype, self.fill_value)
            new_values = np.empty(result_shape, dtype=dtype)
            new_values.fill(fill_value)

        new_mask = np.zeros(result_shape, dtype=bool)

        name = np.dtype(dtype).name
        sorted_values = self.sorted_values

        # we need to convert to a basic dtype
        # and possibly coerce an input to our output dtype
        # e.g. ints -> floats
        if needs_i8_conversion(values):
            sorted_values = sorted_values.view('i8')
            new_values = new_values.view('i8')
            name = 'int64'
        elif is_bool_dtype(values):
            sorted_values = sorted_values.astype('object')
            new_values = new_values.astype('object')
            name = 'object'
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # fill in our values & mask
        f = getattr(_reshape, "unstack_{name}".format(name=name))
        f(sorted_values,
          mask.view('u1'),
          stride,
          length,
          width,
          new_values,
          new_mask.view('u1'))

        # reconstruct dtype if needed
        if needs_i8_conversion(values):
            new_values = new_values.view(values.dtype)

        return new_values, new_mask

    def get_new_columns(self):
        if self.value_columns is None:
            if self.lift == 0:
                return self.removed_level

            lev = self.removed_level
            return lev.insert(0, lev._na_value)

        stride = len(self.removed_level) + self.lift
        width = len(self.value_columns)
        propagator = np.repeat(np.arange(width), stride)
        if isinstance(self.value_columns, MultiIndex):
            new_levels = self.value_columns.levels + (
                self.removed_level_full,)
            new_names = self.value_columns.names + (self.removed_name,)

            new_codes = [lab.take(propagator)
                         for lab in self.value_columns.codes]
        else:
            new_levels = [self.value_columns, self.removed_level_full]
            new_names = [self.value_columns.name, self.removed_name]
            new_codes = [propagator]

        # The two indices differ only if the unstacked level had unused items:
        if len(self.removed_level_full) != len(self.removed_level):
            # In this case, we remap the new codes to the original level:
            repeater = self.removed_level_full.get_indexer(self.removed_level)
            if self.lift:
                repeater = np.insert(repeater, 0, -1)
        else:
            # Otherwise, we just use each level item exactly once:
            repeater = np.arange(stride) - self.lift

        # The entire level is then just a repetition of the single chunk:
        new_codes.append(np.tile(repeater, width))
        return MultiIndex(levels=new_levels, codes=new_codes,
                          names=new_names, verify_integrity=False)

    def get_new_index(self):
        result_codes = [lab.take(self.compressor)
                        for lab in self.sorted_labels[:-1]]

        # construct the new index
        if len(self.new_index_levels) == 1:
            lev, lab = self.new_index_levels[0], result_codes[0]
            if (lab == -1).any():
                lev = lev.insert(len(lev), lev._na_value)
            return lev.take(lab)

        return MultiIndex(levels=self.new_index_levels, codes=result_codes,
                          names=self.new_index_names, verify_integrity=False)
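

# Hedged usage sketch (hypothetical helper, not part of the original
# module): drives _Unstacker directly on the Series from the class
# docstring above, assuming the 0.24-era signature shown there. Wrapped in
# a function so nothing runs at import time.
def _example_unstacker():  # pragma: no cover
    import pandas as pd

    index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
                                       ('two', 'a'), ('two', 'b')])
    s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
    unstacker = _Unstacker(s.values, s.index, level=-1,
                           constructor=DataFrame)
    # Equivalent to s.unstack(level=-1): rows 'one'/'two', columns 'a'/'b'.
    return unstacker.get_result()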


def _unstack_multiple(data, clocs, fill_value=None):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    ccodes = [index.codes[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rcodes = [index.codes[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(ccodes, shape, sort=False, xnull=False)

    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
    recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes,
                                        xnull=False)

    if rlocs == []:
        # Everything is in clocs, so the dummy df has a regular index
        dummy_index = Index(obs_ids, name='__placeholder__')
    else:
        dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                                 codes=rcodes + [comp_ids],
                                 names=rnames + ['__placeholder__'],
                                 verify_integrity=False)

    if isinstance(data, Series):
        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
        new_levels = clevels
        new_names = cnames
        new_codes = recons_codes
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val)
                clocs = [v if i > v else v - 1 for v in clocs]

            return result

        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_codes = [unstcols.codes[0]]
        for rec in recons_codes:
            new_codes.append(rec.take(unstcols.codes[-1]))

    new_columns = MultiIndex(levels=new_levels, codes=new_codes,
                             names=new_names, verify_integrity=False)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
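

# Hedged sketch (hypothetical helper, not part of the original module):
# _unstack_multiple backs the list-of-levels form of ``unstack``; the same
# behaviour is reachable from the public Series API.
def _example_unstack_multiple():  # pragma: no cover
    import pandas as pd

    index = pd.MultiIndex.from_product([['x', 'y'], ['a', 'b'], [1, 2]],
                                       names=['L0', 'L1', 'L2'])
    s = pd.Series(range(8), index=index)
    # Both named levels move to the columns, leaving 'L0' as the row index.
    return s.unstack(['L1', 'L2'])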


def unstack(obj, level, fill_value=None):
    if isinstance(level, (tuple, list)):
        if len(level) != 1:
            # _unstack_multiple only handles MultiIndexes,
            # and isn't needed for a single level
            return _unstack_multiple(obj, level, fill_value=fill_value)
        else:
            level = level[0]

    if isinstance(obj, DataFrame):
        if isinstance(obj.index, MultiIndex):
            return _unstack_frame(obj, level, fill_value=fill_value)
        else:
            return obj.T.stack(dropna=False)
    else:
        if is_extension_array_dtype(obj.dtype):
            return _unstack_extension_series(obj, level, fill_value)
        unstacker = _Unstacker(obj.values, obj.index, level=level,
                               fill_value=fill_value,
                               constructor=obj._constructor_expanddim)
        return unstacker.get_result()
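

# Hedged sketch (hypothetical helper, not part of the original module) of
# the fill_value path documented on _Unstacker: without a fill_value, a
# missing cell forces int64 data to float64 via maybe_promote; with
# fill_value=0 the integer dtype survives.
def _example_unstack_fill_value():  # pragma: no cover
    import pandas as pd

    index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
                                       ('two', 'a')])
    s = pd.Series([1, 2, 3], index=index)
    s.unstack()                     # float64; cell ('two', 'b') is NaN
    return s.unstack(fill_value=0)  # int64; missing cell filled with 0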


def _unstack_frame(obj, level, fill_value=None):
    if obj._is_mixed_type:
        unstacker = partial(_Unstacker, index=obj.index,
                            level=level, fill_value=fill_value)
        blocks = obj._data.unstack(unstacker,
                                   fill_value=fill_value)
        return obj._constructor(blocks)
    else:
        unstacker = _Unstacker(obj.values, obj.index, level=level,
                               value_columns=obj.columns,
                               fill_value=fill_value,
                               constructor=obj._constructor)
        return unstacker.get_result()


def _unstack_extension_series(series, level, fill_value):
    """
    Unstack an ExtensionArray-backed Series.

    The ExtensionDtype is preserved.

    Parameters
    ----------
    series : Series
        A Series with an ExtensionArray for values
    level : Any
        The level name or number.
    fill_value : Any
        The user-level (not physical storage) fill value to use for
        missing values introduced by the reshape. Passed to
        ``series.values.take``.

    Returns
    -------
    DataFrame
        Each column of the DataFrame will have the same dtype as
        the input Series.
    """
    # Implementation note: the basic idea is to
    # 1. Do a regular unstack on a dummy array of integers
    # 2. Followup with a columnwise take.
    # We use the dummy take to discover newly-created missing values
    # introduced by the reshape.
    from pandas.core.reshape.concat import concat

    dummy_arr = np.arange(len(series))
    # fill_value=-1, since we will do a series.values.take later
    result = _Unstacker(dummy_arr, series.index,
                        level=level, fill_value=-1).get_result()

    out = []
    values = extract_array(series, extract_numpy=False)

    for col, indices in result.iteritems():
        out.append(Series(values.take(indices.values,
                                      allow_fill=True,
                                      fill_value=fill_value),
                          name=col, index=result.index))
    return concat(out, axis='columns', copy=False, keys=result.columns)
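

# Hedged sketch (hypothetical helper, not part of the original module):
# unstacking an ExtensionArray-backed Series goes through
# _unstack_extension_series, so each resulting column keeps the dtype.
def _example_unstack_extension():  # pragma: no cover
    import pandas as pd

    index = pd.MultiIndex.from_product([[1, 2], ['a', 'b']])
    s = pd.Series(pd.Categorical(['w', 'x', 'y', 'z']), index=index)
    result = s.unstack()
    return result.dtypes  # both columns should report 'category'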


def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index.

    Returns
    -------
    stacked : Series
    """
    def factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        codes, categories = _factorize_from_iterable(index)
        return categories, codes

    N, K = frame.shape

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(frame, level_num=level_num,
                                    dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_codes = [lab.repeat(K) for lab in frame.index.codes]

        clev, clab = factorize(frame.columns)
        new_levels.append(clev)
        new_codes.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)

        new_index = MultiIndex(levels=new_levels, codes=new_codes,
                               names=new_names, verify_integrity=False)
    else:
        levels, (ilab, clab) = zip(*map(factorize, (frame.index,
                                                    frame.columns)))
        codes = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(levels=levels, codes=codes,
                               names=[frame.index.name, frame.columns.name],
                               verify_integrity=False)

    if frame._is_homogeneous_type:
        # For homogeneous EAs, frame.values will coerce to object. So
        # we concatenate instead.
        dtypes = list(frame.dtypes.values)
        dtype = dtypes[0]

        if is_extension_array_dtype(dtype):
            arr = dtype.construct_array_type()
            new_values = arr._concat_same_type([
                col._values for _, col in frame.iteritems()
            ])
            new_values = _reorder_for_extension_array_stack(new_values, N, K)
        else:
            # homogeneous, non-EA
            new_values = frame.values.ravel()

    else:
        # non-homogeneous
        new_values = frame.values.ravel()

    if dropna:
        mask = notna(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return frame._constructor_sliced(new_values, index=new_index)
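

# Hedged sketch (hypothetical helper, not part of the original module): for
# a frame with unique flat indexes, ``stack`` and ``unstack`` round-trip.
def _example_stack_roundtrip():  # pragma: no cover
    import pandas as pd

    df = pd.DataFrame([[1, 2], [3, 4]], index=['one', 'two'],
                      columns=['a', 'b'])
    stacked = df.stack()      # MultiIndexed Series of length 4
    return stacked.unstack()  # recovers the original 2x2 frame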


def stack_multiple(frame, level, dropna=True):
    # If all passed levels match up to column names, no
    # ambiguity about what to do
    if all(lev in frame.columns.names for lev in level):
        result = frame
        for lev in level:
            result = stack(result, lev, dropna=dropna)

    # Otherwise, level numbers may change as each successive level is stacked
    elif all(isinstance(lev, int) for lev in level):
        # As each stack is done, the level numbers decrease, so we need
        # to account for that when level is a sequence of ints
        result = frame
        # _get_level_number() checks level numbers are in range and converts
        # negative numbers to positive
        level = [frame.columns._get_level_number(lev) for lev in level]

        # Can't iterate directly through level as we might need to change
        # values as we go
        for index in range(len(level)):
            lev = level[index]
            result = stack(result, lev, dropna=dropna)
            # Decrement all level numbers greater than current, as these
            # have now shifted down by one
            updated_level = []
            for other in level:
                if other > lev:
                    updated_level.append(other - 1)
                else:
                    updated_level.append(other)
            level = updated_level

    else:
        raise ValueError("level should contain all level names or all level "
                         "numbers, not a mixture of the two.")

    return result
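

# Hedged sketch (hypothetical helper, not part of the original module):
# stack_multiple backs ``DataFrame.stack`` when ``level`` is a list; the
# renumbering loop above is why a mixture of names and ints raises.
def _example_stack_multiple():  # pragma: no cover
    import pandas as pd

    columns = pd.MultiIndex.from_product([['A', 'B'], ['x', 'y']],
                                         names=['upper', 'lower'])
    df = pd.DataFrame([[1, 2, 3, 4]], columns=columns)
    # Both levels stacked into the row index; the result is a Series.
    return df.stack(['upper', 'lower'])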


def _stack_multi_columns(frame, level_num=-1, dropna=True):
    def _convert_level_number(level_num, columns):
        """
        Logic for converting the level number to something we can safely pass
        to swaplevel:

        We generally want to convert the level number into a level name,
        except when columns do not have names, in which case we must leave
        it as a level number.
        """
        if level_num in columns.names:
            return columns.names[level_num]
        else:
            if columns.names[level_num] is None:
                return level_num
            else:
                return columns.names[level_num]

    this = frame.copy()

    # this makes life much simpler
    if level_num != frame.columns.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = this.columns
        for i in range(level_num, frame.columns.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = roll_columns

    if not this.columns.is_lexsorted():
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, this.columns)
        this = this.sort_index(level=level_to_sort, axis=1)

    # tuple list excluding level for grouping columns
    if len(frame.columns.levels) > 2:
        tuples = list(zip(*[lev.take(level_codes) for lev, level_codes
                            in zip(this.columns.levels[:-1],
                                   this.columns.codes[:-1])]))
        unique_groups = [key for key, _ in itertools.groupby(tuples)]
        new_names = this.columns.names[:-1]
        new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
    else:
        new_columns = unique_groups = this.columns.levels[0]

    # time to ravel the values
    new_data = {}
    level_vals = this.columns.levels[-1]
    level_codes = sorted(set(this.columns.codes[-1]))
    level_vals_used = level_vals[level_codes]
    levsize = len(level_codes)
    drop_cols = []
    for key in unique_groups:
        try:
            loc = this.columns.get_loc(key)
        except KeyError:
            drop_cols.append(key)
            continue

        # can make more efficient?
        # we almost always return a slice
        # but if unsorted can get a boolean
        # indexer
        if not isinstance(loc, slice):
            slice_len = len(loc)
        else:
            slice_len = loc.stop - loc.start

        if slice_len != levsize:
            chunk = this.loc[:, this.columns[loc]]
            chunk.columns = level_vals.take(chunk.columns.codes[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            if (frame._is_homogeneous_type and
                    is_extension_array_dtype(frame.dtypes.iloc[0])):
                dtype = this[this.columns[loc]].dtypes.iloc[0]
                subset = this[this.columns[loc]]

                value_slice = dtype.construct_array_type()._concat_same_type(
                    [x._values for _, x in subset.iteritems()]
                )
                N, K = this.shape
                idx = np.arange(N * K).reshape(K, N).T.ravel()
                value_slice = value_slice.take(idx)

            elif frame._is_mixed_type:
                value_slice = this[this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        if value_slice.ndim > 1:
            # i.e. not extension
            value_slice = value_slice.ravel()

        new_data[key] = value_slice

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_codes = [lab.repeat(levsize) for lab in this.index.codes]
    else:
        new_levels = [this.index]
        new_codes = [np.arange(N).repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(level_vals)
    new_codes.append(np.tile(level_codes, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(levels=new_levels, codes=new_codes,
                           names=new_names, verify_integrity=False)

    result = frame._constructor(new_data, index=new_index,
                                columns=new_columns)

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how='all')

    return result
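

# Hedged sketch (hypothetical helper, not part of the original module):
# stacking only the innermost level of MultiIndex columns goes through
# _stack_multi_columns and yields a DataFrame rather than a Series.
def _example_stack_multi_columns():  # pragma: no cover
    import pandas as pd

    columns = pd.MultiIndex.from_product([['A', 'B'], ['x', 'y']])
    df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=columns)
    # Outer level 'A'/'B' stays as columns; 'x'/'y' moves into the index.
    return df.stack()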


def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
                columns=None, sparse=False, drop_first=False, dtype=None):
    """
    Convert categorical variable into dummy/indicator variables

    Parameters
    ----------
    data : array-like, Series, or DataFrame
    prefix : string, list of strings, or dict of strings, default None
        String to prepend to DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs; if False, NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object` or `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing
        the first level.

        .. versionadded:: 0.18.0

    dtype : dtype, default np.uint8
        Data type for new columns. Only a single dtype is allowed.

        .. versionadded:: 0.23.0

    Returns
    -------
    dummies : DataFrame

    See Also
    --------
    Series.str.get_dummies

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> pd.get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = ['a', 'b', np.nan]

    >>> pd.get_dummies(s1)
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> pd.get_dummies(s1, dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]})

    >>> pd.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1       1       0       0       1       0
    1  2       0       1       1       0       0
    2  3       1       0       0       0       1

    >>> pd.get_dummies(pd.Series(list('abcaa')))
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0
    4  1  0  0

    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
       b  c
    0  0  0
    1  1  0
    2  0  1
    3  0  0
    4  0  0

    >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
         a    b    c
    0  1.0  0.0  0.0
    1  0.0  1.0  0.0
    2  0.0  0.0  1.0
    """
    from pandas.core.reshape.concat import concat
    from itertools import cycle

    dtypes_to_encode = ['object', 'category']

    if isinstance(data, DataFrame):
        # determine columns being encoded
        if columns is None:
            data_to_encode = data.select_dtypes(
                include=dtypes_to_encode)
        else:
            data_to_encode = data[columns]

        # validate prefixes and separator to avoid silently dropping cols
        def check_len(item, name):
            len_msg = ("Length of '{name}' ({len_item}) did not match the "
                       "length of the columns being encoded ({len_enc}).")

            if is_list_like(item):
                if not len(item) == data_to_encode.shape[1]:
                    len_msg = len_msg.format(name=name, len_item=len(item),
                                             len_enc=data_to_encode.shape[1])
                    raise ValueError(len_msg)

        check_len(prefix, 'prefix')
        check_len(prefix_sep, 'prefix_sep')

        if isinstance(prefix, compat.string_types):
            prefix = cycle([prefix])
        if isinstance(prefix, dict):
            prefix = [prefix[col] for col in data_to_encode.columns]

        if prefix is None:
            prefix = data_to_encode.columns

        # validate separators
        if isinstance(prefix_sep, compat.string_types):
            prefix_sep = cycle([prefix_sep])
        elif isinstance(prefix_sep, dict):
            prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]

        if data_to_encode.shape == data.shape:
            # Encoding the entire df, do not prepend any dropped columns
            with_dummies = []
        elif columns is not None:
            # Encoding only cols specified in columns. Get all cols not in
            # columns to prepend to result.
            with_dummies = [data.drop(columns, axis=1)]
        else:
            # Encoding only object and category dtype columns. Get remaining
            # columns to prepend to result.
            with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]

        for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix,
                                   prefix_sep):
            # col is (column_name, column), use just column data here
            dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep,
                                    dummy_na=dummy_na, sparse=sparse,
                                    drop_first=drop_first, dtype=dtype)
            with_dummies.append(dummy)
        result = concat(with_dummies, axis=1)
    else:
        result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
                                 sparse=sparse,
                                 drop_first=drop_first,
                                 dtype=dtype)
    return result
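

# Hedged sketch (hypothetical helper, not part of the original module):
# with sparse=True each dummy column is backed by a SparseArray, which can
# cut memory use for high-cardinality categoricals.
def _example_get_dummies_sparse():  # pragma: no cover
    import pandas as pd

    s = pd.Series(list('abca'))
    out = pd.get_dummies(s, sparse=True)
    return out.dtypes  # sparse dtypes rather than plain uint8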


def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False, drop_first=False, dtype=None):
    from pandas.core.reshape.concat import concat
    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:

        # PY2 embedded unicode, gh-22084
        def _make_col_name(prefix, prefix_sep, level):
            fstr = '{prefix}{prefix_sep}{level}'
            if PY2 and (isinstance(prefix, text_type) or
                        isinstance(prefix_sep, text_type) or
                        isinstance(level, text_type)):
                fstr = u(fstr)
            return fstr.format(prefix=prefix,
                               prefix_sep=prefix_sep,
                               level=level)

        dummy_cols = [_make_col_name(prefix, prefix_sep, level)
                      for level in levels]

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == bool:
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs), dtype=dtype),
                               sparse_index=IntIndex(N, ixs),
                               fill_value=fill_value,
                               dtype=dtype)
            sparse_series.append(Series(data=sarr, index=index, name=col))

        out = concat(sparse_series, axis=1, copy=False)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]

        return DataFrame(dummy_mat, index=index, columns=dummy_cols)


def make_axis_dummies(frame, axis='minor', transform=None):
    """
    Construct 1-0 dummy variables corresponding to designated axis
    labels

    Parameters
    ----------
    frame : DataFrame
    axis : {'major', 'minor'}, default 'minor'
    transform : function, default None
        Function to apply to axis labels first. For example, to
        get "day of week" dummies in a time series regression
        you might call::

            make_axis_dummies(panel, axis='major',
                              transform=lambda d: d.weekday())

    Returns
    -------
    dummies : DataFrame
        Column names taken from chosen axis
    """
    numbers = {'major': 0, 'minor': 1}
    num = numbers.get(axis, axis)

    items = frame.index.levels[num]
    codes = frame.index.codes[num]
    if transform is not None:
        mapped_items = items.map(transform)
        codes, items = _factorize_from_iterable(mapped_items.take(codes))

    values = np.eye(len(items), dtype=float)
    values = values.take(codes, axis=0)

    return DataFrame(values, columns=items, index=frame.index)
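

# Hedged sketch (hypothetical helper, not part of the original module):
# make_axis_dummies one-hot encodes one level of a MultiIndexed frame's
# index; 'minor' selects level 1.
def _example_make_axis_dummies():  # pragma: no cover
    import pandas as pd

    index = pd.MultiIndex.from_product([[1, 2], ['a', 'b']])
    df = pd.DataFrame({'v': [1., 2., 3., 4.]}, index=index)
    # Returns a frame with columns 'a' and 'b' of 0/1 floats.
    return make_axis_dummies(df, axis='minor')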


def _reorder_for_extension_array_stack(arr, n_rows, n_columns):
    """
    Re-orders the values when stacking multiple extension-arrays.

    The indirect stacking method used for EAs requires a followup
    take to get the order correct.

    Parameters
    ----------
    arr : ExtensionArray
    n_rows, n_columns : int
        The number of rows and columns in the original DataFrame.

    Returns
    -------
    taken : ExtensionArray
        The original `arr` with elements re-ordered appropriately

    Examples
    --------
    >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
    >>> _reorder_for_extension_array_stack(arr, 2, 3)
    array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')

    >>> _reorder_for_extension_array_stack(arr, 3, 2)
    array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
    """
    # final take to get the order correct.
    # idx is an indexer like
    # [c0r0, c1r0, c2r0, ...,
    #  c0r1, c1r1, c2r1, ...]
    idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
    return arr.take(idx)