test_multilevel.py 80 KB


  1. # -*- coding: utf-8 -*-
  2. # pylint: disable-msg=W0612,E1101,W0141
  3. import datetime
  4. import itertools
  5. from warnings import catch_warnings, simplefilter
  6. import numpy as np
  7. from numpy.random import randn
  8. import pytest
  9. import pytz
  10. from pandas.compat import (
  11. StringIO, lrange, lzip, product as cart_product, range, u, zip)
  12. from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
  13. import pandas as pd
  14. from pandas import DataFrame, Panel, Series, Timestamp, isna
  15. from pandas.core.index import Index, MultiIndex
  16. import pandas.util.testing as tm
# Names of the reduction methods that the level-wise aggregation tests are
# parametrized over; each name is looked up with ``getattr`` on the object
# under test.
AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad',
                 'std', 'var', 'sem']
  19. class Base(object):
  20. def setup_method(self, method):
  21. index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
  22. 'three']],
  23. codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
  24. [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
  25. names=['first', 'second'])
  26. self.frame = DataFrame(np.random.randn(10, 3), index=index,
  27. columns=Index(['A', 'B', 'C'], name='exp'))
  28. self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
  29. codes=[[0, 1, 2, 3]], names=['first'])
  30. # create test series object
  31. arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
  32. ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
  33. tuples = lzip(*arrays)
  34. index = MultiIndex.from_tuples(tuples)
  35. s = Series(randn(8), index=index)
  36. s[3] = np.NaN
  37. self.series = s
  38. self.tdf = tm.makeTimeDataFrame(100)
  39. self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month,
  40. lambda x: x.day]).sum()
  41. # use Int64Index, to make sure things work
  42. self.ymd.index.set_levels([lev.astype('i8')
  43. for lev in self.ymd.index.levels],
  44. inplace=True)
  45. self.ymd.index.set_names(['year', 'month', 'day'], inplace=True)
  46. class TestMultiLevel(Base):
  47. def test_append(self):
  48. a, b = self.frame[:5], self.frame[5:]
  49. result = a.append(b)
  50. tm.assert_frame_equal(result, self.frame)
  51. result = a['A'].append(b['A'])
  52. tm.assert_series_equal(result, self.frame['A'])
  53. def test_append_index(self):
  54. idx1 = Index([1.1, 1.2, 1.3])
  55. idx2 = pd.date_range('2011-01-01', freq='D', periods=3,
  56. tz='Asia/Tokyo')
  57. idx3 = Index(['A', 'B', 'C'])
  58. midx_lv2 = MultiIndex.from_arrays([idx1, idx2])
  59. midx_lv3 = MultiIndex.from_arrays([idx1, idx2, idx3])
  60. result = idx1.append(midx_lv2)
  61. # see gh-7112
  62. tz = pytz.timezone('Asia/Tokyo')
  63. expected_tuples = [(1.1, tz.localize(datetime.datetime(2011, 1, 1))),
  64. (1.2, tz.localize(datetime.datetime(2011, 1, 2))),
  65. (1.3, tz.localize(datetime.datetime(2011, 1, 3)))]
  66. expected = Index([1.1, 1.2, 1.3] + expected_tuples)
  67. tm.assert_index_equal(result, expected)
  68. result = midx_lv2.append(idx1)
  69. expected = Index(expected_tuples + [1.1, 1.2, 1.3])
  70. tm.assert_index_equal(result, expected)
  71. result = midx_lv2.append(midx_lv2)
  72. expected = MultiIndex.from_arrays([idx1.append(idx1),
  73. idx2.append(idx2)])
  74. tm.assert_index_equal(result, expected)
  75. result = midx_lv2.append(midx_lv3)
  76. tm.assert_index_equal(result, expected)
  77. result = midx_lv3.append(midx_lv2)
  78. expected = Index._simple_new(
  79. np.array([(1.1, tz.localize(datetime.datetime(2011, 1, 1)), 'A'),
  80. (1.2, tz.localize(datetime.datetime(2011, 1, 2)), 'B'),
  81. (1.3, tz.localize(datetime.datetime(2011, 1, 3)), 'C')] +
  82. expected_tuples), None)
  83. tm.assert_index_equal(result, expected)
  84. def test_dataframe_constructor(self):
  85. multi = DataFrame(np.random.randn(4, 4),
  86. index=[np.array(['a', 'a', 'b', 'b']),
  87. np.array(['x', 'y', 'x', 'y'])])
  88. assert isinstance(multi.index, MultiIndex)
  89. assert not isinstance(multi.columns, MultiIndex)
  90. multi = DataFrame(np.random.randn(4, 4),
  91. columns=[['a', 'a', 'b', 'b'],
  92. ['x', 'y', 'x', 'y']])
  93. assert isinstance(multi.columns, MultiIndex)
  94. def test_series_constructor(self):
  95. multi = Series(1., index=[np.array(['a', 'a', 'b', 'b']), np.array(
  96. ['x', 'y', 'x', 'y'])])
  97. assert isinstance(multi.index, MultiIndex)
  98. multi = Series(1., index=[['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y']])
  99. assert isinstance(multi.index, MultiIndex)
  100. multi = Series(lrange(4), index=[['a', 'a', 'b', 'b'],
  101. ['x', 'y', 'x', 'y']])
  102. assert isinstance(multi.index, MultiIndex)
  103. def test_reindex_level(self):
  104. # axis=0
  105. month_sums = self.ymd.sum(level='month')
  106. result = month_sums.reindex(self.ymd.index, level=1)
  107. expected = self.ymd.groupby(level='month').transform(np.sum)
  108. tm.assert_frame_equal(result, expected)
  109. # Series
  110. result = month_sums['A'].reindex(self.ymd.index, level=1)
  111. expected = self.ymd['A'].groupby(level='month').transform(np.sum)
  112. tm.assert_series_equal(result, expected, check_names=False)
  113. # axis=1
  114. month_sums = self.ymd.T.sum(axis=1, level='month')
  115. result = month_sums.reindex(columns=self.ymd.index, level=1)
  116. expected = self.ymd.groupby(level='month').transform(np.sum).T
  117. tm.assert_frame_equal(result, expected)
  118. def test_binops_level(self):
  119. def _check_op(opname):
  120. op = getattr(DataFrame, opname)
  121. month_sums = self.ymd.sum(level='month')
  122. result = op(self.ymd, month_sums, level='month')
  123. broadcasted = self.ymd.groupby(level='month').transform(np.sum)
  124. expected = op(self.ymd, broadcasted)
  125. tm.assert_frame_equal(result, expected)
  126. # Series
  127. op = getattr(Series, opname)
  128. result = op(self.ymd['A'], month_sums['A'], level='month')
  129. broadcasted = self.ymd['A'].groupby(level='month').transform(
  130. np.sum)
  131. expected = op(self.ymd['A'], broadcasted)
  132. expected.name = 'A'
  133. tm.assert_series_equal(result, expected)
  134. _check_op('sub')
  135. _check_op('add')
  136. _check_op('mul')
  137. _check_op('div')
  138. def test_pickle(self):
  139. def _test_roundtrip(frame):
  140. unpickled = tm.round_trip_pickle(frame)
  141. tm.assert_frame_equal(frame, unpickled)
  142. _test_roundtrip(self.frame)
  143. _test_roundtrip(self.frame.T)
  144. _test_roundtrip(self.ymd)
  145. _test_roundtrip(self.ymd.T)
  146. def test_reindex(self):
  147. expected = self.frame.iloc[[0, 3]]
  148. reindexed = self.frame.loc[[('foo', 'one'), ('bar', 'one')]]
  149. tm.assert_frame_equal(reindexed, expected)
  150. with catch_warnings(record=True):
  151. simplefilter("ignore", DeprecationWarning)
  152. reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]]
  153. tm.assert_frame_equal(reindexed, expected)
  154. def test_reindex_preserve_levels(self):
  155. new_index = self.ymd.index[::10]
  156. chunk = self.ymd.reindex(new_index)
  157. assert chunk.index is new_index
  158. chunk = self.ymd.loc[new_index]
  159. assert chunk.index is new_index
  160. with catch_warnings(record=True):
  161. simplefilter("ignore", DeprecationWarning)
  162. chunk = self.ymd.ix[new_index]
  163. assert chunk.index is new_index
  164. ymdT = self.ymd.T
  165. chunk = ymdT.reindex(columns=new_index)
  166. assert chunk.columns is new_index
  167. chunk = ymdT.loc[:, new_index]
  168. assert chunk.columns is new_index
  169. def test_repr_to_string(self):
  170. repr(self.frame)
  171. repr(self.ymd)
  172. repr(self.frame.T)
  173. repr(self.ymd.T)
  174. buf = StringIO()
  175. self.frame.to_string(buf=buf)
  176. self.ymd.to_string(buf=buf)
  177. self.frame.T.to_string(buf=buf)
  178. self.ymd.T.to_string(buf=buf)
  179. def test_repr_name_coincide(self):
  180. index = MultiIndex.from_tuples([('a', 0, 'foo'), ('b', 1, 'bar')],
  181. names=['a', 'b', 'c'])
  182. df = DataFrame({'value': [0, 1]}, index=index)
  183. lines = repr(df).split('\n')
  184. assert lines[2].startswith('a 0 foo')
  185. def test_delevel_infer_dtype(self):
  186. tuples = [tuple
  187. for tuple in cart_product(
  188. ['foo', 'bar'], [10, 20], [1.0, 1.1])]
  189. index = MultiIndex.from_tuples(tuples, names=['prm0', 'prm1', 'prm2'])
  190. df = DataFrame(np.random.randn(8, 3), columns=['A', 'B', 'C'],
  191. index=index)
  192. deleveled = df.reset_index()
  193. assert is_integer_dtype(deleveled['prm1'])
  194. assert is_float_dtype(deleveled['prm2'])
  195. def test_reset_index_with_drop(self):
  196. deleveled = self.ymd.reset_index(drop=True)
  197. assert len(deleveled.columns) == len(self.ymd.columns)
  198. assert deleveled.index.name == self.ymd.index.name
  199. deleveled = self.series.reset_index()
  200. assert isinstance(deleveled, DataFrame)
  201. assert len(deleveled.columns) == len(self.series.index.levels) + 1
  202. assert deleveled.index.name == self.series.index.name
  203. deleveled = self.series.reset_index(drop=True)
  204. assert isinstance(deleveled, Series)
  205. assert deleveled.index.name == self.series.index.name
  206. def test_count_level(self):
  207. def _check_counts(frame, axis=0):
  208. index = frame._get_axis(axis)
  209. for i in range(index.nlevels):
  210. result = frame.count(axis=axis, level=i)
  211. expected = frame.groupby(axis=axis, level=i).count()
  212. expected = expected.reindex_like(result).astype('i8')
  213. tm.assert_frame_equal(result, expected)
  214. self.frame.iloc[1, [1, 2]] = np.nan
  215. self.frame.iloc[7, [0, 1]] = np.nan
  216. self.ymd.iloc[1, [1, 2]] = np.nan
  217. self.ymd.iloc[7, [0, 1]] = np.nan
  218. _check_counts(self.frame)
  219. _check_counts(self.ymd)
  220. _check_counts(self.frame.T, axis=1)
  221. _check_counts(self.ymd.T, axis=1)
  222. # can't call with level on regular DataFrame
  223. df = tm.makeTimeDataFrame()
  224. with pytest.raises(TypeError, match='hierarchical'):
  225. df.count(level=0)
  226. self.frame['D'] = 'foo'
  227. result = self.frame.count(level=0, numeric_only=True)
  228. tm.assert_index_equal(result.columns, Index(list('ABC'), name='exp'))
  229. def test_count_level_series(self):
  230. index = MultiIndex(levels=[['foo', 'bar', 'baz'], ['one', 'two',
  231. 'three', 'four']],
  232. codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]])
  233. s = Series(np.random.randn(len(index)), index=index)
  234. result = s.count(level=0)
  235. expected = s.groupby(level=0).count()
  236. tm.assert_series_equal(
  237. result.astype('f8'), expected.reindex(result.index).fillna(0))
  238. result = s.count(level=1)
  239. expected = s.groupby(level=1).count()
  240. tm.assert_series_equal(
  241. result.astype('f8'), expected.reindex(result.index).fillna(0))
  242. def test_count_level_corner(self):
  243. s = self.frame['A'][:0]
  244. result = s.count(level=0)
  245. expected = Series(0, index=s.index.levels[0], name='A')
  246. tm.assert_series_equal(result, expected)
  247. df = self.frame[:0]
  248. result = df.count(level=0)
  249. expected = DataFrame({}, index=s.index.levels[0],
  250. columns=df.columns).fillna(0).astype(np.int64)
  251. tm.assert_frame_equal(result, expected)
  252. def test_get_level_number_out_of_bounds(self):
  253. with pytest.raises(IndexError, match="Too many levels"):
  254. self.frame.index._get_level_number(2)
  255. with pytest.raises(IndexError, match="not a valid level number"):
  256. self.frame.index._get_level_number(-3)
  257. def test_unstack(self):
  258. # just check that it works for now
  259. unstacked = self.ymd.unstack()
  260. unstacked.unstack()
  261. # test that ints work
  262. self.ymd.astype(int).unstack()
  263. # test that int32 work
  264. self.ymd.astype(np.int32).unstack()
  265. def test_unstack_multiple_no_empty_columns(self):
  266. index = MultiIndex.from_tuples([(0, 'foo', 0), (0, 'bar', 0), (
  267. 1, 'baz', 1), (1, 'qux', 1)])
  268. s = Series(np.random.randn(4), index=index)
  269. unstacked = s.unstack([1, 2])
  270. expected = unstacked.dropna(axis=1, how='all')
  271. tm.assert_frame_equal(unstacked, expected)
  272. def test_stack(self):
  273. # regular roundtrip
  274. unstacked = self.ymd.unstack()
  275. restacked = unstacked.stack()
  276. tm.assert_frame_equal(restacked, self.ymd)
  277. unlexsorted = self.ymd.sort_index(level=2)
  278. unstacked = unlexsorted.unstack(2)
  279. restacked = unstacked.stack()
  280. tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd)
  281. unlexsorted = unlexsorted[::-1]
  282. unstacked = unlexsorted.unstack(1)
  283. restacked = unstacked.stack().swaplevel(1, 2)
  284. tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd)
  285. unlexsorted = unlexsorted.swaplevel(0, 1)
  286. unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
  287. restacked = unstacked.stack(0).swaplevel(1, 2)
  288. tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd)
  289. # columns unsorted
  290. unstacked = self.ymd.unstack()
  291. unstacked = unstacked.sort_index(axis=1, ascending=False)
  292. restacked = unstacked.stack()
  293. tm.assert_frame_equal(restacked, self.ymd)
  294. # more than 2 levels in the columns
  295. unstacked = self.ymd.unstack(1).unstack(1)
  296. result = unstacked.stack(1)
  297. expected = self.ymd.unstack()
  298. tm.assert_frame_equal(result, expected)
  299. result = unstacked.stack(2)
  300. expected = self.ymd.unstack(1)
  301. tm.assert_frame_equal(result, expected)
  302. result = unstacked.stack(0)
  303. expected = self.ymd.stack().unstack(1).unstack(1)
  304. tm.assert_frame_equal(result, expected)
  305. # not all levels present in each echelon
  306. unstacked = self.ymd.unstack(2).loc[:, ::3]
  307. stacked = unstacked.stack().stack()
  308. ymd_stacked = self.ymd.stack()
  309. tm.assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))
  310. # stack with negative number
  311. result = self.ymd.unstack(0).stack(-2)
  312. expected = self.ymd.unstack(0).stack(0)
  313. # GH10417
  314. def check(left, right):
  315. tm.assert_series_equal(left, right)
  316. assert left.index.is_unique is False
  317. li, ri = left.index, right.index
  318. tm.assert_index_equal(li, ri)
  319. df = DataFrame(np.arange(12).reshape(4, 3),
  320. index=list('abab'),
  321. columns=['1st', '2nd', '3rd'])
  322. mi = MultiIndex(levels=[['a', 'b'], ['1st', '2nd', '3rd']],
  323. codes=[np.tile(
  324. np.arange(2).repeat(3), 2), np.tile(
  325. np.arange(3), 4)])
  326. left, right = df.stack(), Series(np.arange(12), index=mi)
  327. check(left, right)
  328. df.columns = ['1st', '2nd', '1st']
  329. mi = MultiIndex(levels=[['a', 'b'], ['1st', '2nd']], codes=[np.tile(
  330. np.arange(2).repeat(3), 2), np.tile(
  331. [0, 1, 0], 4)])
  332. left, right = df.stack(), Series(np.arange(12), index=mi)
  333. check(left, right)
  334. tpls = ('a', 2), ('b', 1), ('a', 1), ('b', 2)
  335. df.index = MultiIndex.from_tuples(tpls)
  336. mi = MultiIndex(levels=[['a', 'b'], [1, 2], ['1st', '2nd']],
  337. codes=[np.tile(
  338. np.arange(2).repeat(3), 2), np.repeat(
  339. [1, 0, 1], [3, 6, 3]), np.tile(
  340. [0, 1, 0], 4)])
  341. left, right = df.stack(), Series(np.arange(12), index=mi)
  342. check(left, right)
  343. def test_unstack_odd_failure(self):
  344. data = """day,time,smoker,sum,len
  345. Fri,Dinner,No,8.25,3.
  346. Fri,Dinner,Yes,27.03,9
  347. Fri,Lunch,No,3.0,1
  348. Fri,Lunch,Yes,13.68,6
  349. Sat,Dinner,No,139.63,45
  350. Sat,Dinner,Yes,120.77,42
  351. Sun,Dinner,No,180.57,57
  352. Sun,Dinner,Yes,66.82,19
  353. Thur,Dinner,No,3.0,1
  354. Thur,Lunch,No,117.32,44
  355. Thur,Lunch,Yes,51.51,17"""
  356. df = pd.read_csv(StringIO(data)).set_index(['day', 'time', 'smoker'])
  357. # it works, #2100
  358. result = df.unstack(2)
  359. recons = result.stack()
  360. tm.assert_frame_equal(recons, df)
  361. def test_stack_mixed_dtype(self):
  362. df = self.frame.T
  363. df['foo', 'four'] = 'foo'
  364. df = df.sort_index(level=1, axis=1)
  365. stacked = df.stack()
  366. result = df['foo'].stack().sort_index()
  367. tm.assert_series_equal(stacked['foo'], result, check_names=False)
  368. assert result.name is None
  369. assert stacked['bar'].dtype == np.float_
  370. def test_unstack_bug(self):
  371. df = DataFrame({'state': ['naive', 'naive', 'naive', 'activ', 'activ',
  372. 'activ'],
  373. 'exp': ['a', 'b', 'b', 'b', 'a', 'a'],
  374. 'barcode': [1, 2, 3, 4, 1, 3],
  375. 'v': ['hi', 'hi', 'bye', 'bye', 'bye', 'peace'],
  376. 'extra': np.arange(6.)})
  377. result = df.groupby(['state', 'exp', 'barcode', 'v']).apply(len)
  378. unstacked = result.unstack()
  379. restacked = unstacked.stack()
  380. tm.assert_series_equal(
  381. restacked, result.reindex(restacked.index).astype(float))
  382. def test_stack_unstack_preserve_names(self):
  383. unstacked = self.frame.unstack()
  384. assert unstacked.index.name == 'first'
  385. assert unstacked.columns.names == ['exp', 'second']
  386. restacked = unstacked.stack()
  387. assert restacked.index.names == self.frame.index.names
  388. def test_unstack_level_name(self):
  389. result = self.frame.unstack('second')
  390. expected = self.frame.unstack(level=1)
  391. tm.assert_frame_equal(result, expected)
  392. def test_stack_level_name(self):
  393. unstacked = self.frame.unstack('second')
  394. result = unstacked.stack('exp')
  395. expected = self.frame.unstack().stack(0)
  396. tm.assert_frame_equal(result, expected)
  397. result = self.frame.stack('exp')
  398. expected = self.frame.stack()
  399. tm.assert_series_equal(result, expected)
  400. def test_stack_unstack_multiple(self):
  401. unstacked = self.ymd.unstack(['year', 'month'])
  402. expected = self.ymd.unstack('year').unstack('month')
  403. tm.assert_frame_equal(unstacked, expected)
  404. assert unstacked.columns.names == expected.columns.names
  405. # series
  406. s = self.ymd['A']
  407. s_unstacked = s.unstack(['year', 'month'])
  408. tm.assert_frame_equal(s_unstacked, expected['A'])
  409. restacked = unstacked.stack(['year', 'month'])
  410. restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
  411. restacked = restacked.sort_index(level=0)
  412. tm.assert_frame_equal(restacked, self.ymd)
  413. assert restacked.index.names == self.ymd.index.names
  414. # GH #451
  415. unstacked = self.ymd.unstack([1, 2])
  416. expected = self.ymd.unstack(1).unstack(1).dropna(axis=1, how='all')
  417. tm.assert_frame_equal(unstacked, expected)
  418. unstacked = self.ymd.unstack([2, 1])
  419. expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how='all')
  420. tm.assert_frame_equal(unstacked, expected.loc[:, unstacked.columns])
  421. def test_stack_names_and_numbers(self):
  422. unstacked = self.ymd.unstack(['year', 'month'])
  423. # Can't use mixture of names and numbers to stack
  424. with pytest.raises(ValueError, match="level should contain"):
  425. unstacked.stack([0, 'month'])
  426. def test_stack_multiple_out_of_bounds(self):
  427. # nlevels == 3
  428. unstacked = self.ymd.unstack(['year', 'month'])
  429. with pytest.raises(IndexError, match="Too many levels"):
  430. unstacked.stack([2, 3])
  431. with pytest.raises(IndexError, match="not a valid level number"):
  432. unstacked.stack([-4, -3])
  433. def test_unstack_period_series(self):
  434. # GH 4342
  435. idx1 = pd.PeriodIndex(['2013-01', '2013-01', '2013-02', '2013-02',
  436. '2013-03', '2013-03'], freq='M', name='period')
  437. idx2 = Index(['A', 'B'] * 3, name='str')
  438. value = [1, 2, 3, 4, 5, 6]
  439. idx = MultiIndex.from_arrays([idx1, idx2])
  440. s = Series(value, index=idx)
  441. result1 = s.unstack()
  442. result2 = s.unstack(level=1)
  443. result3 = s.unstack(level=0)
  444. e_idx = pd.PeriodIndex(
  445. ['2013-01', '2013-02', '2013-03'], freq='M', name='period')
  446. expected = DataFrame({'A': [1, 3, 5], 'B': [2, 4, 6]}, index=e_idx,
  447. columns=['A', 'B'])
  448. expected.columns.name = 'str'
  449. tm.assert_frame_equal(result1, expected)
  450. tm.assert_frame_equal(result2, expected)
  451. tm.assert_frame_equal(result3, expected.T)
  452. idx1 = pd.PeriodIndex(['2013-01', '2013-01', '2013-02', '2013-02',
  453. '2013-03', '2013-03'], freq='M', name='period1')
  454. idx2 = pd.PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09',
  455. '2013-08', '2013-07'], freq='M', name='period2')
  456. idx = MultiIndex.from_arrays([idx1, idx2])
  457. s = Series(value, index=idx)
  458. result1 = s.unstack()
  459. result2 = s.unstack(level=1)
  460. result3 = s.unstack(level=0)
  461. e_idx = pd.PeriodIndex(
  462. ['2013-01', '2013-02', '2013-03'], freq='M', name='period1')
  463. e_cols = pd.PeriodIndex(['2013-07', '2013-08', '2013-09', '2013-10',
  464. '2013-11', '2013-12'],
  465. freq='M', name='period2')
  466. expected = DataFrame([[np.nan, np.nan, np.nan, np.nan, 2, 1],
  467. [np.nan, np.nan, 4, 3, np.nan, np.nan],
  468. [6, 5, np.nan, np.nan, np.nan, np.nan]],
  469. index=e_idx, columns=e_cols)
  470. tm.assert_frame_equal(result1, expected)
  471. tm.assert_frame_equal(result2, expected)
  472. tm.assert_frame_equal(result3, expected.T)
  473. def test_unstack_period_frame(self):
  474. # GH 4342
  475. idx1 = pd.PeriodIndex(['2014-01', '2014-02', '2014-02', '2014-02',
  476. '2014-01', '2014-01'],
  477. freq='M', name='period1')
  478. idx2 = pd.PeriodIndex(['2013-12', '2013-12', '2014-02', '2013-10',
  479. '2013-10', '2014-02'],
  480. freq='M', name='period2')
  481. value = {'A': [1, 2, 3, 4, 5, 6], 'B': [6, 5, 4, 3, 2, 1]}
  482. idx = MultiIndex.from_arrays([idx1, idx2])
  483. df = DataFrame(value, index=idx)
  484. result1 = df.unstack()
  485. result2 = df.unstack(level=1)
  486. result3 = df.unstack(level=0)
  487. e_1 = pd.PeriodIndex(['2014-01', '2014-02'], freq='M', name='period1')
  488. e_2 = pd.PeriodIndex(['2013-10', '2013-12', '2014-02', '2013-10',
  489. '2013-12', '2014-02'], freq='M', name='period2')
  490. e_cols = MultiIndex.from_arrays(['A A A B B B'.split(), e_2])
  491. expected = DataFrame([[5, 1, 6, 2, 6, 1], [4, 2, 3, 3, 5, 4]],
  492. index=e_1, columns=e_cols)
  493. tm.assert_frame_equal(result1, expected)
  494. tm.assert_frame_equal(result2, expected)
  495. e_1 = pd.PeriodIndex(['2014-01', '2014-02', '2014-01',
  496. '2014-02'], freq='M', name='period1')
  497. e_2 = pd.PeriodIndex(
  498. ['2013-10', '2013-12', '2014-02'], freq='M', name='period2')
  499. e_cols = MultiIndex.from_arrays(['A A B B'.split(), e_1])
  500. expected = DataFrame([[5, 4, 2, 3], [1, 2, 6, 5], [6, 3, 1, 4]],
  501. index=e_2, columns=e_cols)
  502. tm.assert_frame_equal(result3, expected)
  503. def test_stack_multiple_bug(self):
  504. """ bug when some uniques are not present in the data #3170"""
  505. id_col = ([1] * 3) + ([2] * 3)
  506. name = (['a'] * 3) + (['b'] * 3)
  507. date = pd.to_datetime(['2013-01-03', '2013-01-04', '2013-01-05'] * 2)
  508. var1 = np.random.randint(0, 100, 6)
  509. df = DataFrame(dict(ID=id_col, NAME=name, DATE=date, VAR1=var1))
  510. multi = df.set_index(['DATE', 'ID'])
  511. multi.columns.name = 'Params'
  512. unst = multi.unstack('ID')
  513. down = unst.resample('W-THU').mean()
  514. rs = down.stack('ID')
  515. xp = unst.loc[:, ['VAR1']].resample('W-THU').mean().stack('ID')
  516. xp.columns.name = 'Params'
  517. tm.assert_frame_equal(rs, xp)
  518. def test_stack_dropna(self):
  519. # GH #3997
  520. df = DataFrame({'A': ['a1', 'a2'], 'B': ['b1', 'b2'], 'C': [1, 1]})
  521. df = df.set_index(['A', 'B'])
  522. stacked = df.unstack().stack(dropna=False)
  523. assert len(stacked) > len(stacked.dropna())
  524. stacked = df.unstack().stack(dropna=True)
  525. tm.assert_frame_equal(stacked, stacked.dropna())
  526. def test_unstack_multiple_hierarchical(self):
  527. df = DataFrame(index=[[0, 0, 0, 0, 1, 1, 1, 1],
  528. [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, 0, 1
  529. ]],
  530. columns=[[0, 0, 1, 1], [0, 1, 0, 1]])
  531. df.index.names = ['a', 'b', 'c']
  532. df.columns.names = ['d', 'e']
  533. # it works!
  534. df.unstack(['b', 'c'])
  535. def test_groupby_transform(self):
  536. s = self.frame['A']
  537. grouper = s.index.get_level_values(0)
  538. grouped = s.groupby(grouper)
  539. applied = grouped.apply(lambda x: x * 2)
  540. expected = grouped.transform(lambda x: x * 2)
  541. result = applied.reindex(expected.index)
  542. tm.assert_series_equal(result, expected, check_names=False)
  543. def test_unstack_sparse_keyspace(self):
  544. # memory problems with naive impl #2278
  545. # Generate Long File & Test Pivot
  546. NUM_ROWS = 1000
  547. df = DataFrame({'A': np.random.randint(100, size=NUM_ROWS),
  548. 'B': np.random.randint(300, size=NUM_ROWS),
  549. 'C': np.random.randint(-7, 7, size=NUM_ROWS),
  550. 'D': np.random.randint(-19, 19, size=NUM_ROWS),
  551. 'E': np.random.randint(3000, size=NUM_ROWS),
  552. 'F': np.random.randn(NUM_ROWS)})
  553. idf = df.set_index(['A', 'B', 'C', 'D', 'E'])
  554. # it works! is sufficient
  555. idf.unstack('E')
  556. def test_unstack_unobserved_keys(self):
  557. # related to #2278 refactoring
  558. levels = [[0, 1], [0, 1, 2, 3]]
  559. codes = [[0, 0, 1, 1], [0, 2, 0, 2]]
  560. index = MultiIndex(levels, codes)
  561. df = DataFrame(np.random.randn(4, 2), index=index)
  562. result = df.unstack()
  563. assert len(result.columns) == 4
  564. recons = result.stack()
  565. tm.assert_frame_equal(recons, df)
  566. @pytest.mark.slow
  567. def test_unstack_number_of_levels_larger_than_int32(self):
  568. # GH 20601
  569. df = DataFrame(np.random.randn(2 ** 16, 2),
  570. index=[np.arange(2 ** 16), np.arange(2 ** 16)])
  571. with pytest.raises(ValueError, match='int32 overflow'):
  572. df.unstack()
  573. def test_stack_order_with_unsorted_levels(self):
  574. # GH 16323
  575. def manual_compare_stacked(df, df_stacked, lev0, lev1):
  576. assert all(df.loc[row, col] ==
  577. df_stacked.loc[(row, col[lev0]), col[lev1]]
  578. for row in df.index for col in df.columns)
  579. # deep check for 1-row case
  580. for width in [2, 3]:
  581. levels_poss = itertools.product(
  582. itertools.permutations([0, 1, 2], width),
  583. repeat=2)
  584. for levels in levels_poss:
  585. columns = MultiIndex(levels=levels,
  586. codes=[[0, 0, 1, 1],
  587. [0, 1, 0, 1]])
  588. df = DataFrame(columns=columns, data=[range(4)])
  589. for stack_lev in range(2):
  590. df_stacked = df.stack(stack_lev)
  591. manual_compare_stacked(df, df_stacked,
  592. stack_lev, 1 - stack_lev)
  593. # check multi-row case
  594. mi = MultiIndex(levels=[["A", "C", "B"], ["B", "A", "C"]],
  595. codes=[np.repeat(range(3), 3), np.tile(range(3), 3)])
  596. df = DataFrame(columns=mi, index=range(5),
  597. data=np.arange(5 * len(mi)).reshape(5, -1))
  598. manual_compare_stacked(df, df.stack(0), 0, 1)
  599. def test_groupby_corner(self):
  600. midx = MultiIndex(levels=[['foo'], ['bar'], ['baz']],
  601. codes=[[0], [0], [0]],
  602. names=['one', 'two', 'three'])
  603. df = DataFrame([np.random.rand(4)], columns=['a', 'b', 'c', 'd'],
  604. index=midx)
  605. # should work
  606. df.groupby(level='three')
  607. def test_groupby_level_no_obs(self):
  608. # #1697
  609. midx = MultiIndex.from_tuples([('f1', 's1'), ('f1', 's2'), (
  610. 'f2', 's1'), ('f2', 's2'), ('f3', 's1'), ('f3', 's2')])
  611. df = DataFrame(
  612. [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], columns=midx)
  613. df1 = df.loc(axis=1)[df.columns.map(
  614. lambda u: u[0] in ['f2', 'f3'])]
  615. grouped = df1.groupby(axis=1, level=0)
  616. result = grouped.sum()
  617. assert (result.columns == ['f2', 'f3']).all()
  618. def test_join(self):
  619. a = self.frame.loc[self.frame.index[:5], ['A']]
  620. b = self.frame.loc[self.frame.index[2:], ['B', 'C']]
  621. joined = a.join(b, how='outer').reindex(self.frame.index)
  622. expected = self.frame.copy()
  623. expected.values[np.isnan(joined.values)] = np.nan
  624. assert not np.isnan(joined.values).all()
  625. # TODO what should join do with names ?
  626. tm.assert_frame_equal(joined, expected, check_names=False)
  627. def test_swaplevel(self):
  628. swapped = self.frame['A'].swaplevel()
  629. swapped2 = self.frame['A'].swaplevel(0)
  630. swapped3 = self.frame['A'].swaplevel(0, 1)
  631. swapped4 = self.frame['A'].swaplevel('first', 'second')
  632. assert not swapped.index.equals(self.frame.index)
  633. tm.assert_series_equal(swapped, swapped2)
  634. tm.assert_series_equal(swapped, swapped3)
  635. tm.assert_series_equal(swapped, swapped4)
  636. back = swapped.swaplevel()
  637. back2 = swapped.swaplevel(0)
  638. back3 = swapped.swaplevel(0, 1)
  639. back4 = swapped.swaplevel('second', 'first')
  640. assert back.index.equals(self.frame.index)
  641. tm.assert_series_equal(back, back2)
  642. tm.assert_series_equal(back, back3)
  643. tm.assert_series_equal(back, back4)
  644. ft = self.frame.T
  645. swapped = ft.swaplevel('first', 'second', axis=1)
  646. exp = self.frame.swaplevel('first', 'second').T
  647. tm.assert_frame_equal(swapped, exp)
  648. def test_swaplevel_panel(self):
  649. with catch_warnings(record=True):
  650. simplefilter("ignore", FutureWarning)
  651. panel = Panel({'ItemA': self.frame, 'ItemB': self.frame * 2})
  652. expected = panel.copy()
  653. expected.major_axis = expected.major_axis.swaplevel(0, 1)
  654. for result in (panel.swaplevel(axis='major'),
  655. panel.swaplevel(0, axis='major'),
  656. panel.swaplevel(0, 1, axis='major')):
  657. tm.assert_panel_equal(result, expected)
  658. def test_reorder_levels(self):
  659. result = self.ymd.reorder_levels(['month', 'day', 'year'])
  660. expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2)
  661. tm.assert_frame_equal(result, expected)
  662. result = self.ymd['A'].reorder_levels(['month', 'day', 'year'])
  663. expected = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2)
  664. tm.assert_series_equal(result, expected)
  665. result = self.ymd.T.reorder_levels(['month', 'day', 'year'], axis=1)
  666. expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
  667. tm.assert_frame_equal(result, expected)
  668. with pytest.raises(TypeError, match='hierarchical axis'):
  669. self.ymd.reorder_levels([1, 2], axis=1)
  670. with pytest.raises(IndexError, match='Too many levels'):
  671. self.ymd.index.reorder_levels([1, 2, 3])
  672. def test_insert_index(self):
  673. df = self.ymd[:5].T
  674. df[2000, 1, 10] = df[2000, 1, 7]
  675. assert isinstance(df.columns, MultiIndex)
  676. assert (df[2000, 1, 10] == df[2000, 1, 7]).all()
  677. def test_alignment(self):
  678. x = Series(data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), (
  679. "A", 2), ("B", 3)]))
  680. y = Series(data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), (
  681. "Z", 2), ("B", 3)]))
  682. res = x - y
  683. exp_index = x.index.union(y.index)
  684. exp = x.reindex(exp_index) - y.reindex(exp_index)
  685. tm.assert_series_equal(res, exp)
  686. # hit non-monotonic code path
  687. res = x[::-1] - y[::-1]
  688. exp_index = x.index.union(y.index)
  689. exp = x.reindex(exp_index) - y.reindex(exp_index)
  690. tm.assert_series_equal(res, exp)
  def test_count(self):
      """count(level=...) accepts level names as well as positions."""
      name_to_position = [('b', 1), ('a', 0)]

      named_frame = self.frame.copy()
      named_frame.index.names = ['a', 'b']
      for name, position in name_to_position:
          by_name = named_frame.count(level=name)
          by_position = self.frame.count(level=position)
          tm.assert_frame_equal(by_name, by_position, check_names=False)

      named_series = self.series.copy()
      named_series.index.names = ['a', 'b']
      for name, position in name_to_position:
          by_name = named_series.count(level=name)
          by_position = self.series.count(level=position)
          tm.assert_series_equal(by_name, by_position, check_names=False)
          assert by_name.index.name == name

      # Level names that do not exist raise KeyError.
      with pytest.raises(KeyError):
          named_series.count('x')
      with pytest.raises(KeyError):
          named_frame.count(level='x')
  @pytest.mark.parametrize('op', AGG_FUNCTIONS)
  @pytest.mark.parametrize('level', [0, 1])
  @pytest.mark.parametrize('skipna', [True, False])
  @pytest.mark.parametrize('sort', [True, False])
  def test_series_group_min_max(self, op, level, skipna, sort):
      """Level-wise Series reductions match the groupby result (GH 17537).

      Runs for every reduction in AGG_FUNCTIONS, both levels, and both
      values of ``skipna`` and ``sort``.
      """
      from_groupby = self.series.groupby(level=level, sort=sort).agg(
          lambda x: getattr(x, op)(skipna=skipna))

      reduce_by_level = getattr(self.series, op)
      from_level = reduce_by_level(level=level, skipna=skipna)
      if sort:
          from_level = from_level.sort_index(level=level)

      tm.assert_series_equal(from_groupby, from_level)
  @pytest.mark.parametrize('op', AGG_FUNCTIONS)
  @pytest.mark.parametrize('level', [0, 1])
  @pytest.mark.parametrize('axis', [0, 1])
  @pytest.mark.parametrize('skipna', [True, False])
  @pytest.mark.parametrize('sort', [True, False])
  def test_frame_group_ops(self, op, level, axis, skipna, sort):
      """Level-wise DataFrame reductions match the groupby result.

      GH 17537. For ``axis=1`` the fixture frame is transposed so that the
      MultiIndex lies on the columns.
      """
      # Inject NaNs so the ``skipna`` parameter is exercised on missing data.
      self.frame.iloc[1, [1, 2]] = np.nan
      self.frame.iloc[7, [0, 1]] = np.nan

      frame = self.frame if axis == 0 else self.frame.T

      grouped = frame.groupby(level=level, axis=axis, sort=sort)

      def aggf(x):
          return getattr(x, op)(skipna=skipna, axis=axis)

      leftside = grouped.agg(aggf)
      rightside = getattr(frame, op)(level=level, axis=axis,
                                     skipna=skipna)
      if sort:
          rightside = rightside.sort_index(level=level, axis=axis)
          frame = frame.sort_index(level=level, axis=axis)

      # for good measure, groupby detail
      level_index = frame._get_axis(axis).levels[level]

      tm.assert_index_equal(leftside._get_axis(axis), level_index)
      tm.assert_index_equal(rightside._get_axis(axis), level_index)

      tm.assert_frame_equal(leftside, rightside)
  754. def test_stat_op_corner(self):
  755. obj = Series([10.0], index=MultiIndex.from_tuples([(2, 3)]))
  756. result = obj.sum(level=0)
  757. expected = Series([10.0], index=[2])
  758. tm.assert_series_equal(result, expected)
  759. def test_frame_any_all_group(self):
  760. df = DataFrame(
  761. {'data': [False, False, True, False, True, False, True]},
  762. index=[
  763. ['one', 'one', 'two', 'one', 'two', 'two', 'two'],
  764. [0, 1, 0, 2, 1, 2, 3]])
  765. result = df.any(level=0)
  766. ex = DataFrame({'data': [False, True]}, index=['one', 'two'])
  767. tm.assert_frame_equal(result, ex)
  768. result = df.all(level=0)
  769. ex = DataFrame({'data': [False, False]}, index=['one', 'two'])
  770. tm.assert_frame_equal(result, ex)
  771. def test_std_var_pass_ddof(self):
  772. index = MultiIndex.from_arrays([np.arange(5).repeat(10), np.tile(
  773. np.arange(10), 5)])
  774. df = DataFrame(np.random.randn(len(index), 5), index=index)
  775. for meth in ['var', 'std']:
  776. ddof = 4
  777. alt = lambda x: getattr(x, meth)(ddof=ddof)
  778. result = getattr(df[0], meth)(level=0, ddof=ddof)
  779. expected = df[0].groupby(level=0).agg(alt)
  780. tm.assert_series_equal(result, expected)
  781. result = getattr(df, meth)(level=0, ddof=ddof)
  782. expected = df.groupby(level=0).agg(alt)
  783. tm.assert_frame_equal(result, expected)
  784. def test_frame_series_agg_multiple_levels(self):
  785. result = self.ymd.sum(level=['year', 'month'])
  786. expected = self.ymd.groupby(level=['year', 'month']).sum()
  787. tm.assert_frame_equal(result, expected)
  788. result = self.ymd['A'].sum(level=['year', 'month'])
  789. expected = self.ymd['A'].groupby(level=['year', 'month']).sum()
  790. tm.assert_series_equal(result, expected)
  791. def test_groupby_multilevel(self):
  792. result = self.ymd.groupby(level=[0, 1]).mean()
  793. k1 = self.ymd.index.get_level_values(0)
  794. k2 = self.ymd.index.get_level_values(1)
  795. expected = self.ymd.groupby([k1, k2]).mean()
  796. # TODO groupby with level_values drops names
  797. tm.assert_frame_equal(result, expected, check_names=False)
  798. assert result.index.names == self.ymd.index.names[:2]
  799. result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean()
  800. tm.assert_frame_equal(result, result2)
  801. def test_groupby_multilevel_with_transform(self):
  802. pass
  803. def test_multilevel_consolidate(self):
  804. index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'), (
  805. 'bar', 'one'), ('bar', 'two')])
  806. df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
  807. df['Totals', ''] = df.sum(1)
  808. df = df._consolidate()
  809. def test_ix_preserve_names(self):
  810. result = self.ymd.loc[2000]
  811. result2 = self.ymd['A'].loc[2000]
  812. assert result.index.names == self.ymd.index.names[1:]
  813. assert result2.index.names == self.ymd.index.names[1:]
  814. result = self.ymd.loc[2000, 2]
  815. result2 = self.ymd['A'].loc[2000, 2]
  816. assert result.index.name == self.ymd.index.names[2]
  817. assert result2.index.name == self.ymd.index.names[2]
  818. def test_unstack_preserve_types(self):
  819. # GH #403
  820. self.ymd['E'] = 'foo'
  821. self.ymd['F'] = 2
  822. unstacked = self.ymd.unstack('month')
  823. assert unstacked['A', 1].dtype == np.float64
  824. assert unstacked['E', 1].dtype == np.object_
  825. assert unstacked['F', 1].dtype == np.float64
  826. def test_unstack_group_index_overflow(self):
  827. codes = np.tile(np.arange(500), 2)
  828. level = np.arange(500)
  829. index = MultiIndex(levels=[level] * 8 + [[0, 1]],
  830. codes=[codes] * 8 + [np.arange(2).repeat(500)])
  831. s = Series(np.arange(1000), index=index)
  832. result = s.unstack()
  833. assert result.shape == (500, 2)
  834. # test roundtrip
  835. stacked = result.stack()
  836. tm.assert_series_equal(s, stacked.reindex(s.index))
  837. # put it at beginning
  838. index = MultiIndex(levels=[[0, 1]] + [level] * 8,
  839. codes=[np.arange(2).repeat(500)] + [codes] * 8)
  840. s = Series(np.arange(1000), index=index)
  841. result = s.unstack(0)
  842. assert result.shape == (500, 2)
  843. # put it in middle
  844. index = MultiIndex(levels=[level] * 4 + [[0, 1]] + [level] * 4,
  845. codes=([codes] * 4 + [np.arange(2).repeat(500)] +
  846. [codes] * 4))
  847. s = Series(np.arange(1000), index=index)
  848. result = s.unstack(4)
  849. assert result.shape == (500, 2)
  850. def test_pyint_engine(self):
  851. # GH 18519 : when combinations of codes cannot be represented in 64
  852. # bits, the index underlying the MultiIndex engine works with Python
  853. # integers, rather than uint64.
  854. N = 5
  855. keys = [tuple(l) for l in [[0] * 10 * N,
  856. [1] * 10 * N,
  857. [2] * 10 * N,
  858. [np.nan] * N + [2] * 9 * N,
  859. [0] * N + [2] * 9 * N,
  860. [np.nan] * N + [2] * 8 * N + [0] * N]]
  861. # Each level contains 4 elements (including NaN), so it is represented
  862. # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a
  863. # 64 bit engine and truncating the first levels, the fourth and fifth
  864. # keys would collide; if truncating the last levels, the fifth and
  865. # sixth; if rotating bits rather than shifting, the third and fifth.
  866. for idx in range(len(keys)):
  867. index = MultiIndex.from_tuples(keys)
  868. assert index.get_loc(keys[idx]) == idx
  869. expected = np.arange(idx + 1, dtype=np.intp)
  870. result = index.get_indexer([keys[i] for i in expected])
  871. tm.assert_numpy_array_equal(result, expected)
  872. # With missing key:
  873. idces = range(len(keys))
  874. expected = np.array([-1] + list(idces), dtype=np.intp)
  875. missing = tuple([0, 1] * 5 * N)
  876. result = index.get_indexer([missing] + [keys[i] for i in idces])
  877. tm.assert_numpy_array_equal(result, expected)
  878. def test_to_html(self):
  879. self.ymd.columns.name = 'foo'
  880. self.ymd.to_html()
  881. self.ymd.T.to_html()
  def test_level_with_tuples(self):
      """Level values that are themselves tuples can be used as keys."""
      codes = [[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]

      def first_group(obj):
          # First two rows, with the tuple-valued level dropped.
          head = obj[:2]
          head.index = head.index.droplevel(0)
          return head

      # three-element tuples as values of the first level
      key = ('foo', 'bar', 0)
      index = MultiIndex(
          levels=[[key, ('foo', 'baz', 0), ('foo', 'qux', 0)], [0, 1]],
          codes=codes)
      series = Series(np.random.randn(6), index=index)
      frame = DataFrame(np.random.randn(6, 4), index=index)

      via_getitem = series[key]
      via_loc = series.loc[key]
      expected = first_group(series)
      tm.assert_series_equal(via_getitem, expected)
      tm.assert_series_equal(via_loc, expected)

      # A second-level label that does not exist is a KeyError.
      with pytest.raises(KeyError):
          series[(key, 2)]

      via_loc = frame.loc[key]
      via_xs = frame.xs(key)
      expected = first_group(frame)
      tm.assert_frame_equal(via_loc, expected)
      tm.assert_frame_equal(via_xs, expected)

      # two-element tuples as values of the first level
      key = ('foo', 'bar')
      index = MultiIndex(
          levels=[[key, ('foo', 'baz'), ('foo', 'qux')], [0, 1]],
          codes=codes)
      series = Series(np.random.randn(6), index=index)
      frame = DataFrame(np.random.randn(6, 4), index=index)

      via_getitem = series[key]
      via_loc = series.loc[key]
      expected = first_group(series)
      tm.assert_series_equal(via_getitem, expected)
      tm.assert_series_equal(via_loc, expected)

      via_loc = frame.loc[key]
      via_xs = frame.xs(key)
      expected = first_group(frame)
      tm.assert_frame_equal(via_loc, expected)
      tm.assert_frame_equal(via_xs, expected)
  918. def test_mixed_depth_drop(self):
  919. arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
  920. ['', 'OD', 'OD', 'result1', 'result2', 'result1'],
  921. ['', 'wx', 'wy', '', '', '']]
  922. tuples = sorted(zip(*arrays))
  923. index = MultiIndex.from_tuples(tuples)
  924. df = DataFrame(randn(4, 6), columns=index)
  925. result = df.drop('a', axis=1)
  926. expected = df.drop([('a', '', '')], axis=1)
  927. tm.assert_frame_equal(expected, result)
  928. result = df.drop(['top'], axis=1)
  929. expected = df.drop([('top', 'OD', 'wx')], axis=1)
  930. expected = expected.drop([('top', 'OD', 'wy')], axis=1)
  931. tm.assert_frame_equal(expected, result)
  932. result = df.drop(('top', 'OD', 'wx'), axis=1)
  933. expected = df.drop([('top', 'OD', 'wx')], axis=1)
  934. tm.assert_frame_equal(expected, result)
  935. expected = df.drop([('top', 'OD', 'wy')], axis=1)
  936. expected = df.drop('top', axis=1)
  937. result = df.drop('result1', level=1, axis=1)
  938. expected = df.drop([('routine1', 'result1', ''),
  939. ('routine2', 'result1', '')], axis=1)
  940. tm.assert_frame_equal(expected, result)
  941. def test_drop_nonunique(self):
  942. df = DataFrame([["x-a", "x", "a", 1.5], ["x-a", "x", "a", 1.2],
  943. ["z-c", "z", "c", 3.1], ["x-a", "x", "a", 4.1],
  944. ["x-b", "x", "b", 5.1], ["x-b", "x", "b", 4.1],
  945. ["x-b", "x", "b", 2.2],
  946. ["y-a", "y", "a", 1.2], ["z-b", "z", "b", 2.1]],
  947. columns=["var1", "var2", "var3", "var4"])
  948. grp_size = df.groupby("var1").size()
  949. drop_idx = grp_size.loc[grp_size == 1]
  950. idf = df.set_index(["var1", "var2", "var3"])
  951. # it works! #2101
  952. result = idf.drop(drop_idx.index, level=0).reset_index()
  953. expected = df[-df.var1.isin(drop_idx.index)]
  954. result.index = expected.index
  955. tm.assert_frame_equal(result, expected)
  956. def test_mixed_depth_pop(self):
  957. arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
  958. ['', 'OD', 'OD', 'result1', 'result2', 'result1'],
  959. ['', 'wx', 'wy', '', '', '']]
  960. tuples = sorted(zip(*arrays))
  961. index = MultiIndex.from_tuples(tuples)
  962. df = DataFrame(randn(4, 6), columns=index)
  963. df1 = df.copy()
  964. df2 = df.copy()
  965. result = df1.pop('a')
  966. expected = df2.pop(('a', '', ''))
  967. tm.assert_series_equal(expected, result, check_names=False)
  968. tm.assert_frame_equal(df1, df2)
  969. assert result.name == 'a'
  970. expected = df1['top']
  971. df1 = df1.drop(['top'], axis=1)
  972. result = df2.pop('top')
  973. tm.assert_frame_equal(expected, result)
  974. tm.assert_frame_equal(df1, df2)
  975. def test_reindex_level_partial_selection(self):
  976. result = self.frame.reindex(['foo', 'qux'], level=0)
  977. expected = self.frame.iloc[[0, 1, 2, 7, 8, 9]]
  978. tm.assert_frame_equal(result, expected)
  979. result = self.frame.T.reindex(['foo', 'qux'], axis=1, level=0)
  980. tm.assert_frame_equal(result, expected.T)
  981. result = self.frame.loc[['foo', 'qux']]
  982. tm.assert_frame_equal(result, expected)
  983. result = self.frame['A'].loc[['foo', 'qux']]
  984. tm.assert_series_equal(result, expected['A'])
  985. result = self.frame.T.loc[:, ['foo', 'qux']]
  986. tm.assert_frame_equal(result, expected.T)
  987. def test_drop_level(self):
  988. result = self.frame.drop(['bar', 'qux'], level='first')
  989. expected = self.frame.iloc[[0, 1, 2, 5, 6]]
  990. tm.assert_frame_equal(result, expected)
  991. result = self.frame.drop(['two'], level='second')
  992. expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]]
  993. tm.assert_frame_equal(result, expected)
  994. result = self.frame.T.drop(['bar', 'qux'], axis=1, level='first')
  995. expected = self.frame.iloc[[0, 1, 2, 5, 6]].T
  996. tm.assert_frame_equal(result, expected)
  997. result = self.frame.T.drop(['two'], axis=1, level='second')
  998. expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]].T
  999. tm.assert_frame_equal(result, expected)
  1000. def test_drop_level_nonunique_datetime(self):
  1001. # GH 12701
  1002. idx = Index([2, 3, 4, 4, 5], name='id')
  1003. idxdt = pd.to_datetime(['201603231400',
  1004. '201603231500',
  1005. '201603231600',
  1006. '201603231600',
  1007. '201603231700'])
  1008. df = DataFrame(np.arange(10).reshape(5, 2),
  1009. columns=list('ab'), index=idx)
  1010. df['tstamp'] = idxdt
  1011. df = df.set_index('tstamp', append=True)
  1012. ts = Timestamp('201603231600')
  1013. assert df.index.is_unique is False
  1014. result = df.drop(ts, level='tstamp')
  1015. expected = df.loc[idx != 4]
  1016. tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize('box', [Series, DataFrame])
  def test_drop_tz_aware_timestamp_across_dst(self, box):
      """Drop the first label of a tz-aware 15-minute range (GH 21761)."""
      zone = 'Europe/Berlin'
      first = Timestamp('2017-10-29', tz=zone)
      last = Timestamp('2017-10-29 04:00:00', tz=zone)
      full_index = pd.date_range(first, last, freq='15min')
      obj = box(data=[1] * len(full_index), index=full_index)

      result = obj.drop(first)

      # Everything from the second label onwards must remain.
      second = Timestamp('2017-10-29 00:15:00', tz=zone)
      remaining = pd.date_range(second, last, freq='15min')
      expected = box(data=[1] * len(remaining), index=remaining)
      tm.assert_equal(result, expected)
  1029. def test_drop_preserve_names(self):
  1030. index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1],
  1031. [1, 2, 3, 1, 2, 3]],
  1032. names=['one', 'two'])
  1033. df = DataFrame(np.random.randn(6, 3), index=index)
  1034. result = df.drop([(0, 2)])
  1035. assert result.index.names == ('one', 'two')
  1036. def test_unicode_repr_issues(self):
  1037. levels = [Index([u('a/\u03c3'), u('b/\u03c3'), u('c/\u03c3')]),
  1038. Index([0, 1])]
  1039. codes = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)]
  1040. index = MultiIndex(levels=levels, codes=codes)
  1041. repr(index.levels)
  1042. # NumPy bug
  1043. # repr(index.get_level_values(1))
  1044. def test_unicode_repr_level_names(self):
  1045. index = MultiIndex.from_tuples([(0, 0), (1, 1)],
  1046. names=[u('\u0394'), 'i1'])
  1047. s = Series(lrange(2), index=index)
  1048. df = DataFrame(np.random.randn(2, 4), index=index)
  1049. repr(s)
  1050. repr(df)
  1051. def test_join_segfault(self):
  1052. # 1532
  1053. df1 = DataFrame({'a': [1, 1], 'b': [1, 2], 'x': [1, 2]})
  1054. df2 = DataFrame({'a': [2, 2], 'b': [1, 2], 'y': [1, 2]})
  1055. df1 = df1.set_index(['a', 'b'])
  1056. df2 = df2.set_index(['a', 'b'])
  1057. # it works!
  1058. for how in ['left', 'right', 'outer']:
  1059. df1.join(df2, how=how)
  1060. def test_frame_dict_constructor_empty_series(self):
  1061. s1 = Series([
  1062. 1, 2, 3, 4
  1063. ], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2), (2, 4)]))
  1064. s2 = Series([
  1065. 1, 2, 3, 4
  1066. ], index=MultiIndex.from_tuples([(1, 2), (1, 3), (3, 2), (3, 4)]))
  1067. s3 = Series()
  1068. # it works!
  1069. DataFrame({'foo': s1, 'bar': s2, 'baz': s3})
  1070. DataFrame.from_dict({'foo': s1, 'baz': s3, 'bar': s2})
  1071. def test_multiindex_na_repr(self):
  1072. # only an issue with long columns
  1073. from numpy import nan
  1074. df3 = DataFrame({
  1075. 'A' * 30: {('A', 'A0006000', 'nuit'): 'A0006000'},
  1076. 'B' * 30: {('A', 'A0006000', 'nuit'): nan},
  1077. 'C' * 30: {('A', 'A0006000', 'nuit'): nan},
  1078. 'D' * 30: {('A', 'A0006000', 'nuit'): nan},
  1079. 'E' * 30: {('A', 'A0006000', 'nuit'): 'A'},
  1080. 'F' * 30: {('A', 'A0006000', 'nuit'): nan},
  1081. })
  1082. idf = df3.set_index(['A' * 30, 'C' * 30])
  1083. repr(idf)
  1084. def test_assign_index_sequences(self):
  1085. # #2200
  1086. df = DataFrame({"a": [1, 2, 3],
  1087. "b": [4, 5, 6],
  1088. "c": [7, 8, 9]}).set_index(["a", "b"])
  1089. index = list(df.index)
  1090. index[0] = ("faz", "boo")
  1091. df.index = index
  1092. repr(df)
  1093. # this travels an improper code path
  1094. index[0] = ["faz", "boo"]
  1095. df.index = index
  1096. repr(df)
  1097. def test_tuples_have_na(self):
  1098. index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
  1099. codes=[[1, 1, 1, 1, -1, 0, 0, 0],
  1100. [0, 1, 2, 3, 0, 1, 2, 3]])
  1101. assert isna(index[4][0])
  1102. assert isna(index.values[4][0])
  1103. def test_duplicate_groupby_issues(self):
  1104. idx_tp = [('600809', '20061231'), ('600809', '20070331'),
  1105. ('600809', '20070630'), ('600809', '20070331')]
  1106. dt = ['demo', 'demo', 'demo', 'demo']
  1107. idx = MultiIndex.from_tuples(idx_tp, names=['STK_ID', 'RPT_Date'])
  1108. s = Series(dt, index=idx)
  1109. result = s.groupby(s.index).first()
  1110. assert len(result) == 3
  1111. def test_duplicate_mi(self):
  1112. # GH 4516
  1113. df = DataFrame([['foo', 'bar', 1.0, 1], ['foo', 'bar', 2.0, 2],
  1114. ['bah', 'bam', 3.0, 3],
  1115. ['bah', 'bam', 4.0, 4], ['foo', 'bar', 5.0, 5],
  1116. ['bah', 'bam', 6.0, 6]],
  1117. columns=list('ABCD'))
  1118. df = df.set_index(['A', 'B'])
  1119. df = df.sort_index(level=0)
  1120. expected = DataFrame([['foo', 'bar', 1.0, 1], ['foo', 'bar', 2.0, 2],
  1121. ['foo', 'bar', 5.0, 5]],
  1122. columns=list('ABCD')).set_index(['A', 'B'])
  1123. result = df.loc[('foo', 'bar')]
  1124. tm.assert_frame_equal(result, expected)
  1125. def test_duplicated_drop_duplicates(self):
  1126. # GH 4060
  1127. idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2, 3], [1, 1, 1, 1, 2, 2]))
  1128. expected = np.array(
  1129. [False, False, False, True, False, False], dtype=bool)
  1130. duplicated = idx.duplicated()
  1131. tm.assert_numpy_array_equal(duplicated, expected)
  1132. assert duplicated.dtype == bool
  1133. expected = MultiIndex.from_arrays(([1, 2, 3, 2, 3], [1, 1, 1, 2, 2]))
  1134. tm.assert_index_equal(idx.drop_duplicates(), expected)
  1135. expected = np.array([True, False, False, False, False, False])
  1136. duplicated = idx.duplicated(keep='last')
  1137. tm.assert_numpy_array_equal(duplicated, expected)
  1138. assert duplicated.dtype == bool
  1139. expected = MultiIndex.from_arrays(([2, 3, 1, 2, 3], [1, 1, 1, 2, 2]))
  1140. tm.assert_index_equal(idx.drop_duplicates(keep='last'), expected)
  1141. expected = np.array([True, False, False, True, False, False])
  1142. duplicated = idx.duplicated(keep=False)
  1143. tm.assert_numpy_array_equal(duplicated, expected)
  1144. assert duplicated.dtype == bool
  1145. expected = MultiIndex.from_arrays(([2, 3, 2, 3], [1, 1, 2, 2]))
  1146. tm.assert_index_equal(idx.drop_duplicates(keep=False), expected)
  1147. def test_multiindex_set_index(self):
  1148. # segfault in #3308
  1149. d = {'t1': [2, 2.5, 3], 't2': [4, 5, 6]}
  1150. df = DataFrame(d)
  1151. tuples = [(0, 1), (0, 2), (1, 2)]
  1152. df['tuples'] = tuples
  1153. index = MultiIndex.from_tuples(df['tuples'])
  1154. # it works!
  1155. df.set_index(index)
  1156. def test_datetimeindex(self):
  1157. idx1 = pd.DatetimeIndex(
  1158. ['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00'
  1159. ] * 2, tz='Asia/Tokyo')
  1160. idx2 = pd.date_range('2010/01/01', periods=6, freq='M',
  1161. tz='US/Eastern')
  1162. idx = MultiIndex.from_arrays([idx1, idx2])
  1163. expected1 = pd.DatetimeIndex(['2013-04-01 9:00', '2013-04-02 9:00',
  1164. '2013-04-03 9:00'], tz='Asia/Tokyo')
  1165. tm.assert_index_equal(idx.levels[0], expected1)
  1166. tm.assert_index_equal(idx.levels[1], idx2)
  1167. # from datetime combos
  1168. # GH 7888
  1169. date1 = datetime.date.today()
  1170. date2 = datetime.datetime.today()
  1171. date3 = Timestamp.today()
  1172. for d1, d2 in itertools.product(
  1173. [date1, date2, date3], [date1, date2, date3]):
  1174. index = MultiIndex.from_product([[d1], [d2]])
  1175. assert isinstance(index.levels[0], pd.DatetimeIndex)
  1176. assert isinstance(index.levels[1], pd.DatetimeIndex)
  1177. def test_constructor_with_tz(self):
  1178. index = pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'],
  1179. name='dt1', tz='US/Pacific')
  1180. columns = pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'],
  1181. name='dt2', tz='Asia/Tokyo')
  1182. result = MultiIndex.from_arrays([index, columns])
  1183. tm.assert_index_equal(result.levels[0], index)
  1184. tm.assert_index_equal(result.levels[1], columns)
  1185. result = MultiIndex.from_arrays([Series(index), Series(columns)])
  1186. tm.assert_index_equal(result.levels[0], index)
  1187. tm.assert_index_equal(result.levels[1], columns)
  1188. def test_set_index_datetime(self):
  1189. # GH 3950
  1190. df = DataFrame(
  1191. {'label': ['a', 'a', 'a', 'b', 'b', 'b'],
  1192. 'datetime': ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
  1193. '2011-07-19 09:00:00', '2011-07-19 07:00:00',
  1194. '2011-07-19 08:00:00', '2011-07-19 09:00:00'],
  1195. 'value': range(6)})
  1196. df.index = pd.to_datetime(df.pop('datetime'), utc=True)
  1197. df.index = df.index.tz_convert('US/Pacific')
  1198. expected = pd.DatetimeIndex(['2011-07-19 07:00:00',
  1199. '2011-07-19 08:00:00',
  1200. '2011-07-19 09:00:00'], name='datetime')
  1201. expected = expected.tz_localize('UTC').tz_convert('US/Pacific')
  1202. df = df.set_index('label', append=True)
  1203. tm.assert_index_equal(df.index.levels[0], expected)
  1204. tm.assert_index_equal(df.index.levels[1],
  1205. Index(['a', 'b'], name='label'))
  1206. df = df.swaplevel(0, 1)
  1207. tm.assert_index_equal(df.index.levels[0],
  1208. Index(['a', 'b'], name='label'))
  1209. tm.assert_index_equal(df.index.levels[1], expected)
  1210. df = DataFrame(np.random.random(6))
  1211. idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00',
  1212. '2011-07-19 09:00:00', '2011-07-19 07:00:00',
  1213. '2011-07-19 08:00:00', '2011-07-19 09:00:00'],
  1214. tz='US/Eastern')
  1215. idx2 = pd.DatetimeIndex(['2012-04-01 09:00', '2012-04-01 09:00',
  1216. '2012-04-01 09:00', '2012-04-02 09:00',
  1217. '2012-04-02 09:00', '2012-04-02 09:00'],
  1218. tz='US/Eastern')
  1219. idx3 = pd.date_range('2011-01-01 09:00', periods=6, tz='Asia/Tokyo')
  1220. df = df.set_index(idx1)
  1221. df = df.set_index(idx2, append=True)
  1222. df = df.set_index(idx3, append=True)
  1223. expected1 = pd.DatetimeIndex(['2011-07-19 07:00:00',
  1224. '2011-07-19 08:00:00',
  1225. '2011-07-19 09:00:00'], tz='US/Eastern')
  1226. expected2 = pd.DatetimeIndex(['2012-04-01 09:00', '2012-04-02 09:00'],
  1227. tz='US/Eastern')
  1228. tm.assert_index_equal(df.index.levels[0], expected1)
  1229. tm.assert_index_equal(df.index.levels[1], expected2)
  1230. tm.assert_index_equal(df.index.levels[2], idx3)
  1231. # GH 7092
  1232. tm.assert_index_equal(df.index.get_level_values(0), idx1)
  1233. tm.assert_index_equal(df.index.get_level_values(1), idx2)
  1234. tm.assert_index_equal(df.index.get_level_values(2), idx3)
  1235. def test_reset_index_datetime(self):
  1236. # GH 3950
  1237. for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']:
  1238. idx1 = pd.date_range('1/1/2011', periods=5, freq='D', tz=tz,
  1239. name='idx1')
  1240. idx2 = Index(range(5), name='idx2', dtype='int64')
  1241. idx = MultiIndex.from_arrays([idx1, idx2])
  1242. df = DataFrame(
  1243. {'a': np.arange(5, dtype='int64'),
  1244. 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx)
  1245. expected = DataFrame({'idx1': [datetime.datetime(2011, 1, 1),
  1246. datetime.datetime(2011, 1, 2),
  1247. datetime.datetime(2011, 1, 3),
  1248. datetime.datetime(2011, 1, 4),
  1249. datetime.datetime(2011, 1, 5)],
  1250. 'idx2': np.arange(5, dtype='int64'),
  1251. 'a': np.arange(5, dtype='int64'),
  1252. 'b': ['A', 'B', 'C', 'D', 'E']},
  1253. columns=['idx1', 'idx2', 'a', 'b'])
  1254. expected['idx1'] = expected['idx1'].apply(
  1255. lambda d: Timestamp(d, tz=tz))
  1256. tm.assert_frame_equal(df.reset_index(), expected)
  1257. idx3 = pd.date_range('1/1/2012', periods=5, freq='MS',
  1258. tz='Europe/Paris', name='idx3')
  1259. idx = MultiIndex.from_arrays([idx1, idx2, idx3])
  1260. df = DataFrame(
  1261. {'a': np.arange(5, dtype='int64'),
  1262. 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx)
  1263. expected = DataFrame({'idx1': [datetime.datetime(2011, 1, 1),
  1264. datetime.datetime(2011, 1, 2),
  1265. datetime.datetime(2011, 1, 3),
  1266. datetime.datetime(2011, 1, 4),
  1267. datetime.datetime(2011, 1, 5)],
  1268. 'idx2': np.arange(5, dtype='int64'),
  1269. 'idx3': [datetime.datetime(2012, 1, 1),
  1270. datetime.datetime(2012, 2, 1),
  1271. datetime.datetime(2012, 3, 1),
  1272. datetime.datetime(2012, 4, 1),
  1273. datetime.datetime(2012, 5, 1)],
  1274. 'a': np.arange(5, dtype='int64'),
  1275. 'b': ['A', 'B', 'C', 'D', 'E']},
  1276. columns=['idx1', 'idx2', 'idx3', 'a', 'b'])
  1277. expected['idx1'] = expected['idx1'].apply(
  1278. lambda d: Timestamp(d, tz=tz))
  1279. expected['idx3'] = expected['idx3'].apply(
  1280. lambda d: Timestamp(d, tz='Europe/Paris'))
  1281. tm.assert_frame_equal(df.reset_index(), expected)
  1282. # GH 7793
  1283. idx = MultiIndex.from_product([['a', 'b'], pd.date_range(
  1284. '20130101', periods=3, tz=tz)])
  1285. df = DataFrame(
  1286. np.arange(6, dtype='int64').reshape(
  1287. 6, 1), columns=['a'], index=idx)
  1288. expected = DataFrame({'level_0': 'a a a b b b'.split(),
  1289. 'level_1': [
  1290. datetime.datetime(2013, 1, 1),
  1291. datetime.datetime(2013, 1, 2),
  1292. datetime.datetime(2013, 1, 3)] * 2,
  1293. 'a': np.arange(6, dtype='int64')},
  1294. columns=['level_0', 'level_1', 'a'])
  1295. expected['level_1'] = expected['level_1'].apply(
  1296. lambda d: Timestamp(d, freq='D', tz=tz))
  1297. tm.assert_frame_equal(df.reset_index(), expected)
  1298. def test_reset_index_period(self):
  1299. # GH 7746
  1300. idx = MultiIndex.from_product(
  1301. [pd.period_range('20130101', periods=3, freq='M'), list('abc')],
  1302. names=['month', 'feature'])
  1303. df = DataFrame(np.arange(9, dtype='int64').reshape(-1, 1),
  1304. index=idx, columns=['a'])
  1305. expected = DataFrame({
  1306. 'month': ([pd.Period('2013-01', freq='M')] * 3 +
  1307. [pd.Period('2013-02', freq='M')] * 3 +
  1308. [pd.Period('2013-03', freq='M')] * 3),
  1309. 'feature': ['a', 'b', 'c'] * 3,
  1310. 'a': np.arange(9, dtype='int64')
  1311. }, columns=['month', 'feature', 'a'])
  1312. tm.assert_frame_equal(df.reset_index(), expected)
  def test_reset_index_multiindex_columns(self):
      """reset_index inserts index levels into MultiIndex columns."""
      column_keys = [['A', ''], ['B', 'b']]
      df = DataFrame([[0, 2], [1, 3]],
                     columns=MultiIndex.from_tuples(column_keys))

      # A plain index name lands on the top column level.
      result = df[['B']].rename_axis('A').reset_index()
      tm.assert_frame_equal(result, df)

      # gh-16120: already existing column
      already_exists = (r"cannot insert \('A', ''\), "
                        "already exists")
      with pytest.raises(ValueError, match=already_exists):
          df.rename_axis('A').reset_index()

      # gh-16164: multiindex (tuple) full key
      result = df.set_index([('A', '')]).reset_index()
      tm.assert_frame_equal(result, df)

      # with additional (unnamed) index level
      idx_col = DataFrame([[0], [1]],
                          columns=MultiIndex.from_tuples([('level_0', '')]))
      expected = pd.concat([idx_col, df[[('B', 'b'), ('A', '')]]], axis=1)
      result = df.set_index([('B', 'b')], append=True).reset_index()
      tm.assert_frame_equal(result, expected)

      # with index name which is a too long tuple...
      wrong_length = ("Item must have length equal "
                      "to number of levels.")
      with pytest.raises(ValueError, match=wrong_length):
          df.rename_axis([('C', 'c', 'i')]).reset_index()

      # or too short...
      deeper_keys = [['A', 'a', ''], ['B', 'b', 'i']]
      df2 = DataFrame([[0, 2], [1, 3]],
                      columns=MultiIndex.from_tuples(deeper_keys))
      idx_col = DataFrame([[0], [1]],
                          columns=MultiIndex.from_tuples([('C', 'c', 'ii')]))
      expected = pd.concat([idx_col, df2], axis=1)
      result = df2.rename_axis([('C', 'c')]).reset_index(col_fill='ii')
      tm.assert_frame_equal(result, expected)

      # ... which is incompatible with col_fill=None
      incomplete_name = ("col_fill=None is incompatible with "
                         r"incomplete column name \('C', 'c'\)")
      with pytest.raises(ValueError, match=incomplete_name):
          df2.rename_axis([('C', 'c')]).reset_index(col_fill=None)

      # with col_level != 0
      renamed = df2.rename_axis([('c', 'ii')])
      result = renamed.reset_index(col_level=1, col_fill='C')
      tm.assert_frame_equal(result, expected)
  1356. def test_set_index_period(self):
  1357. # GH 6631
  1358. df = DataFrame(np.random.random(6))
  1359. idx1 = pd.period_range('2011-01-01', periods=3, freq='M')
  1360. idx1 = idx1.append(idx1)
  1361. idx2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H')
  1362. idx2 = idx2.append(idx2).append(idx2)
  1363. idx3 = pd.period_range('2005', periods=6, freq='A')
  1364. df = df.set_index(idx1)
  1365. df = df.set_index(idx2, append=True)
  1366. df = df.set_index(idx3, append=True)
  1367. expected1 = pd.period_range('2011-01-01', periods=3, freq='M')
  1368. expected2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H')
  1369. tm.assert_index_equal(df.index.levels[0], expected1)
  1370. tm.assert_index_equal(df.index.levels[1], expected2)
  1371. tm.assert_index_equal(df.index.levels[2], idx3)
  1372. tm.assert_index_equal(df.index.get_level_values(0), idx1)
  1373. tm.assert_index_equal(df.index.get_level_values(1), idx2)
  1374. tm.assert_index_equal(df.index.get_level_values(2), idx3)
  1375. def test_repeat(self):
  1376. # GH 9361
  1377. # fixed by # GH 7891
  1378. m_idx = MultiIndex.from_tuples([(1, 2), (3, 4), (5, 6), (7, 8)])
  1379. data = ['a', 'b', 'c', 'd']
  1380. m_df = Series(data, index=m_idx)
  1381. assert m_df.repeat(3).shape == (3 * len(data), )
  1382. class TestSorted(Base):
  1383. """ everything you wanted to test about sorting """
  1384. def test_sort_index_preserve_levels(self):
  1385. result = self.frame.sort_index()
  1386. assert result.index.names == self.frame.index.names
  1387. def test_sorting_repr_8017(self):
  1388. np.random.seed(0)
  1389. data = np.random.randn(3, 4)
  1390. for gen, extra in [([1., 3., 2., 5.], 4.), ([1, 3, 2, 5], 4),
  1391. ([Timestamp('20130101'), Timestamp('20130103'),
  1392. Timestamp('20130102'), Timestamp('20130105')],
  1393. Timestamp('20130104')),
  1394. (['1one', '3one', '2one', '5one'], '4one')]:
  1395. columns = MultiIndex.from_tuples([('red', i) for i in gen])
  1396. df = DataFrame(data, index=list('def'), columns=columns)
  1397. df2 = pd.concat([df,
  1398. DataFrame('world', index=list('def'),
  1399. columns=MultiIndex.from_tuples(
  1400. [('red', extra)]))], axis=1)
  1401. # check that the repr is good
  1402. # make sure that we have a correct sparsified repr
  1403. # e.g. only 1 header of read
  1404. assert str(df2).splitlines()[0].split() == ['red']
  1405. # GH 8017
  1406. # sorting fails after columns added
  1407. # construct single-dtype then sort
  1408. result = df.copy().sort_index(axis=1)
  1409. expected = df.iloc[:, [0, 2, 1, 3]]
  1410. tm.assert_frame_equal(result, expected)
  1411. result = df2.sort_index(axis=1)
  1412. expected = df2.iloc[:, [0, 2, 1, 4, 3]]
  1413. tm.assert_frame_equal(result, expected)
  1414. # setitem then sort
  1415. result = df.copy()
  1416. result[('red', extra)] = 'world'
  1417. result = result.sort_index(axis=1)
  1418. tm.assert_frame_equal(result, expected)
  1419. def test_sort_index_level(self):
  1420. df = self.frame.copy()
  1421. df.index = np.arange(len(df))
  1422. # axis=1
  1423. # series
  1424. a_sorted = self.frame['A'].sort_index(level=0)
  1425. # preserve names
  1426. assert a_sorted.index.names == self.frame.index.names
  1427. # inplace
  1428. rs = self.frame.copy()
  1429. rs.sort_index(level=0, inplace=True)
  1430. tm.assert_frame_equal(rs, self.frame.sort_index(level=0))
  1431. def test_sort_index_level_large_cardinality(self):
  1432. # #2684 (int64)
  1433. index = MultiIndex.from_arrays([np.arange(4000)] * 3)
  1434. df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64)
  1435. # it works!
  1436. result = df.sort_index(level=0)
  1437. assert result.index.lexsort_depth == 3
  1438. # #2684 (int32)
  1439. index = MultiIndex.from_arrays([np.arange(4000)] * 3)
  1440. df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32)
  1441. # it works!
  1442. result = df.sort_index(level=0)
  1443. assert (result.dtypes.values == df.dtypes.values).all()
  1444. assert result.index.lexsort_depth == 3
  1445. def test_sort_index_level_by_name(self):
  1446. self.frame.index.names = ['first', 'second']
  1447. result = self.frame.sort_index(level='second')
  1448. expected = self.frame.sort_index(level=1)
  1449. tm.assert_frame_equal(result, expected)
  1450. def test_sort_index_level_mixed(self):
  1451. sorted_before = self.frame.sort_index(level=1)
  1452. df = self.frame.copy()
  1453. df['foo'] = 'bar'
  1454. sorted_after = df.sort_index(level=1)
  1455. tm.assert_frame_equal(sorted_before,
  1456. sorted_after.drop(['foo'], axis=1))
  1457. dft = self.frame.T
  1458. sorted_before = dft.sort_index(level=1, axis=1)
  1459. dft['foo', 'three'] = 'bar'
  1460. sorted_after = dft.sort_index(level=1, axis=1)
  1461. tm.assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1),
  1462. sorted_after.drop([('foo', 'three')], axis=1))
  1463. def test_is_lexsorted(self):
  1464. levels = [[0, 1], [0, 1, 2]]
  1465. index = MultiIndex(levels=levels,
  1466. codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
  1467. assert index.is_lexsorted()
  1468. index = MultiIndex(levels=levels,
  1469. codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]])
  1470. assert not index.is_lexsorted()
  1471. index = MultiIndex(levels=levels,
  1472. codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]])
  1473. assert not index.is_lexsorted()
  1474. assert index.lexsort_depth == 0
  1475. def test_sort_index_and_reconstruction(self):
  1476. # 15622
  1477. # lexsortedness should be identical
  1478. # across MultiIndex consruction methods
  1479. df = DataFrame([[1, 1], [2, 2]], index=list('ab'))
  1480. expected = DataFrame([[1, 1], [2, 2], [1, 1], [2, 2]],
  1481. index=MultiIndex.from_tuples([(0.5, 'a'),
  1482. (0.5, 'b'),
  1483. (0.8, 'a'),
  1484. (0.8, 'b')]))
  1485. assert expected.index.is_lexsorted()
  1486. result = DataFrame(
  1487. [[1, 1], [2, 2], [1, 1], [2, 2]],
  1488. index=MultiIndex.from_product([[0.5, 0.8], list('ab')]))
  1489. result = result.sort_index()
  1490. assert result.index.is_lexsorted()
  1491. assert result.index.is_monotonic
  1492. tm.assert_frame_equal(result, expected)
  1493. result = DataFrame(
  1494. [[1, 1], [2, 2], [1, 1], [2, 2]],
  1495. index=MultiIndex(levels=[[0.5, 0.8], ['a', 'b']],
  1496. codes=[[0, 0, 1, 1], [0, 1, 0, 1]]))
  1497. result = result.sort_index()
  1498. assert result.index.is_lexsorted()
  1499. tm.assert_frame_equal(result, expected)
  1500. concatted = pd.concat([df, df], keys=[0.8, 0.5])
  1501. result = concatted.sort_index()
  1502. assert result.index.is_lexsorted()
  1503. assert result.index.is_monotonic
  1504. tm.assert_frame_equal(result, expected)
  1505. # 14015
  1506. df = DataFrame([[1, 2], [6, 7]],
  1507. columns=MultiIndex.from_tuples(
  1508. [(0, '20160811 12:00:00'),
  1509. (0, '20160809 12:00:00')],
  1510. names=['l1', 'Date']))
  1511. df.columns.set_levels(pd.to_datetime(df.columns.levels[1]),
  1512. level=1,
  1513. inplace=True)
  1514. assert not df.columns.is_lexsorted()
  1515. assert not df.columns.is_monotonic
  1516. result = df.sort_index(axis=1)
  1517. assert result.columns.is_lexsorted()
  1518. assert result.columns.is_monotonic
  1519. result = df.sort_index(axis=1, level=1)
  1520. assert result.columns.is_lexsorted()
  1521. assert result.columns.is_monotonic
  1522. def test_sort_index_and_reconstruction_doc_example(self):
  1523. # doc example
  1524. df = DataFrame({'value': [1, 2, 3, 4]},
  1525. index=MultiIndex(
  1526. levels=[['a', 'b'], ['bb', 'aa']],
  1527. codes=[[0, 0, 1, 1], [0, 1, 0, 1]]))
  1528. assert df.index.is_lexsorted()
  1529. assert not df.index.is_monotonic
  1530. # sort it
  1531. expected = DataFrame({'value': [2, 1, 4, 3]},
  1532. index=MultiIndex(
  1533. levels=[['a', 'b'], ['aa', 'bb']],
  1534. codes=[[0, 0, 1, 1], [0, 1, 0, 1]]))
  1535. result = df.sort_index()
  1536. assert result.index.is_lexsorted()
  1537. assert result.index.is_monotonic
  1538. tm.assert_frame_equal(result, expected)
  1539. # reconstruct
  1540. result = df.sort_index().copy()
  1541. result.index = result.index._sort_levels_monotonic()
  1542. assert result.index.is_lexsorted()
  1543. assert result.index.is_monotonic
  1544. tm.assert_frame_equal(result, expected)
  1545. def test_sort_index_reorder_on_ops(self):
  1546. # 15687
  1547. df = DataFrame(
  1548. np.random.randn(8, 2),
  1549. index=MultiIndex.from_product(
  1550. [['a', 'b'], ['big', 'small'], ['red', 'blu']],
  1551. names=['letter', 'size', 'color']),
  1552. columns=['near', 'far'])
  1553. df = df.sort_index()
  1554. def my_func(group):
  1555. group.index = ['newz', 'newa']
  1556. return group
  1557. result = df.groupby(level=['letter', 'size']).apply(
  1558. my_func).sort_index()
  1559. expected = MultiIndex.from_product(
  1560. [['a', 'b'], ['big', 'small'], ['newa', 'newz']],
  1561. names=['letter', 'size', None])
  1562. tm.assert_index_equal(result.index, expected)
  1563. def test_sort_non_lexsorted(self):
  1564. # degenerate case where we sort but don't
  1565. # have a satisfying result :<
  1566. # GH 15797
  1567. idx = MultiIndex([['A', 'B', 'C'],
  1568. ['c', 'b', 'a']],
  1569. [[0, 1, 2, 0, 1, 2],
  1570. [0, 2, 1, 1, 0, 2]])
  1571. df = DataFrame({'col': range(len(idx))},
  1572. index=idx,
  1573. dtype='int64')
  1574. assert df.index.is_lexsorted() is False
  1575. assert df.index.is_monotonic is False
  1576. sorted = df.sort_index()
  1577. assert sorted.index.is_lexsorted() is True
  1578. assert sorted.index.is_monotonic is True
  1579. expected = DataFrame(
  1580. {'col': [1, 4, 5, 2]},
  1581. index=MultiIndex.from_tuples([('B', 'a'), ('B', 'c'),
  1582. ('C', 'a'), ('C', 'b')]),
  1583. dtype='int64')
  1584. result = sorted.loc[pd.IndexSlice['B':'C', 'a':'c'], :]
  1585. tm.assert_frame_equal(result, expected)
  1586. def test_sort_index_nan(self):
  1587. # GH 14784
  1588. # incorrect sorting w.r.t. nans
  1589. tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]]
  1590. mi = MultiIndex.from_tuples(tuples)
  1591. df = DataFrame(np.arange(16).reshape(4, 4),
  1592. index=mi, columns=list('ABCD'))
  1593. s = Series(np.arange(4), index=mi)
  1594. df2 = DataFrame({
  1595. 'date': pd.to_datetime([
  1596. '20121002', '20121007', '20130130', '20130202', '20130305',
  1597. '20121002', '20121207', '20130130', '20130202', '20130305',
  1598. '20130202', '20130305'
  1599. ]),
  1600. 'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
  1601. 'whole_cost': [1790, np.nan, 280, 259, np.nan, 623, 90, 312,
  1602. np.nan, 301, 359, 801],
  1603. 'cost': [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12]
  1604. }).set_index(['date', 'user_id'])
  1605. # sorting frame, default nan position is last
  1606. result = df.sort_index()
  1607. expected = df.iloc[[3, 0, 2, 1], :]
  1608. tm.assert_frame_equal(result, expected)
  1609. # sorting frame, nan position last
  1610. result = df.sort_index(na_position='last')
  1611. expected = df.iloc[[3, 0, 2, 1], :]
  1612. tm.assert_frame_equal(result, expected)
  1613. # sorting frame, nan position first
  1614. result = df.sort_index(na_position='first')
  1615. expected = df.iloc[[1, 2, 3, 0], :]
  1616. tm.assert_frame_equal(result, expected)
  1617. # sorting frame with removed rows
  1618. result = df2.dropna().sort_index()
  1619. expected = df2.sort_index().dropna()
  1620. tm.assert_frame_equal(result, expected)
  1621. # sorting series, default nan position is last
  1622. result = s.sort_index()
  1623. expected = s.iloc[[3, 0, 2, 1]]
  1624. tm.assert_series_equal(result, expected)
  1625. # sorting series, nan position last
  1626. result = s.sort_index(na_position='last')
  1627. expected = s.iloc[[3, 0, 2, 1]]
  1628. tm.assert_series_equal(result, expected)
  1629. # sorting series, nan position first
  1630. result = s.sort_index(na_position='first')
  1631. expected = s.iloc[[1, 2, 3, 0]]
  1632. tm.assert_series_equal(result, expected)
  1633. def test_sort_ascending_list(self):
  1634. # GH: 16934
  1635. # Set up a Series with a three level MultiIndex
  1636. arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
  1637. ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
  1638. [4, 3, 2, 1, 4, 3, 2, 1]]
  1639. tuples = lzip(*arrays)
  1640. mi = MultiIndex.from_tuples(tuples, names=['first', 'second', 'third'])
  1641. s = Series(range(8), index=mi)
  1642. # Sort with boolean ascending
  1643. result = s.sort_index(level=['third', 'first'], ascending=False)
  1644. expected = s.iloc[[4, 0, 5, 1, 6, 2, 7, 3]]
  1645. tm.assert_series_equal(result, expected)
  1646. # Sort with list of boolean ascending
  1647. result = s.sort_index(level=['third', 'first'],
  1648. ascending=[False, True])
  1649. expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]]
  1650. tm.assert_series_equal(result, expected)