# -*- coding: utf-8 -*- from __future__ import print_function from datetime import datetime import numpy as np import pytest from pandas.compat import lrange, lzip, u from pandas.errors import PerformanceWarning import pandas as pd from pandas import ( Categorical, DataFrame, Index, MultiIndex, Series, compat, date_range, isna) from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal class TestDataFrameSelectReindex(TestData): # These are specific reindex-based tests; other indexing tests should go in # test_indexing def test_drop_names(self): df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], index=['a', 'b', 'c'], columns=['d', 'e', 'f']) df.index.name, df.columns.name = 'first', 'second' df_dropped_b = df.drop('b') df_dropped_e = df.drop('e', axis=1) df_inplace_b, df_inplace_e = df.copy(), df.copy() df_inplace_b.drop('b', inplace=True) df_inplace_e.drop('e', axis=1, inplace=True) for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e): assert obj.index.name == 'first' assert obj.columns.name == 'second' assert list(df.columns) == ['d', 'e', 'f'] pytest.raises(KeyError, df.drop, ['g']) pytest.raises(KeyError, df.drop, ['g'], 1) # errors = 'ignore' dropped = df.drop(['g'], errors='ignore') expected = Index(['a', 'b', 'c'], name='first') tm.assert_index_equal(dropped.index, expected) dropped = df.drop(['b', 'g'], errors='ignore') expected = Index(['a', 'c'], name='first') tm.assert_index_equal(dropped.index, expected) dropped = df.drop(['g'], axis=1, errors='ignore') expected = Index(['d', 'e', 'f'], name='second') tm.assert_index_equal(dropped.columns, expected) dropped = df.drop(['d', 'g'], axis=1, errors='ignore') expected = Index(['e', 'f'], name='second') tm.assert_index_equal(dropped.columns, expected) # GH 16398 dropped = df.drop([], errors='ignore') expected = Index(['a', 'b', 'c'], name='first') tm.assert_index_equal(dropped.index, expected) def test_drop_col_still_multiindex(self): arrays = [['a', 'b', 'c', 'top'], ['', '', '', 'OD'], ['', '', '', 'wx']] tuples = sorted(zip(*arrays)) index = MultiIndex.from_tuples(tuples) df = DataFrame(np.random.randn(3, 4), columns=index) del df[('a', '', '')] assert(isinstance(df.columns, MultiIndex)) def test_drop(self): simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}) assert_frame_equal(simple.drop("A", axis=1), simple[['B']]) assert_frame_equal(simple.drop(["A", "B"], axis='columns'), simple[[]]) assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :]) assert_frame_equal(simple.drop( [0, 3], axis='index'), simple.loc[[1, 2], :]) pytest.raises(KeyError, simple.drop, 5) pytest.raises(KeyError, simple.drop, 'C', 1) pytest.raises(KeyError, simple.drop, [1, 5]) pytest.raises(KeyError, simple.drop, ['A', 'C'], 1) # errors = 'ignore' assert_frame_equal(simple.drop(5, errors='ignore'), simple) assert_frame_equal(simple.drop([0, 5], errors='ignore'), simple.loc[[1, 2, 3], :]) assert_frame_equal(simple.drop('C', axis=1, errors='ignore'), simple) assert_frame_equal(simple.drop(['A', 'C'], axis=1, errors='ignore'), simple[['B']]) # non-unique - wheee! nu_df = DataFrame(lzip(range(3), range(-3, 1), list('abc')), columns=['a', 'a', 'b']) assert_frame_equal(nu_df.drop('a', axis=1), nu_df[['b']]) assert_frame_equal(nu_df.drop('b', axis='columns'), nu_df['a']) assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398 nu_df = nu_df.set_index(pd.Index(['X', 'Y', 'X'])) nu_df.columns = list('abc') assert_frame_equal(nu_df.drop('X', axis='rows'), nu_df.loc[["Y"], :]) assert_frame_equal(nu_df.drop(['X', 'Y'], axis=0), nu_df.loc[[], :]) # inplace cache issue # GH 5628 df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc')) expected = df[~(df.b > 0)] df.drop(labels=df[df.b > 0].index, inplace=True) assert_frame_equal(df, expected) def test_drop_multiindex_not_lexsorted(self): # GH 11640 # define the lexsorted version lexsorted_mi = MultiIndex.from_tuples( [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c']) lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) assert lexsorted_df.columns.is_lexsorted() # define the non-lexsorted version not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'], data=[[1, 'b1', 'c1', 3], [1, 'b2', 'c2', 4]]) not_lexsorted_df = not_lexsorted_df.pivot_table( index='a', columns=['b', 'c'], values='d') not_lexsorted_df = not_lexsorted_df.reset_index() assert not not_lexsorted_df.columns.is_lexsorted() # compare the results tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) expected = lexsorted_df.drop('a', axis=1) with tm.assert_produces_warning(PerformanceWarning): result = not_lexsorted_df.drop('a', axis=1) tm.assert_frame_equal(result, expected) def test_drop_api_equivalence(self): # equivalence of the labels/axis and index/columns API's (GH12392) df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], index=['a', 'b', 'c'], columns=['d', 'e', 'f']) res1 = df.drop('a') res2 = df.drop(index='a') tm.assert_frame_equal(res1, res2) res1 = df.drop('d', 1) res2 = df.drop(columns='d') tm.assert_frame_equal(res1, res2) res1 = df.drop(labels='e', axis=1) res2 = df.drop(columns='e') tm.assert_frame_equal(res1, res2) res1 = df.drop(['a'], axis=0) res2 = df.drop(index=['a']) tm.assert_frame_equal(res1, res2) res1 = df.drop(['a'], axis=0).drop(['d'], axis=1) res2 = df.drop(index=['a'], columns=['d']) tm.assert_frame_equal(res1, res2) with pytest.raises(ValueError): df.drop(labels='a', index='b') with pytest.raises(ValueError): df.drop(labels='a', columns='b') with pytest.raises(ValueError): df.drop(axis=1) def test_merge_join_different_levels(self): # GH 9455 # first dataframe df1 = DataFrame(columns=['a', 'b'], data=[[1, 11], [0, 22]]) # second dataframe columns = MultiIndex.from_tuples([('a', ''), ('c', 'c1')]) df2 = DataFrame(columns=columns, data=[[1, 33], [0, 44]]) # merge columns = ['a', 'b', ('c', 'c1')] expected = DataFrame(columns=columns, data=[[1, 11, 33], [0, 22, 44]]) with tm.assert_produces_warning(UserWarning): result = pd.merge(df1, df2, on='a') tm.assert_frame_equal(result, expected) # join, see discussion in GH 12219 columns = ['a', 'b', ('a', ''), ('c', 'c1')] expected = DataFrame(columns=columns, data=[[1, 11, 0, 44], [0, 22, 1, 33]]) with tm.assert_produces_warning(UserWarning): result = df1.join(df2, on='a') tm.assert_frame_equal(result, expected) def test_reindex(self): newFrame = self.frame.reindex(self.ts1.index) for col in newFrame.columns: for idx, val in compat.iteritems(newFrame[col]): if idx in self.frame.index: if np.isnan(val): assert np.isnan(self.frame[col][idx]) else: assert val == self.frame[col][idx] else: assert np.isnan(val) for col, series in compat.iteritems(newFrame): assert tm.equalContents(series.index, newFrame.index) emptyFrame = self.frame.reindex(Index([])) assert len(emptyFrame.index) == 0 # Cython code should be unit-tested directly nonContigFrame = self.frame.reindex(self.ts1.index[::2]) for col in nonContigFrame.columns: for idx, val in compat.iteritems(nonContigFrame[col]): if idx in self.frame.index: if np.isnan(val): assert np.isnan(self.frame[col][idx]) else: assert val == self.frame[col][idx] else: assert np.isnan(val) for col, series in compat.iteritems(nonContigFrame): assert tm.equalContents(series.index, nonContigFrame.index) # corner cases # Same index, copies values but not index if copy=False newFrame = self.frame.reindex(self.frame.index, copy=False) assert newFrame.index is self.frame.index # length zero newFrame = self.frame.reindex([]) assert newFrame.empty assert len(newFrame.columns) == len(self.frame.columns) # length zero with columns reindexed with non-empty index newFrame = self.frame.reindex([]) newFrame = newFrame.reindex(self.frame.index) assert len(newFrame.index) == len(self.frame.index) assert len(newFrame.columns) == len(self.frame.columns) # pass non-Index newFrame = self.frame.reindex(list(self.ts1.index)) tm.assert_index_equal(newFrame.index, self.ts1.index) # copy with no axes result = self.frame.reindex() assert_frame_equal(result, self.frame) assert result is not self.frame def test_reindex_nan(self): df = pd.DataFrame([[1, 2], [3, 5], [7, 11], [9, 23]], index=[2, np.nan, 1, 5], columns=['joe', 'jim']) i, j = [np.nan, 5, 5, np.nan, 1, 2, np.nan], [1, 3, 3, 1, 2, 0, 1] assert_frame_equal(df.reindex(i), df.iloc[j]) df.index = df.index.astype('object') assert_frame_equal(df.reindex(i), df.iloc[j], check_index_type=False) # GH10388 df = pd.DataFrame({'other': ['a', 'b', np.nan, 'c'], 'date': ['2015-03-22', np.nan, '2012-01-08', np.nan], 'amount': [2, 3, 4, 5]}) df['date'] = pd.to_datetime(df.date) df['delta'] = (pd.to_datetime('2015-06-18') - df['date']).shift(1) left = df.set_index(['delta', 'other', 'date']).reset_index() right = df.reindex(columns=['delta', 'other', 'date', 'amount']) assert_frame_equal(left, right) def test_reindex_name_remains(self): s = Series(np.random.rand(10)) df = DataFrame(s, index=np.arange(len(s))) i = Series(np.arange(10), name='iname') df = df.reindex(i) assert df.index.name == 'iname' df = df.reindex(Index(np.arange(10), name='tmpname')) assert df.index.name == 'tmpname' s = Series(np.random.rand(10)) df = DataFrame(s.T, index=np.arange(len(s))) i = Series(np.arange(10), name='iname') df = df.reindex(columns=i) assert df.columns.name == 'iname' def test_reindex_int(self): smaller = self.intframe.reindex(self.intframe.index[::2]) assert smaller['A'].dtype == np.int64 bigger = smaller.reindex(self.intframe.index) assert bigger['A'].dtype == np.float64 smaller = self.intframe.reindex(columns=['A', 'B']) assert smaller['A'].dtype == np.int64 def test_reindex_like(self): other = self.frame.reindex(index=self.frame.index[:10], columns=['C', 'B']) assert_frame_equal(other, self.frame.reindex_like(other)) def test_reindex_columns(self): new_frame = self.frame.reindex(columns=['A', 'B', 'E']) tm.assert_series_equal(new_frame['B'], self.frame['B']) assert np.isnan(new_frame['E']).all() assert 'C' not in new_frame # Length zero new_frame = self.frame.reindex(columns=[]) assert new_frame.empty def test_reindex_columns_method(self): # GH 14992, reindexing over columns ignored method df = DataFrame(data=[[11, 12, 13], [21, 22, 23], [31, 32, 33]], index=[1, 2, 4], columns=[1, 2, 4], dtype=float) # default method result = df.reindex(columns=range(6)) expected = DataFrame(data=[[np.nan, 11, 12, np.nan, 13, np.nan], [np.nan, 21, 22, np.nan, 23, np.nan], [np.nan, 31, 32, np.nan, 33, np.nan]], index=[1, 2, 4], columns=range(6), dtype=float) assert_frame_equal(result, expected) # method='ffill' result = df.reindex(columns=range(6), method='ffill') expected = DataFrame(data=[[np.nan, 11, 12, 12, 13, 13], [np.nan, 21, 22, 22, 23, 23], [np.nan, 31, 32, 32, 33, 33]], index=[1, 2, 4], columns=range(6), dtype=float) assert_frame_equal(result, expected) # method='bfill' result = df.reindex(columns=range(6), method='bfill') expected = DataFrame(data=[[11, 11, 12, 13, 13, np.nan], [21, 21, 22, 23, 23, np.nan], [31, 31, 32, 33, 33, np.nan]], index=[1, 2, 4], columns=range(6), dtype=float) assert_frame_equal(result, expected) def test_reindex_axes(self): # GH 3317, reindexing by both axes loses freq of the index df = DataFrame(np.ones((3, 3)), index=[datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)], columns=['a', 'b', 'c']) time_freq = date_range('2012-01-01', '2012-01-03', freq='d') some_cols = ['a', 'b'] index_freq = df.reindex(index=time_freq).index.freq both_freq = df.reindex(index=time_freq, columns=some_cols).index.freq seq_freq = df.reindex(index=time_freq).reindex( columns=some_cols).index.freq assert index_freq == both_freq assert index_freq == seq_freq def test_reindex_fill_value(self): df = DataFrame(np.random.randn(10, 4)) # axis=0 result = df.reindex(lrange(15)) assert np.isnan(result.values[-5:]).all() result = df.reindex(lrange(15), fill_value=0) expected = df.reindex(lrange(15)).fillna(0) assert_frame_equal(result, expected) # axis=1 result = df.reindex(columns=lrange(5), fill_value=0.) expected = df.copy() expected[4] = 0. assert_frame_equal(result, expected) result = df.reindex(columns=lrange(5), fill_value=0) expected = df.copy() expected[4] = 0 assert_frame_equal(result, expected) result = df.reindex(columns=lrange(5), fill_value='foo') expected = df.copy() expected[4] = 'foo' assert_frame_equal(result, expected) # reindex_axis with tm.assert_produces_warning(FutureWarning): result = df.reindex_axis(lrange(15), fill_value=0., axis=0) expected = df.reindex(lrange(15)).fillna(0) assert_frame_equal(result, expected) with tm.assert_produces_warning(FutureWarning): result = df.reindex_axis(lrange(5), fill_value=0., axis=1) expected = df.reindex(columns=lrange(5)).fillna(0) assert_frame_equal(result, expected) # other dtypes df['foo'] = 'foo' result = df.reindex(lrange(15), fill_value=0) expected = df.reindex(lrange(15)).fillna(0) assert_frame_equal(result, expected) def test_reindex_dups(self): # GH4746, reindex on duplicate index error messages arr = np.random.randn(10) df = DataFrame(arr, index=[1, 2, 3, 4, 5, 1, 2, 3, 4, 5]) # set index is ok result = df.copy() result.index = list(range(len(df))) expected = DataFrame(arr, index=list(range(len(df)))) assert_frame_equal(result, expected) # reindex fails pytest.raises(ValueError, df.reindex, index=list(range(len(df)))) def test_reindex_axis_style(self): # https://github.com/pandas-dev/pandas/issues/12392 df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) expected = pd.DataFrame({"A": [1, 2, np.nan], "B": [4, 5, np.nan]}, index=[0, 1, 3]) result = df.reindex([0, 1, 3]) assert_frame_equal(result, expected) result = df.reindex([0, 1, 3], axis=0) assert_frame_equal(result, expected) result = df.reindex([0, 1, 3], axis='index') assert_frame_equal(result, expected) def test_reindex_positional_warns(self): # https://github.com/pandas-dev/pandas/issues/12392 df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) expected = pd.DataFrame({"A": [1., 2], 'B': [4., 5], "C": [np.nan, np.nan]}) with tm.assert_produces_warning(FutureWarning): result = df.reindex([0, 1], ['A', 'B', 'C']) assert_frame_equal(result, expected) def test_reindex_axis_style_raises(self): # https://github.com/pandas-dev/pandas/issues/12392 df = pd.DataFrame({"A": [1, 2, 3], 'B': [4, 5, 6]}) with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex([0, 1], ['A'], axis=1) with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex([0, 1], ['A'], axis='index') with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex(index=[0, 1], axis='index') with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex(index=[0, 1], axis='columns') with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex(columns=[0, 1], axis='columns') with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex(index=[0, 1], columns=[0, 1], axis='columns') with pytest.raises(TypeError, match='Cannot specify all'): df.reindex([0, 1], [0], ['A']) # Mixing styles with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex(index=[0, 1], axis='index') with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex(index=[0, 1], axis='columns') # Duplicates with pytest.raises(TypeError, match="multiple values"): df.reindex([0, 1], labels=[0, 1]) def test_reindex_single_named_indexer(self): # https://github.com/pandas-dev/pandas/issues/12392 df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}) result = df.reindex([0, 1], columns=['A']) expected = pd.DataFrame({"A": [1, 2]}) assert_frame_equal(result, expected) def test_reindex_api_equivalence(self): # https://github.com/pandas-dev/pandas/issues/12392 # equivalence of the labels/axis and index/columns API's df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], index=['a', 'b', 'c'], columns=['d', 'e', 'f']) res1 = df.reindex(['b', 'a']) res2 = df.reindex(index=['b', 'a']) res3 = df.reindex(labels=['b', 'a']) res4 = df.reindex(labels=['b', 'a'], axis=0) res5 = df.reindex(['b', 'a'], axis=0) for res in [res2, res3, res4, res5]: tm.assert_frame_equal(res1, res) res1 = df.reindex(columns=['e', 'd']) res2 = df.reindex(['e', 'd'], axis=1) res3 = df.reindex(labels=['e', 'd'], axis=1) for res in [res2, res3]: tm.assert_frame_equal(res1, res) with tm.assert_produces_warning(FutureWarning) as m: res1 = df.reindex(['b', 'a'], ['e', 'd']) assert 'reindex' in str(m[0].message) res2 = df.reindex(columns=['e', 'd'], index=['b', 'a']) res3 = df.reindex(labels=['b', 'a'], axis=0).reindex(labels=['e', 'd'], axis=1) for res in [res2, res3]: tm.assert_frame_equal(res1, res) def test_align(self): af, bf = self.frame.align(self.frame) assert af._data is not self.frame._data af, bf = self.frame.align(self.frame, copy=False) assert af._data is self.frame._data # axis = 0 other = self.frame.iloc[:-5, :3] af, bf = self.frame.align(other, axis=0, fill_value=-1) tm.assert_index_equal(bf.columns, other.columns) # test fill value join_idx = self.frame.index.join(other.index) diff_a = self.frame.index.difference(join_idx) diff_b = other.index.difference(join_idx) diff_a_vals = af.reindex(diff_a).values diff_b_vals = bf.reindex(diff_b).values assert (diff_a_vals == -1).all() af, bf = self.frame.align(other, join='right', axis=0) tm.assert_index_equal(bf.columns, other.columns) tm.assert_index_equal(bf.index, other.index) tm.assert_index_equal(af.index, other.index) # axis = 1 other = self.frame.iloc[:-5, :3].copy() af, bf = self.frame.align(other, axis=1) tm.assert_index_equal(bf.columns, self.frame.columns) tm.assert_index_equal(bf.index, other.index) # test fill value join_idx = self.frame.index.join(other.index) diff_a = self.frame.index.difference(join_idx) diff_b = other.index.difference(join_idx) diff_a_vals = af.reindex(diff_a).values # TODO(wesm): unused? diff_b_vals = bf.reindex(diff_b).values # noqa assert (diff_a_vals == -1).all() af, bf = self.frame.align(other, join='inner', axis=1) tm.assert_index_equal(bf.columns, other.columns) af, bf = self.frame.align(other, join='inner', axis=1, method='pad') tm.assert_index_equal(bf.columns, other.columns) # test other non-float types af, bf = self.intframe.align(other, join='inner', axis=1, method='pad') tm.assert_index_equal(bf.columns, other.columns) af, bf = self.mixed_frame.align(self.mixed_frame, join='inner', axis=1, method='pad') tm.assert_index_equal(bf.columns, self.mixed_frame.columns) af, bf = self.frame.align(other.iloc[:, 0], join='inner', axis=1, method=None, fill_value=None) tm.assert_index_equal(bf.index, Index([])) af, bf = self.frame.align(other.iloc[:, 0], join='inner', axis=1, method=None, fill_value=0) tm.assert_index_equal(bf.index, Index([])) # mixed floats/ints af, bf = self.mixed_float.align(other.iloc[:, 0], join='inner', axis=1, method=None, fill_value=0) tm.assert_index_equal(bf.index, Index([])) af, bf = self.mixed_int.align(other.iloc[:, 0], join='inner', axis=1, method=None, fill_value=0) tm.assert_index_equal(bf.index, Index([])) # Try to align DataFrame to Series along bad axis with pytest.raises(ValueError): self.frame.align(af.iloc[0, :3], join='inner', axis=2) # align dataframe to series with broadcast or not idx = self.frame.index s = Series(range(len(idx)), index=idx) left, right = self.frame.align(s, axis=0) tm.assert_index_equal(left.index, self.frame.index) tm.assert_index_equal(right.index, self.frame.index) assert isinstance(right, Series) left, right = self.frame.align(s, broadcast_axis=1) tm.assert_index_equal(left.index, self.frame.index) expected = {c: s for c in self.frame.columns} expected = DataFrame(expected, index=self.frame.index, columns=self.frame.columns) tm.assert_frame_equal(right, expected) # see gh-9558 df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) result = df[df['a'] == 2] expected = DataFrame([[2, 5]], index=[1], columns=['a', 'b']) tm.assert_frame_equal(result, expected) result = df.where(df['a'] == 2, 0) expected = DataFrame({'a': [0, 2, 0], 'b': [0, 5, 0]}) tm.assert_frame_equal(result, expected) def _check_align(self, a, b, axis, fill_axis, how, method, limit=None): aa, ab = a.align(b, axis=axis, join=how, method=method, limit=limit, fill_axis=fill_axis) join_index, join_columns = None, None ea, eb = a, b if axis is None or axis == 0: join_index = a.index.join(b.index, how=how) ea = ea.reindex(index=join_index) eb = eb.reindex(index=join_index) if axis is None or axis == 1: join_columns = a.columns.join(b.columns, how=how) ea = ea.reindex(columns=join_columns) eb = eb.reindex(columns=join_columns) ea = ea.fillna(axis=fill_axis, method=method, limit=limit) eb = eb.fillna(axis=fill_axis, method=method, limit=limit) assert_frame_equal(aa, ea) assert_frame_equal(ab, eb) @pytest.mark.parametrize('meth', ['pad', 'bfill']) @pytest.mark.parametrize('ax', [0, 1, None]) @pytest.mark.parametrize('fax', [0, 1]) @pytest.mark.parametrize('how', ['inner', 'outer', 'left', 'right']) def test_align_fill_method(self, how, meth, ax, fax): self._check_align_fill(how, meth, ax, fax) def _check_align_fill(self, kind, meth, ax, fax): left = self.frame.iloc[0:4, :10] right = self.frame.iloc[2:, 6:] empty = self.frame.iloc[:0, :0] self._check_align(left, right, axis=ax, fill_axis=fax, how=kind, method=meth) self._check_align(left, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1) # empty left self._check_align(empty, right, axis=ax, fill_axis=fax, how=kind, method=meth) self._check_align(empty, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1) # empty right self._check_align(left, empty, axis=ax, fill_axis=fax, how=kind, method=meth) self._check_align(left, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1) # both empty self._check_align(empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth) self._check_align(empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1) def test_align_int_fill_bug(self): # GH #910 X = np.arange(10 * 10, dtype='float64').reshape(10, 10) Y = np.ones((10, 1), dtype=int) df1 = DataFrame(X) df1['0.X'] = Y.squeeze() df2 = df1.astype(float) result = df1 - df1.mean() expected = df2 - df2.mean() assert_frame_equal(result, expected) def test_align_multiindex(self): # GH 10665 # same test cases as test_align_multiindex in test_series.py midx = pd.MultiIndex.from_product([range(2), range(3), range(2)], names=('a', 'b', 'c')) idx = pd.Index(range(2), name='b') df1 = pd.DataFrame(np.arange(12, dtype='int64'), index=midx) df2 = pd.DataFrame(np.arange(2, dtype='int64'), index=idx) # these must be the same results (but flipped) res1l, res1r = df1.align(df2, join='left') res2l, res2r = df2.align(df1, join='right') expl = df1 assert_frame_equal(expl, res1l) assert_frame_equal(expl, res2r) expr = pd.DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) assert_frame_equal(expr, res1r) assert_frame_equal(expr, res2l) res1l, res1r = df1.align(df2, join='right') res2l, res2r = df2.align(df1, join='left') exp_idx = pd.MultiIndex.from_product([range(2), range(2), range(2)], names=('a', 'b', 'c')) expl = pd.DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) assert_frame_equal(expl, res1l) assert_frame_equal(expl, res2r) expr = pd.DataFrame([0, 0, 1, 1] * 2, index=exp_idx) assert_frame_equal(expr, res1r) assert_frame_equal(expr, res2l) def test_align_series_combinations(self): df = pd.DataFrame({'a': [1, 3, 5], 'b': [1, 3, 5]}, index=list('ACE')) s = pd.Series([1, 2, 4], index=list('ABD'), name='x') # frame + series res1, res2 = df.align(s, axis=0) exp1 = pd.DataFrame({'a': [1, np.nan, 3, np.nan, 5], 'b': [1, np.nan, 3, np.nan, 5]}, index=list('ABCDE')) exp2 = pd.Series([1, 2, np.nan, 4, np.nan], index=list('ABCDE'), name='x') tm.assert_frame_equal(res1, exp1) tm.assert_series_equal(res2, exp2) # series + frame res1, res2 = s.align(df) tm.assert_series_equal(res1, exp2) tm.assert_frame_equal(res2, exp1) def test_filter(self): # Items filtered = self.frame.filter(['A', 'B', 'E']) assert len(filtered.columns) == 2 assert 'E' not in filtered filtered = self.frame.filter(['A', 'B', 'E'], axis='columns') assert len(filtered.columns) == 2 assert 'E' not in filtered # Other axis idx = self.frame.index[0:4] filtered = self.frame.filter(idx, axis='index') expected = self.frame.reindex(index=idx) tm.assert_frame_equal(filtered, expected) # like fcopy = self.frame.copy() fcopy['AA'] = 1 filtered = fcopy.filter(like='A') assert len(filtered.columns) == 2 assert 'AA' in filtered # like with ints in column names df = DataFrame(0., index=[0, 1, 2], columns=[0, 1, '_A', '_B']) filtered = df.filter(like='_') assert len(filtered.columns) == 2 # regex with ints in column names # from PR #10384 df = DataFrame(0., index=[0, 1, 2], columns=['A1', 1, 'B', 2, 'C']) expected = DataFrame( 0., index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object)) filtered = df.filter(regex='^[0-9]+$') tm.assert_frame_equal(filtered, expected) expected = DataFrame(0., index=[0, 1, 2], columns=[0, '0', 1, '1']) # shouldn't remove anything filtered = expected.filter(regex='^[0-9]+$') tm.assert_frame_equal(filtered, expected) # pass in None with pytest.raises(TypeError, match='Must pass'): self.frame.filter() with pytest.raises(TypeError, match='Must pass'): self.frame.filter(items=None) with pytest.raises(TypeError, match='Must pass'): self.frame.filter(axis=1) # test mutually exclusive arguments with pytest.raises(TypeError, match='mutually exclusive'): self.frame.filter(items=['one', 'three'], regex='e$', like='bbi') with pytest.raises(TypeError, match='mutually exclusive'): self.frame.filter(items=['one', 'three'], regex='e$', axis=1) with pytest.raises(TypeError, match='mutually exclusive'): self.frame.filter(items=['one', 'three'], regex='e$') with pytest.raises(TypeError, match='mutually exclusive'): self.frame.filter(items=['one', 'three'], like='bbi', axis=0) with pytest.raises(TypeError, match='mutually exclusive'): self.frame.filter(items=['one', 'three'], like='bbi') # objects filtered = self.mixed_frame.filter(like='foo') assert 'foo' in filtered # unicode columns, won't ascii-encode df = self.frame.rename(columns={'B': u('\u2202')}) filtered = df.filter(like='C') assert 'C' in filtered def test_filter_regex_search(self): fcopy = self.frame.copy() fcopy['AA'] = 1 # regex filtered = fcopy.filter(regex='[A]+') assert len(filtered.columns) == 2 assert 'AA' in filtered # doesn't have to be at beginning df = DataFrame({'aBBa': [1, 2], 'BBaBB': [1, 2], 'aCCa': [1, 2], 'aCCaBB': [1, 2]}) result = df.filter(regex='BB') exp = df[[x for x in df.columns if 'BB' in x]] assert_frame_equal(result, exp) @pytest.mark.parametrize('name,expected', [ ('a', DataFrame({u'a': [1, 2]})), (u'a', DataFrame({u'a': [1, 2]})), (u'あ', DataFrame({u'あ': [3, 4]})) ]) def test_filter_unicode(self, name, expected): # GH13101 df = DataFrame({u'a': [1, 2], u'あ': [3, 4]}) assert_frame_equal(df.filter(like=name), expected) assert_frame_equal(df.filter(regex=name), expected) @pytest.mark.parametrize('name', ['a', u'a']) def test_filter_bytestring(self, name): # GH13101 df = DataFrame({b'a': [1, 2], b'b': [3, 4]}) expected = DataFrame({b'a': [1, 2]}) assert_frame_equal(df.filter(like=name), expected) assert_frame_equal(df.filter(regex=name), expected) def test_filter_corner(self): empty = DataFrame() result = empty.filter([]) assert_frame_equal(result, empty) result = empty.filter(like='foo') assert_frame_equal(result, empty) def test_select(self): # deprecated: gh-12410 f = lambda x: x.weekday() == 2 index = self.tsframe.index[[f(x) for x in self.tsframe.index]] expected_weekdays = self.tsframe.reindex(index=index) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = self.tsframe.select(f, axis=0) assert_frame_equal(result, expected_weekdays) result = self.frame.select(lambda x: x in ('B', 'D'), axis=1) expected = self.frame.reindex(columns=['B', 'D']) assert_frame_equal(result, expected, check_names=False) # replacement f = lambda x: x.weekday == 2 result = self.tsframe.loc(axis=0)[f(self.tsframe.index)] assert_frame_equal(result, expected_weekdays) crit = lambda x: x in ['B', 'D'] result = self.frame.loc(axis=1)[(self.frame.columns.map(crit))] expected = self.frame.reindex(columns=['B', 'D']) assert_frame_equal(result, expected, check_names=False) # doc example df = DataFrame({'A': [1, 2, 3]}, index=['foo', 'bar', 'baz']) crit = lambda x: x in ['bar', 'baz'] with tm.assert_produces_warning(FutureWarning): expected = df.select(crit) result = df.loc[df.index.map(crit)] assert_frame_equal(result, expected, check_names=False) def test_take(self): # homogeneous order = [3, 1, 2, 0] for df in [self.frame]: result = df.take(order, axis=0) expected = df.reindex(df.index.take(order)) assert_frame_equal(result, expected) # axis = 1 result = df.take(order, axis=1) expected = df.loc[:, ['D', 'B', 'C', 'A']] assert_frame_equal(result, expected, check_names=False) # negative indices order = [2, 1, -1] for df in [self.frame]: result = df.take(order, axis=0) expected = df.reindex(df.index.take(order)) assert_frame_equal(result, expected) with tm.assert_produces_warning(FutureWarning): result = df.take(order, convert=True, axis=0) assert_frame_equal(result, expected) with tm.assert_produces_warning(FutureWarning): result = df.take(order, convert=False, axis=0) assert_frame_equal(result, expected) # axis = 1 result = df.take(order, axis=1) expected = df.loc[:, ['C', 'B', 'D']] assert_frame_equal(result, expected, check_names=False) # illegal indices pytest.raises(IndexError, df.take, [3, 1, 2, 30], axis=0) pytest.raises(IndexError, df.take, [3, 1, 2, -31], axis=0) pytest.raises(IndexError, df.take, [3, 1, 2, 5], axis=1) pytest.raises(IndexError, df.take, [3, 1, 2, -5], axis=1) # mixed-dtype order = [4, 1, 2, 0, 3] for df in [self.mixed_frame]: result = df.take(order, axis=0) expected = df.reindex(df.index.take(order)) assert_frame_equal(result, expected) # axis = 1 result = df.take(order, axis=1) expected = df.loc[:, ['foo', 'B', 'C', 'A', 'D']] assert_frame_equal(result, expected) # negative indices order = [4, 1, -2] for df in [self.mixed_frame]: result = df.take(order, axis=0) expected = df.reindex(df.index.take(order)) assert_frame_equal(result, expected) # axis = 1 result = df.take(order, axis=1) expected = df.loc[:, ['foo', 'B', 'D']] assert_frame_equal(result, expected) # by dtype order = [1, 2, 0, 3] for df in [self.mixed_float, self.mixed_int]: result = df.take(order, axis=0) expected = df.reindex(df.index.take(order)) assert_frame_equal(result, expected) # axis = 1 result = df.take(order, axis=1) expected = df.loc[:, ['B', 'C', 'A', 'D']] assert_frame_equal(result, expected) def test_reindex_boolean(self): frame = DataFrame(np.ones((10, 2), dtype=bool), index=np.arange(0, 20, 2), columns=[0, 2]) reindexed = frame.reindex(np.arange(10)) assert reindexed.values.dtype == np.object_ assert isna(reindexed[0][1]) reindexed = frame.reindex(columns=lrange(3)) assert reindexed.values.dtype == np.object_ assert isna(reindexed[1]).all() def test_reindex_objects(self): reindexed = self.mixed_frame.reindex(columns=['foo', 'A', 'B']) assert 'foo' in reindexed reindexed = self.mixed_frame.reindex(columns=['A', 'B']) assert 'foo' not in reindexed def test_reindex_corner(self): index = Index(['a', 'b', 'c']) dm = self.empty.reindex(index=[1, 2, 3]) reindexed = dm.reindex(columns=index) tm.assert_index_equal(reindexed.columns, index) # ints are weird smaller = self.intframe.reindex(columns=['A', 'B', 'E']) assert smaller['E'].dtype == np.float64 def test_reindex_axis(self): cols = ['A', 'B', 'E'] with tm.assert_produces_warning(FutureWarning) as m: reindexed1 = self.intframe.reindex_axis(cols, axis=1) assert 'reindex' in str(m[0].message) reindexed2 = self.intframe.reindex(columns=cols) assert_frame_equal(reindexed1, reindexed2) rows = self.intframe.index[0:5] with tm.assert_produces_warning(FutureWarning) as m: reindexed1 = self.intframe.reindex_axis(rows, axis=0) assert 'reindex' in str(m[0].message) reindexed2 = self.intframe.reindex(index=rows) assert_frame_equal(reindexed1, reindexed2) pytest.raises(ValueError, self.intframe.reindex_axis, rows, axis=2) # no-op case cols = self.frame.columns.copy() with tm.assert_produces_warning(FutureWarning) as m: newFrame = self.frame.reindex_axis(cols, axis=1) assert 'reindex' in str(m[0].message) assert_frame_equal(newFrame, self.frame) def test_reindex_with_nans(self): df = DataFrame([[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]], columns=['a', 'b'], index=[100.0, 101.0, np.nan, 102.0, 103.0]) result = df.reindex(index=[101.0, 102.0, 103.0]) expected = df.iloc[[1, 3, 4]] assert_frame_equal(result, expected) result = df.reindex(index=[103.0]) expected = df.iloc[[4]] assert_frame_equal(result, expected) result = df.reindex(index=[101.0]) expected = df.iloc[[1]] assert_frame_equal(result, expected) def test_reindex_multi(self): df = DataFrame(np.random.randn(3, 3)) result = df.reindex(index=lrange(4), columns=lrange(4)) expected = df.reindex(lrange(4)).reindex(columns=lrange(4)) assert_frame_equal(result, expected) df = DataFrame(np.random.randint(0, 10, (3, 3))) result = df.reindex(index=lrange(4), columns=lrange(4)) expected = df.reindex(lrange(4)).reindex(columns=lrange(4)) assert_frame_equal(result, expected) df = DataFrame(np.random.randint(0, 10, (3, 3))) result = df.reindex(index=lrange(2), columns=lrange(2)) expected = df.reindex(lrange(2)).reindex(columns=lrange(2)) assert_frame_equal(result, expected) df = DataFrame(np.random.randn(5, 3) + 1j, columns=['a', 'b', 'c']) result = df.reindex(index=[0, 1], columns=['a', 'b']) expected = df.reindex([0, 1]).reindex(columns=['a', 'b']) assert_frame_equal(result, expected) def test_reindex_multi_categorical_time(self): # https://github.com/pandas-dev/pandas/issues/21390 midx = pd.MultiIndex.from_product( [Categorical(['a', 'b', 'c']), Categorical(date_range("2012-01-01", periods=3, freq='H'))]) df = pd.DataFrame({'a': range(len(midx))}, index=midx) df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]] result = df2.reindex(midx) expected = pd.DataFrame( {'a': [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx) assert_frame_equal(result, expected) data = [[1, 2, 3], [1, 2, 3]] @pytest.mark.parametrize('actual', [ DataFrame(data=data, index=['a', 'a']), DataFrame(data=data, index=['a', 'b']), DataFrame(data=data, index=['a', 'b']).set_index([0, 1]), DataFrame(data=data, index=['a', 'a']).set_index([0, 1]) ]) def test_raise_on_drop_duplicate_index(self, actual): # issue 19186 level = 0 if isinstance(actual.index, MultiIndex) else None with pytest.raises(KeyError): actual.drop('c', level=level, axis=0) with pytest.raises(KeyError): actual.T.drop('c', level=level, axis=1) expected_no_err = actual.drop('c', axis=0, level=level, errors='ignore') assert_frame_equal(expected_no_err, actual) expected_no_err = actual.T.drop('c', axis=1, level=level, errors='ignore') assert_frame_equal(expected_no_err.T, actual) @pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 2]]) @pytest.mark.parametrize('drop_labels', [[], [1], [2]]) def test_drop_empty_list(self, index, drop_labels): # GH 21494 expected_index = [i for i in index if i not in drop_labels] frame = pd.DataFrame(index=index).drop(drop_labels) tm.assert_frame_equal(frame, pd.DataFrame(index=expected_index)) @pytest.mark.parametrize('index', [[1, 2, 3], [1, 2, 2]]) @pytest.mark.parametrize('drop_labels', [[1, 4], [4, 5]]) def test_drop_non_empty_list(self, index, drop_labels): # GH 21494 with pytest.raises(KeyError, match='not found in axis'): pd.DataFrame(index=index).drop(drop_labels)