12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159 |
- # -*- coding: utf-8 -*-
- from __future__ import print_function
- from datetime import datetime
- import numpy as np
- import pytest
- from pandas.compat import lrange, lzip, u
- from pandas.errors import PerformanceWarning
- import pandas as pd
- from pandas import (
- Categorical, DataFrame, Index, MultiIndex, Series, compat, date_range,
- isna)
- from pandas.tests.frame.common import TestData
- import pandas.util.testing as tm
- from pandas.util.testing import assert_frame_equal
- class TestDataFrameSelectReindex(TestData):
- # These are specific reindex-based tests; other indexing tests should go in
- # test_indexing
def test_drop_names(self):
    """drop() keeps axis names; errors='ignore' skips unknown labels."""
    frame = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]],
                      index=['a', 'b', 'c'],
                      columns=['d', 'e', 'f'])
    frame.index.name = 'first'
    frame.columns.name = 'second'

    # Copying and in-place drops must both keep the axis names.
    without_b = frame.drop('b')
    without_e = frame.drop('e', axis=1)
    inplace_b = frame.copy()
    inplace_e = frame.copy()
    inplace_b.drop('b', inplace=True)
    inplace_e.drop('e', axis=1, inplace=True)
    for dropped in (without_b, without_e, inplace_b, inplace_e):
        assert dropped.index.name == 'first'
        assert dropped.columns.name == 'second'
    # The source frame still has all of its columns.
    assert list(frame.columns) == ['d', 'e', 'f']

    with pytest.raises(KeyError):
        frame.drop(['g'])
    with pytest.raises(KeyError):
        frame.drop(['g'], 1)

    # errors = 'ignore'
    # (the empty label list is the GH 16398 case)
    row_cases = [(['g'], ['a', 'b', 'c']),
                 (['b', 'g'], ['a', 'c']),
                 ([], ['a', 'b', 'c'])]
    for labels, remaining in row_cases:
        dropped = frame.drop(labels, errors='ignore')
        expected = Index(remaining, name='first')
        tm.assert_index_equal(dropped.index, expected)

    column_cases = [(['g'], ['d', 'e', 'f']),
                    (['d', 'g'], ['e', 'f'])]
    for labels, remaining in column_cases:
        dropped = frame.drop(labels, axis=1, errors='ignore')
        expected = Index(remaining, name='second')
        tm.assert_index_equal(dropped.columns, expected)
- def test_drop_col_still_multiindex(self):
- arrays = [['a', 'b', 'c', 'top'],
- ['', '', '', 'OD'],
- ['', '', '', 'wx']]
- tuples = sorted(zip(*arrays))
- index = MultiIndex.from_tuples(tuples)
- df = DataFrame(np.random.randn(3, 4), columns=index)
- del df[('a', '', '')]
- assert(isinstance(df.columns, MultiIndex))
def test_drop(self):
    """Exercise DataFrame.drop on unique and non-unique axes."""
    simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]})

    no_a = simple.drop("A", axis=1)
    assert_frame_equal(no_a, simple[['B']])
    no_columns = simple.drop(["A", "B"], axis='columns')
    assert_frame_equal(no_columns, simple[[]])
    only_row_2 = simple.drop([0, 1, 3], axis=0)
    assert_frame_equal(only_row_2, simple.loc[[2], :])
    middle_rows = simple.drop([0, 3], axis='index')
    assert_frame_equal(middle_rows, simple.loc[[1, 2], :])

    # Any unknown label raises; a trailing 1 is the positional axis.
    for bad_args in [(5,), ('C', 1), ([1, 5],), (['A', 'C'], 1)]:
        with pytest.raises(KeyError):
            simple.drop(*bad_args)

    # errors = 'ignore'
    assert_frame_equal(simple.drop(5, errors='ignore'), simple)
    assert_frame_equal(simple.drop([0, 5], errors='ignore'),
                       simple.loc[[1, 2, 3], :])
    assert_frame_equal(simple.drop('C', axis=1, errors='ignore'), simple)
    assert_frame_equal(simple.drop(['A', 'C'], axis=1, errors='ignore'),
                       simple[['B']])

    # non-unique - wheee!
    rows = lzip(range(3), range(-3, 1), list('abc'))
    nu_df = DataFrame(rows, columns=['a', 'a', 'b'])
    assert_frame_equal(nu_df.drop('a', axis=1), nu_df[['b']])
    assert_frame_equal(nu_df.drop('b', axis='columns'), nu_df['a'])
    assert_frame_equal(nu_df.drop([]), nu_df)  # GH 16398

    nu_df = nu_df.set_index(pd.Index(['X', 'Y', 'X']))
    nu_df.columns = list('abc')
    assert_frame_equal(nu_df.drop('X', axis='rows'), nu_df.loc[["Y"], :])
    assert_frame_equal(nu_df.drop(['X', 'Y'], axis=0), nu_df.loc[[], :])

    # inplace cache issue
    # GH 5628
    random_df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc'))
    expected = random_df[~(random_df.b > 0)]
    random_df.drop(labels=random_df[random_df.b > 0].index, inplace=True)
    assert_frame_equal(random_df, expected)
def test_drop_multiindex_not_lexsorted(self):
    """Dropping from non-lexsorted MultiIndex columns warns but matches."""
    # GH 11640

    # define the lexsorted version
    sorted_columns = MultiIndex.from_tuples(
        [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
    sorted_frame = DataFrame([[1, 3, 4]], columns=sorted_columns)
    assert sorted_frame.columns.is_lexsorted()

    # define the non-lexsorted version
    raw = DataFrame(columns=['a', 'b', 'c', 'd'],
                    data=[[1, 'b1', 'c1', 3],
                          [1, 'b2', 'c2', 4]])
    pivoted = raw.pivot_table(index='a', columns=['b', 'c'], values='d')
    unsorted_frame = pivoted.reset_index()
    assert not unsorted_frame.columns.is_lexsorted()

    # compare the results
    tm.assert_frame_equal(sorted_frame, unsorted_frame)

    expected = sorted_frame.drop('a', axis=1)
    with tm.assert_produces_warning(PerformanceWarning):
        result = unsorted_frame.drop('a', axis=1)

    tm.assert_frame_equal(result, expected)
def test_drop_api_equivalence(self):
    """labels/axis and index/columns spellings of drop() agree."""
    # equivalence of the labels/axis and index/columns API's (GH12392)
    df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]],
                   index=['a', 'b', 'c'],
                   columns=['d', 'e', 'f'])

    # Each pair is (labels/axis spelling, index/columns spelling).
    equivalent_pairs = [
        (df.drop('a'), df.drop(index='a')),
        (df.drop('d', 1), df.drop(columns='d')),
        (df.drop(labels='e', axis=1), df.drop(columns='e')),
        (df.drop(['a'], axis=0), df.drop(index=['a'])),
        (df.drop(['a'], axis=0).drop(['d'], axis=1),
         df.drop(index=['a'], columns=['d'])),
    ]
    for via_labels, via_keywords in equivalent_pairs:
        tm.assert_frame_equal(via_labels, via_keywords)

    # Mixing both spellings, or naming no labels at all, is rejected.
    invalid_calls = [
        {'labels': 'a', 'index': 'b'},
        {'labels': 'a', 'columns': 'b'},
        {'axis': 1},
    ]
    for kwargs in invalid_calls:
        with pytest.raises(ValueError):
            df.drop(**kwargs)
def test_merge_join_different_levels(self):
    """merge/join of flat and MultiIndex columns warn with UserWarning."""
    # GH 9455

    # first dataframe
    flat = DataFrame(columns=['a', 'b'], data=[[1, 11], [0, 22]])

    # second dataframe
    nested_columns = MultiIndex.from_tuples([('a', ''), ('c', 'c1')])
    nested = DataFrame(columns=nested_columns, data=[[1, 33], [0, 44]])

    # merge
    expected = DataFrame(columns=['a', 'b', ('c', 'c1')],
                         data=[[1, 11, 33], [0, 22, 44]])
    with tm.assert_produces_warning(UserWarning):
        result = pd.merge(flat, nested, on='a')
    tm.assert_frame_equal(result, expected)

    # join, see discussion in GH 12219
    expected = DataFrame(columns=['a', 'b', ('a', ''), ('c', 'c1')],
                         data=[[1, 11, 0, 44], [0, 22, 1, 33]])
    with tm.assert_produces_warning(UserWarning):
        result = flat.join(nested, on='a')
    tm.assert_frame_equal(result, expected)
def test_reindex(self):
    """Reindex self.frame on several row indexes and check the values."""

    def assert_consistent_with_source(reindexed):
        # Labels present in self.frame keep their value (NaN stays NaN);
        # labels that are absent must come back as NaN.
        for col in reindexed.columns:
            for idx, val in compat.iteritems(reindexed[col]):
                if idx not in self.frame.index:
                    assert np.isnan(val)
                elif np.isnan(val):
                    assert np.isnan(self.frame[col][idx])
                else:
                    assert val == self.frame[col][idx]

        for _, series in compat.iteritems(reindexed):
            assert tm.equalContents(series.index, reindexed.index)

    new_frame = self.frame.reindex(self.ts1.index)
    assert_consistent_with_source(new_frame)

    empty_frame = self.frame.reindex(Index([]))
    assert len(empty_frame.index) == 0

    # Cython code should be unit-tested directly
    non_contig_frame = self.frame.reindex(self.ts1.index[::2])
    assert_consistent_with_source(non_contig_frame)

    # corner cases

    # Same index, copies values but not index if copy=False
    new_frame = self.frame.reindex(self.frame.index, copy=False)
    assert new_frame.index is self.frame.index

    # length zero
    new_frame = self.frame.reindex([])
    assert new_frame.empty
    assert len(new_frame.columns) == len(self.frame.columns)

    # length zero with columns reindexed with non-empty index
    new_frame = self.frame.reindex([]).reindex(self.frame.index)
    assert len(new_frame.index) == len(self.frame.index)
    assert len(new_frame.columns) == len(self.frame.columns)

    # pass non-Index
    new_frame = self.frame.reindex(list(self.ts1.index))
    tm.assert_index_equal(new_frame.index, self.ts1.index)

    # copy with no axes
    result = self.frame.reindex()
    assert_frame_equal(result, self.frame)
    assert result is not self.frame
- def test_reindex_nan(self):
- df = pd.DataFrame([[1, 2], [3, 5], [7, 11], [9, 23]],
- index=[2, np.nan, 1, 5],
- columns=['joe', 'jim'])
- i, j = [np.nan, 5, 5, np.nan, 1, 2, np.nan], [1, 3, 3, 1, 2, 0, 1]
- assert_frame_equal(df.reindex(i), df.iloc[j])
- df.index = df.index.astype('object')
- assert_frame_equal(df.reindex(i), df.iloc[j], check_index_type=False)
- # GH10388
- df = pd.DataFrame({'other': ['a', 'b', np.nan, 'c'],
- 'date': ['2015-03-22', np.nan,
- '2012-01-08', np.nan],
- 'amount': [2, 3, 4, 5]})
- df['date'] = pd.to_datetime(df.date)
- df['delta'] = (pd.to_datetime('2015-06-18') - df['date']).shift(1)
- left = df.set_index(['delta', 'other', 'date']).reset_index()
- right = df.reindex(columns=['delta', 'other', 'date', 'amount'])
- assert_frame_equal(left, right)
- def test_reindex_name_remains(self):
- s = Series(np.random.rand(10))
- df = DataFrame(s, index=np.arange(len(s)))
- i = Series(np.arange(10), name='iname')
- df = df.reindex(i)
- assert df.index.name == 'iname'
- df = df.reindex(Index(np.arange(10), name='tmpname'))
- assert df.index.name == 'tmpname'
- s = Series(np.random.rand(10))
- df = DataFrame(s.T, index=np.arange(len(s)))
- i = Series(np.arange(10), name='iname')
- df = df.reindex(columns=i)
- assert df.columns.name == 'iname'
- def test_reindex_int(self):
- smaller = self.intframe.reindex(self.intframe.index[::2])
- assert smaller['A'].dtype == np.int64
- bigger = smaller.reindex(self.intframe.index)
- assert bigger['A'].dtype == np.float64
- smaller = self.intframe.reindex(columns=['A', 'B'])
- assert smaller['A'].dtype == np.int64
- def test_reindex_like(self):
- other = self.frame.reindex(index=self.frame.index[:10],
- columns=['C', 'B'])
- assert_frame_equal(other, self.frame.reindex_like(other))
- def test_reindex_columns(self):
- new_frame = self.frame.reindex(columns=['A', 'B', 'E'])
- tm.assert_series_equal(new_frame['B'], self.frame['B'])
- assert np.isnan(new_frame['E']).all()
- assert 'C' not in new_frame
- # Length zero
- new_frame = self.frame.reindex(columns=[])
- assert new_frame.empty
- def test_reindex_columns_method(self):
- # GH 14992, reindexing over columns ignored method
- df = DataFrame(data=[[11, 12, 13], [21, 22, 23], [31, 32, 33]],
- index=[1, 2, 4],
- columns=[1, 2, 4],
- dtype=float)
- # default method
- result = df.reindex(columns=range(6))
- expected = DataFrame(data=[[np.nan, 11, 12, np.nan, 13, np.nan],
- [np.nan, 21, 22, np.nan, 23, np.nan],
- [np.nan, 31, 32, np.nan, 33, np.nan]],
- index=[1, 2, 4],
- columns=range(6),
- dtype=float)
- assert_frame_equal(result, expected)
- # method='ffill'
- result = df.reindex(columns=range(6), method='ffill')
- expected = DataFrame(data=[[np.nan, 11, 12, 12, 13, 13],
- [np.nan, 21, 22, 22, 23, 23],
- [np.nan, 31, 32, 32, 33, 33]],
- index=[1, 2, 4],
- columns=range(6),
- dtype=float)
- assert_frame_equal(result, expected)
- # method='bfill'
- result = df.reindex(columns=range(6), method='bfill')
- expected = DataFrame(data=[[11, 11, 12, 13, 13, np.nan],
- [21, 21, 22, 23, 23, np.nan],
- [31, 31, 32, 33, 33, np.nan]],
- index=[1, 2, 4],
- columns=range(6),
- dtype=float)
- assert_frame_equal(result, expected)
- def test_reindex_axes(self):
- # GH 3317, reindexing by both axes loses freq of the index
- df = DataFrame(np.ones((3, 3)),
- index=[datetime(2012, 1, 1),
- datetime(2012, 1, 2),
- datetime(2012, 1, 3)],
- columns=['a', 'b', 'c'])
- time_freq = date_range('2012-01-01', '2012-01-03', freq='d')
- some_cols = ['a', 'b']
- index_freq = df.reindex(index=time_freq).index.freq
- both_freq = df.reindex(index=time_freq, columns=some_cols).index.freq
- seq_freq = df.reindex(index=time_freq).reindex(
- columns=some_cols).index.freq
- assert index_freq == both_freq
- assert index_freq == seq_freq
def test_reindex_fill_value(self):
    """fill_value is applied by reindex and the deprecated reindex_axis."""
    frame = DataFrame(np.random.randn(10, 4))

    # axis=0
    longer = frame.reindex(lrange(15))
    assert np.isnan(longer.values[-5:]).all()

    result = frame.reindex(lrange(15), fill_value=0)
    expected = frame.reindex(lrange(15)).fillna(0)
    assert_frame_equal(result, expected)

    # axis=1
    for fill in (0., 0, 'foo'):
        result = frame.reindex(columns=lrange(5), fill_value=fill)
        expected = frame.copy()
        expected[4] = fill
        assert_frame_equal(result, expected)

    # reindex_axis
    with tm.assert_produces_warning(FutureWarning):
        result = frame.reindex_axis(lrange(15), fill_value=0., axis=0)
    expected = frame.reindex(lrange(15)).fillna(0)
    assert_frame_equal(result, expected)

    with tm.assert_produces_warning(FutureWarning):
        result = frame.reindex_axis(lrange(5), fill_value=0., axis=1)
    expected = frame.reindex(columns=lrange(5)).fillna(0)
    assert_frame_equal(result, expected)

    # other dtypes
    frame['foo'] = 'foo'
    result = frame.reindex(lrange(15), fill_value=0)
    expected = frame.reindex(lrange(15)).fillna(0)
    assert_frame_equal(result, expected)
def test_reindex_dups(self):
    """Reindexing a frame whose index has duplicates raises ValueError."""
    # GH4746, reindex on duplicate index error messages
    values = np.random.randn(10)
    duplicated = DataFrame(values, index=[1, 2, 3, 4, 5, 1, 2, 3, 4, 5])
    fresh_labels = list(range(len(duplicated)))

    # set index is ok
    result = duplicated.copy()
    result.index = fresh_labels
    expected = DataFrame(values, index=list(range(len(duplicated))))
    assert_frame_equal(result, expected)

    # reindex fails
    with pytest.raises(ValueError):
        duplicated.reindex(index=list(range(len(duplicated))))
- def test_reindex_axis_style(self):
- # https://github.com/pandas-dev/pandas/issues/12392
- df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
- expected = pd.DataFrame({"A": [1, 2, np.nan], "B": [4, 5, np.nan]},
- index=[0, 1, 3])
- result = df.reindex([0, 1, 3])
- assert_frame_equal(result, expected)
- result = df.reindex([0, 1, 3], axis=0)
- assert_frame_equal(result, expected)
- result = df.reindex([0, 1, 3], axis='index')
- assert_frame_equal(result, expected)
def test_reindex_positional_warns(self):
    """Passing index and columns positionally emits a FutureWarning."""
    # https://github.com/pandas-dev/pandas/issues/12392
    df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    expected = pd.DataFrame({"A": [1., 2], 'B': [4., 5],
                             "C": [np.nan, np.nan]})

    rows, columns = [0, 1], ['A', 'B', 'C']
    with tm.assert_produces_warning(FutureWarning):
        result = df.reindex(rows, columns)

    assert_frame_equal(result, expected)
def test_reindex_axis_style_raises(self):
    """Conflicting reindex argument styles raise TypeError."""
    # https://github.com/pandas-dev/pandas/issues/12392
    df = pd.DataFrame({"A": [1, 2, 3], 'B': [4, 5, 6]})
    both_axis = "Cannot specify both 'axis'"

    def assert_rejected(pattern, *args, **kwargs):
        # The call must fail with a TypeError whose message matches.
        with pytest.raises(TypeError, match=pattern):
            df.reindex(*args, **kwargs)

    assert_rejected(both_axis, [0, 1], ['A'], axis=1)
    assert_rejected(both_axis, [0, 1], ['A'], axis='index')
    assert_rejected(both_axis, index=[0, 1], axis='index')
    assert_rejected(both_axis, index=[0, 1], axis='columns')
    assert_rejected(both_axis, columns=[0, 1], axis='columns')
    assert_rejected(both_axis, index=[0, 1], columns=[0, 1],
                    axis='columns')

    assert_rejected('Cannot specify all', [0, 1], [0], ['A'])

    # Mixing styles
    assert_rejected(both_axis, index=[0, 1], axis='index')
    assert_rejected(both_axis, index=[0, 1], axis='columns')

    # Duplicates
    with pytest.raises(TypeError, match="multiple values"):
        df.reindex([0, 1], labels=[0, 1])
- def test_reindex_single_named_indexer(self):
- # https://github.com/pandas-dev/pandas/issues/12392
- df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]})
- result = df.reindex([0, 1], columns=['A'])
- expected = pd.DataFrame({"A": [1, 2]})
- assert_frame_equal(result, expected)
def test_reindex_api_equivalence(self):
    """labels/axis and index/columns spellings of reindex() agree."""
    # https://github.com/pandas-dev/pandas/issues/12392
    # equivalence of the labels/axis and index/columns API's
    df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]],
                   index=['a', 'b', 'c'],
                   columns=['d', 'e', 'f'])

    # Row reindexing.
    reference = df.reindex(['b', 'a'])
    row_variants = [
        df.reindex(index=['b', 'a']),
        df.reindex(labels=['b', 'a']),
        df.reindex(labels=['b', 'a'], axis=0),
        df.reindex(['b', 'a'], axis=0),
    ]
    for variant in row_variants:
        tm.assert_frame_equal(reference, variant)

    # Column reindexing.
    reference = df.reindex(columns=['e', 'd'])
    column_variants = [
        df.reindex(['e', 'd'], axis=1),
        df.reindex(labels=['e', 'd'], axis=1),
    ]
    for variant in column_variants:
        tm.assert_frame_equal(reference, variant)

    # Both axes; the purely positional form warns.
    with tm.assert_produces_warning(FutureWarning) as m:
        reference = df.reindex(['b', 'a'], ['e', 'd'])
    assert 'reindex' in str(m[0].message)

    by_keywords = df.reindex(columns=['e', 'd'], index=['b', 'a'])
    rows_first = df.reindex(labels=['b', 'a'], axis=0)
    chained = rows_first.reindex(labels=['e', 'd'], axis=1)
    for variant in [by_keywords, chained]:
        tm.assert_frame_equal(reference, variant)
def test_align(self):
    """Exercise DataFrame.align across axes, joins, fills and Series.

    Uses the ``frame``, ``intframe``, ``mixed_frame``, ``mixed_float`` and
    ``mixed_int`` fixtures found on ``self``.
    """
    af, bf = self.frame.align(self.frame)
    assert af._data is not self.frame._data

    af, bf = self.frame.align(self.frame, copy=False)
    assert af._data is self.frame._data

    # axis = 0
    other = self.frame.iloc[:-5, :3]
    af, bf = self.frame.align(other, axis=0, fill_value=-1)
    tm.assert_index_equal(bf.columns, other.columns)

    # test fill value
    # NOTE(review): ``other`` is a slice of self.frame, so both index
    # differences below are presumably empty and these checks may be
    # vacuous -- confirm before relying on them.
    join_idx = self.frame.index.join(other.index)
    diff_a = self.frame.index.difference(join_idx)
    diff_b = other.index.difference(join_idx)
    diff_a_vals = af.reindex(diff_a).values
    diff_b_vals = bf.reindex(diff_b).values
    assert (diff_a_vals == -1).all()
    # Previously computed but never checked.
    assert (diff_b_vals == -1).all()

    af, bf = self.frame.align(other, join='right', axis=0)
    tm.assert_index_equal(bf.columns, other.columns)
    tm.assert_index_equal(bf.index, other.index)
    tm.assert_index_equal(af.index, other.index)

    # axis = 1
    other = self.frame.iloc[:-5, :3].copy()
    af, bf = self.frame.align(other, axis=1)
    tm.assert_index_equal(bf.columns, self.frame.columns)
    tm.assert_index_equal(bf.index, other.index)

    # test fill value
    join_idx = self.frame.index.join(other.index)
    diff_a = self.frame.index.difference(join_idx)
    diff_b = other.index.difference(join_idx)
    diff_a_vals = af.reindex(diff_a).values
    diff_b_vals = bf.reindex(diff_b).values
    assert (diff_a_vals == -1).all()
    # Resolves the old "unused?" TODO: check the right-hand side as well.
    assert (diff_b_vals == -1).all()

    af, bf = self.frame.align(other, join='inner', axis=1)
    tm.assert_index_equal(bf.columns, other.columns)

    af, bf = self.frame.align(other, join='inner', axis=1, method='pad')
    tm.assert_index_equal(bf.columns, other.columns)

    # test other non-float types
    af, bf = self.intframe.align(other, join='inner', axis=1, method='pad')
    tm.assert_index_equal(bf.columns, other.columns)

    af, bf = self.mixed_frame.align(self.mixed_frame,
                                    join='inner', axis=1, method='pad')
    tm.assert_index_equal(bf.columns, self.mixed_frame.columns)

    af, bf = self.frame.align(other.iloc[:, 0], join='inner', axis=1,
                              method=None, fill_value=None)
    tm.assert_index_equal(bf.index, Index([]))

    af, bf = self.frame.align(other.iloc[:, 0], join='inner', axis=1,
                              method=None, fill_value=0)
    tm.assert_index_equal(bf.index, Index([]))

    # mixed floats/ints
    af, bf = self.mixed_float.align(other.iloc[:, 0], join='inner', axis=1,
                                    method=None, fill_value=0)
    tm.assert_index_equal(bf.index, Index([]))

    af, bf = self.mixed_int.align(other.iloc[:, 0], join='inner', axis=1,
                                  method=None, fill_value=0)
    tm.assert_index_equal(bf.index, Index([]))

    # Try to align DataFrame to Series along bad axis
    with pytest.raises(ValueError):
        self.frame.align(af.iloc[0, :3], join='inner', axis=2)

    # align dataframe to series with broadcast or not
    idx = self.frame.index
    s = Series(range(len(idx)), index=idx)

    left, right = self.frame.align(s, axis=0)
    tm.assert_index_equal(left.index, self.frame.index)
    tm.assert_index_equal(right.index, self.frame.index)
    assert isinstance(right, Series)

    left, right = self.frame.align(s, broadcast_axis=1)
    tm.assert_index_equal(left.index, self.frame.index)
    # The broadcast Series should appear once per column of self.frame.
    expected = {c: s for c in self.frame.columns}
    expected = DataFrame(expected, index=self.frame.index,
                         columns=self.frame.columns)
    tm.assert_frame_equal(right, expected)

    # see gh-9558
    df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    result = df[df['a'] == 2]
    expected = DataFrame([[2, 5]], index=[1], columns=['a', 'b'])
    tm.assert_frame_equal(result, expected)

    result = df.where(df['a'] == 2, 0)
    expected = DataFrame({'a': [0, 2, 0], 'b': [0, 5, 0]})
    tm.assert_frame_equal(result, expected)
- def _check_align(self, a, b, axis, fill_axis, how, method, limit=None):
- aa, ab = a.align(b, axis=axis, join=how, method=method, limit=limit,
- fill_axis=fill_axis)
- join_index, join_columns = None, None
- ea, eb = a, b
- if axis is None or axis == 0:
- join_index = a.index.join(b.index, how=how)
- ea = ea.reindex(index=join_index)
- eb = eb.reindex(index=join_index)
- if axis is None or axis == 1:
- join_columns = a.columns.join(b.columns, how=how)
- ea = ea.reindex(columns=join_columns)
- eb = eb.reindex(columns=join_columns)
- ea = ea.fillna(axis=fill_axis, method=method, limit=limit)
- eb = eb.fillna(axis=fill_axis, method=method, limit=limit)
- assert_frame_equal(aa, ea)
- assert_frame_equal(ab, eb)
@pytest.mark.parametrize('meth', ['pad', 'bfill'])
@pytest.mark.parametrize('ax', [0, 1, None])
@pytest.mark.parametrize('fax', [0, 1])
@pytest.mark.parametrize('how', ['inner', 'outer', 'left', 'right'])
def test_align_fill_method(self, how, meth, ax, fax):
    """Run the align/fill checks for every parametrized combination."""
    self._check_align_fill(kind=how, meth=meth, ax=ax, fax=fax)
def _check_align_fill(self, kind, meth, ax, fax):
    """Call _check_align on full and empty slices of self.frame.

    Each left/right pairing is checked without a limit and with limit=1.
    """
    left = self.frame.iloc[0:4, :10]
    right = self.frame.iloc[2:, 6:]
    empty = self.frame.iloc[:0, :0]

    pairings = (
        (left, right),
        (empty, right),   # empty left
        (left, empty),    # empty right
        (empty, empty),   # both empty
    )
    for lhs, rhs in pairings:
        for limit_kwargs in ({}, {'limit': 1}):
            self._check_align(lhs, rhs, axis=ax, fill_axis=fax,
                              how=kind, method=meth, **limit_kwargs)
- def test_align_int_fill_bug(self):
- # GH #910
- X = np.arange(10 * 10, dtype='float64').reshape(10, 10)
- Y = np.ones((10, 1), dtype=int)
- df1 = DataFrame(X)
- df1['0.X'] = Y.squeeze()
- df2 = df1.astype(float)
- result = df1 - df1.mean()
- expected = df2 - df2.mean()
- assert_frame_equal(result, expected)
- def test_align_multiindex(self):
- # GH 10665
- # same test cases as test_align_multiindex in test_series.py
- midx = pd.MultiIndex.from_product([range(2), range(3), range(2)],
- names=('a', 'b', 'c'))
- idx = pd.Index(range(2), name='b')
- df1 = pd.DataFrame(np.arange(12, dtype='int64'), index=midx)
- df2 = pd.DataFrame(np.arange(2, dtype='int64'), index=idx)
- # these must be the same results (but flipped)
- res1l, res1r = df1.align(df2, join='left')
- res2l, res2r = df2.align(df1, join='right')
- expl = df1
- assert_frame_equal(expl, res1l)
- assert_frame_equal(expl, res2r)
- expr = pd.DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx)
- assert_frame_equal(expr, res1r)
- assert_frame_equal(expr, res2l)
- res1l, res1r = df1.align(df2, join='right')
- res2l, res2r = df2.align(df1, join='left')
- exp_idx = pd.MultiIndex.from_product([range(2), range(2), range(2)],
- names=('a', 'b', 'c'))
- expl = pd.DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx)
- assert_frame_equal(expl, res1l)
- assert_frame_equal(expl, res2r)
- expr = pd.DataFrame([0, 0, 1, 1] * 2, index=exp_idx)
- assert_frame_equal(expr, res1r)
- assert_frame_equal(expr, res2l)
- def test_align_series_combinations(self):
- df = pd.DataFrame({'a': [1, 3, 5],
- 'b': [1, 3, 5]}, index=list('ACE'))
- s = pd.Series([1, 2, 4], index=list('ABD'), name='x')
- # frame + series
- res1, res2 = df.align(s, axis=0)
- exp1 = pd.DataFrame({'a': [1, np.nan, 3, np.nan, 5],
- 'b': [1, np.nan, 3, np.nan, 5]},
- index=list('ABCDE'))
- exp2 = pd.Series([1, 2, np.nan, 4, np.nan],
- index=list('ABCDE'), name='x')
- tm.assert_frame_equal(res1, exp1)
- tm.assert_series_equal(res2, exp2)
- # series + frame
- res1, res2 = s.align(df)
- tm.assert_series_equal(res1, exp2)
- tm.assert_frame_equal(res2, exp1)
def test_filter(self):
    """Exercise DataFrame.filter with items, like, regex and bad input."""
    # Items
    wanted = ['A', 'B', 'E']
    for axis_kwargs in ({}, {'axis': 'columns'}):
        filtered = self.frame.filter(wanted, **axis_kwargs)
        assert len(filtered.columns) == 2
        assert 'E' not in filtered

    # Other axis
    idx = self.frame.index[0:4]
    filtered = self.frame.filter(idx, axis='index')
    expected = self.frame.reindex(index=idx)
    tm.assert_frame_equal(filtered, expected)

    # like
    fcopy = self.frame.copy()
    fcopy['AA'] = 1

    filtered = fcopy.filter(like='A')
    assert len(filtered.columns) == 2
    assert 'AA' in filtered

    # like with ints in column names
    df = DataFrame(0., index=[0, 1, 2], columns=[0, 1, '_A', '_B'])
    filtered = df.filter(like='_')
    assert len(filtered.columns) == 2

    # regex with ints in column names
    # from PR #10384
    df = DataFrame(0., index=[0, 1, 2], columns=['A1', 1, 'B', 2, 'C'])
    expected = DataFrame(
        0., index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object))
    filtered = df.filter(regex='^[0-9]+$')
    tm.assert_frame_equal(filtered, expected)

    expected = DataFrame(0., index=[0, 1, 2], columns=[0, '0', 1, '1'])
    # shouldn't remove anything
    filtered = expected.filter(regex='^[0-9]+$')
    tm.assert_frame_equal(filtered, expected)

    # pass in None
    for kwargs in ({}, {'items': None}, {'axis': 1}):
        with pytest.raises(TypeError, match='Must pass'):
            self.frame.filter(**kwargs)

    # test mutually exclusive arguments
    conflicting = [
        {'regex': 'e$', 'like': 'bbi'},
        {'regex': 'e$', 'axis': 1},
        {'regex': 'e$'},
        {'like': 'bbi', 'axis': 0},
        {'like': 'bbi'},
    ]
    for kwargs in conflicting:
        with pytest.raises(TypeError, match='mutually exclusive'):
            self.frame.filter(items=['one', 'three'], **kwargs)

    # objects
    filtered = self.mixed_frame.filter(like='foo')
    assert 'foo' in filtered

    # unicode columns, won't ascii-encode
    df = self.frame.rename(columns={'B': u('\u2202')})
    filtered = df.filter(like='C')
    assert 'C' in filtered
- def test_filter_regex_search(self):
- fcopy = self.frame.copy()
- fcopy['AA'] = 1
- # regex
- filtered = fcopy.filter(regex='[A]+')
- assert len(filtered.columns) == 2
- assert 'AA' in filtered
- # doesn't have to be at beginning
- df = DataFrame({'aBBa': [1, 2],
- 'BBaBB': [1, 2],
- 'aCCa': [1, 2],
- 'aCCaBB': [1, 2]})
- result = df.filter(regex='BB')
- exp = df[[x for x in df.columns if 'BB' in x]]
- assert_frame_equal(result, exp)
@pytest.mark.parametrize('name,expected', [
    ('a', DataFrame({u'a': [1, 2]})),
    (u'a', DataFrame({u'a': [1, 2]})),
    (u'あ', DataFrame({u'あ': [3, 4]}))
])
def test_filter_unicode(self, name, expected):
    """filter(like=...) and filter(regex=...) handle unicode names."""
    # GH13101
    frame = DataFrame({u'a': [1, 2], u'あ': [3, 4]})

    by_like = frame.filter(like=name)
    assert_frame_equal(by_like, expected)

    by_regex = frame.filter(regex=name)
    assert_frame_equal(by_regex, expected)
@pytest.mark.parametrize('name', ['a', u'a'])
def test_filter_bytestring(self, name):
    """filter(like=...) and filter(regex=...) handle bytes column names."""
    # GH13101
    frame = DataFrame({b'a': [1, 2], b'b': [3, 4]})
    expected = DataFrame({b'a': [1, 2]})

    by_like = frame.filter(like=name)
    assert_frame_equal(by_like, expected)

    by_regex = frame.filter(regex=name)
    assert_frame_equal(by_regex, expected)
def test_filter_corner(self):
    # Filtering an empty frame, by an empty item list or by a ``like``
    # substring, must give back a frame equal to the empty input.
    blank = DataFrame()

    by_items = blank.filter([])
    assert_frame_equal(by_items, blank)

    by_like = blank.filter(like='foo')
    assert_frame_equal(by_like, blank)
def test_select(self):
    # deprecated: gh-12410
    # Checks that ``select`` warns and that the ``.loc``-based replacement
    # produces the same frames.
    def is_weekday_two(stamp):
        return stamp.weekday() == 2

    keep = [is_weekday_two(stamp) for stamp in self.tsframe.index]
    index = self.tsframe.index[keep]
    expected_weekdays = self.tsframe.reindex(index=index)

    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        result = self.tsframe.select(is_weekday_two, axis=0)
        assert_frame_equal(result, expected_weekdays)

        result = self.frame.select(lambda x: x in ('B', 'D'), axis=1)
        expected = self.frame.reindex(columns=['B', 'D'])
        assert_frame_equal(result, expected, check_names=False)

    # replacement
    # Here ``weekday`` is read as an attribute of the whole index, giving
    # one boolean per row instead of calling a method per element.
    weekday_mask = self.tsframe.index.weekday == 2
    result = self.tsframe.loc(axis=0)[weekday_mask]
    assert_frame_equal(result, expected_weekdays)

    def is_b_or_d(label):
        return label in ['B', 'D']

    column_mask = self.frame.columns.map(is_b_or_d)
    result = self.frame.loc(axis=1)[(column_mask)]
    expected = self.frame.reindex(columns=['B', 'D'])
    assert_frame_equal(result, expected, check_names=False)

    # doc example
    df = DataFrame({'A': [1, 2, 3]}, index=['foo', 'bar', 'baz'])

    def is_bar_or_baz(label):
        return label in ['bar', 'baz']

    with tm.assert_produces_warning(FutureWarning):
        expected = df.select(is_bar_or_baz)
    result = df.loc[df.index.map(is_bar_or_baz)]
    assert_frame_equal(result, expected, check_names=False)
def test_take(self):
    # Row takes are compared with ``reindex`` on the taken index labels;
    # column takes are compared with an explicit ``.loc`` column selection.

    # homogeneous
    df = self.frame
    order = [3, 1, 2, 0]

    result = df.take(order, axis=0)
    expected = df.reindex(df.index.take(order))
    assert_frame_equal(result, expected)

    # axis = 1
    result = df.take(order, axis=1)
    expected = df.loc[:, ['D', 'B', 'C', 'A']]
    assert_frame_equal(result, expected, check_names=False)

    # negative indices
    order = [2, 1, -1]

    result = df.take(order, axis=0)
    expected = df.reindex(df.index.take(order))
    assert_frame_equal(result, expected)

    # Passing ``convert`` (with either value) is expected to warn while
    # still returning the same rows.
    for convert_flag in (True, False):
        with tm.assert_produces_warning(FutureWarning):
            result = df.take(order, convert=convert_flag, axis=0)
            assert_frame_equal(result, expected)

    # axis = 1
    result = df.take(order, axis=1)
    expected = df.loc[:, ['C', 'B', 'D']]
    assert_frame_equal(result, expected, check_names=False)

    # illegal indices
    out_of_bounds = (
        ([3, 1, 2, 30], 0),
        ([3, 1, 2, -31], 0),
        ([3, 1, 2, 5], 1),
        ([3, 1, 2, -5], 1),
    )
    for bad_order, axis in out_of_bounds:
        with pytest.raises(IndexError):
            df.take(bad_order, axis=axis)

    # mixed-dtype
    df = self.mixed_frame
    order = [4, 1, 2, 0, 3]

    result = df.take(order, axis=0)
    expected = df.reindex(df.index.take(order))
    assert_frame_equal(result, expected)

    # axis = 1
    result = df.take(order, axis=1)
    expected = df.loc[:, ['foo', 'B', 'C', 'A', 'D']]
    assert_frame_equal(result, expected)

    # negative indices
    order = [4, 1, -2]

    result = df.take(order, axis=0)
    expected = df.reindex(df.index.take(order))
    assert_frame_equal(result, expected)

    # axis = 1
    result = df.take(order, axis=1)
    expected = df.loc[:, ['foo', 'B', 'D']]
    assert_frame_equal(result, expected)

    # by dtype
    order = [1, 2, 0, 3]
    for df in (self.mixed_float, self.mixed_int):
        result = df.take(order, axis=0)
        expected = df.reindex(df.index.take(order))
        assert_frame_equal(result, expected)

        # axis = 1
        result = df.take(order, axis=1)
        expected = df.loc[:, ['B', 'C', 'A', 'D']]
        assert_frame_equal(result, expected)
def test_reindex_boolean(self):
    # An all-bool frame that gains missing rows or columns through reindex
    # is expected to report object-dtype values with NA in the new slots.
    frame = DataFrame(np.ones((10, 2), dtype=bool),
                      index=np.arange(0, 20, 2),
                      columns=[0, 2])

    # Row label 1 is not in the even-numbered original index.
    by_rows = frame.reindex(np.arange(10))
    assert by_rows.values.dtype == np.object_
    assert isna(by_rows[0][1])

    # Column label 1 is not among the original columns [0, 2].
    by_columns = frame.reindex(columns=lrange(3))
    assert by_columns.values.dtype == np.object_
    assert isna(by_columns[1]).all()
def test_reindex_objects(self):
    # The 'foo' column must survive a reindex that names it and be absent
    # from one that does not.
    with_foo = self.mixed_frame.reindex(columns=['foo', 'A', 'B'])
    assert 'foo' in with_foo

    without_foo = self.mixed_frame.reindex(columns=['A', 'B'])
    assert 'foo' not in without_foo
def test_reindex_corner(self):
    # Reindexing the rows and then the columns of the empty fixture must
    # leave exactly the requested column labels.
    labels = Index(['a', 'b', 'c'])
    with_rows = self.empty.reindex(index=[1, 2, 3])
    relabelled = with_rows.reindex(columns=labels)
    tm.assert_index_equal(relabelled.columns, labels)

    # ints are weird
    # The newly introduced column 'E' is asserted to come back as float64.
    narrowed = self.intframe.reindex(columns=['A', 'B', 'E'])
    assert narrowed['E'].dtype == np.float64
def test_reindex_axis(self):
    # ``reindex_axis`` is expected to emit a FutureWarning whose message
    # mentions 'reindex', and to agree with the equivalent ``reindex`` call.
    cols = ['A', 'B', 'E']
    with tm.assert_produces_warning(FutureWarning) as caught:
        via_axis = self.intframe.reindex_axis(cols, axis=1)
        assert 'reindex' in str(caught[0].message)
    via_reindex = self.intframe.reindex(columns=cols)
    assert_frame_equal(via_axis, via_reindex)

    rows = self.intframe.index[0:5]
    with tm.assert_produces_warning(FutureWarning) as caught:
        via_axis = self.intframe.reindex_axis(rows, axis=0)
        assert 'reindex' in str(caught[0].message)
    via_reindex = self.intframe.reindex(index=rows)
    assert_frame_equal(via_axis, via_reindex)

    # axis=2 is rejected for this frame.
    with pytest.raises(ValueError):
        self.intframe.reindex_axis(rows, axis=2)

    # no-op case
    same_cols = self.frame.columns.copy()
    with tm.assert_produces_warning(FutureWarning) as caught:
        unchanged = self.frame.reindex_axis(same_cols, axis=1)
        assert 'reindex' in str(caught[0].message)
    assert_frame_equal(unchanged, self.frame)
def test_reindex_with_nans(self):
    # The index itself contains a NaN label (position 2); reindexing by
    # real labels must still return the matching rows.
    df = DataFrame([[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]],
                   columns=['a', 'b'],
                   index=[100.0, 101.0, np.nan, 102.0, 103.0])

    # (labels requested, positions of the rows expected back)
    cases = (
        ([101.0, 102.0, 103.0], [1, 3, 4]),
        ([103.0], [4]),
        ([101.0], [1]),
    )
    for labels, positions in cases:
        result = df.reindex(index=labels)
        expected = df.iloc[positions]
        assert_frame_equal(result, expected)
def test_reindex_multi(self):
    # Reindexing rows and columns in one call must equal doing the two
    # reindexes one after the other.
    # Frames are created in the original order so the sequence of random
    # draws is unchanged.
    sized_cases = [
        (DataFrame(np.random.randn(3, 3)), 4),
        (DataFrame(np.random.randint(0, 10, (3, 3))), 4),
        (DataFrame(np.random.randint(0, 10, (3, 3))), 2),
    ]
    for df, size in sized_cases:
        result = df.reindex(index=lrange(size), columns=lrange(size))
        expected = df.reindex(lrange(size)).reindex(columns=lrange(size))
        assert_frame_equal(result, expected)

    # Complex-valued frame with labelled columns.
    df = DataFrame(np.random.randn(5, 3) + 1j, columns=['a', 'b', 'c'])
    result = df.reindex(index=[0, 1], columns=['a', 'b'])
    expected = df.reindex([0, 1]).reindex(columns=['a', 'b'])
    assert_frame_equal(result, expected)
def test_reindex_multi_categorical_time(self):
    # https://github.com/pandas-dev/pandas/issues/21390
    letters = Categorical(['a', 'b', 'c'])
    stamps = Categorical(date_range("2012-01-01", periods=3, freq='H'))
    midx = pd.MultiIndex.from_product([letters, stamps])
    df = pd.DataFrame({'a': range(len(midx))}, index=midx)

    # Leave out position 7 so that reindexing back onto the full index has
    # exactly one missing row to fill.
    partial = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]]

    result = partial.reindex(midx)
    expected = pd.DataFrame(
        {'a': [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx)
    assert_frame_equal(result, expected)
# Shared cell values for the parametrized frames below.
data = [[1, 2, 3], [1, 2, 3]]


@pytest.mark.parametrize('actual', [
    DataFrame(data=data, index=['a', 'a']),
    DataFrame(data=data, index=['a', 'b']),
    DataFrame(data=data, index=['a', 'b']).set_index([0, 1]),
    DataFrame(data=data, index=['a', 'a']).set_index([0, 1])
])
def test_raise_on_drop_duplicate_index(self, actual):
    # issue 19186
    # Dropping the absent label 'c' must raise KeyError on either axis,
    # and must return the frame unchanged when errors='ignore'.
    if isinstance(actual.index, MultiIndex):
        level = 0
    else:
        level = None
    transposed = actual.T

    with pytest.raises(KeyError):
        actual.drop('c', level=level, axis=0)
    with pytest.raises(KeyError):
        transposed.drop('c', level=level, axis=1)

    kept_rows = actual.drop('c', axis=0, level=level,
                            errors='ignore')
    assert_frame_equal(kept_rows, actual)

    kept_columns = transposed.drop('c', axis=1, level=level,
                                   errors='ignore')
    assert_frame_equal(kept_columns.T, actual)
@pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 2]])
@pytest.mark.parametrize('drop_labels', [[], [1], [2]])
def test_drop_empty_list(self, index, drop_labels):
    # GH 21494
    # Every index entry not named in ``drop_labels`` must remain, including
    # repeated labels.
    remaining = [label for label in index if label not in drop_labels]
    expected = pd.DataFrame(index=remaining)

    source = pd.DataFrame(index=index)
    frame = source.drop(drop_labels)
    tm.assert_frame_equal(frame, expected)
@pytest.mark.parametrize('index', [[1, 2, 3], [1, 2, 2]])
@pytest.mark.parametrize('drop_labels', [[1, 4], [4, 5]])
def test_drop_non_empty_list(self, index, drop_labels):
    # GH 21494
    # Each ``drop_labels`` case contains at least one label missing from
    # the index, so the drop must fail with the 'not found in axis' error.
    frame = pd.DataFrame(index=index)
    with pytest.raises(KeyError, match='not found in axis'):
        frame.drop(drop_labels)
|