12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863 |
- # -*- coding: utf-8 -*-
- from __future__ import print_function
- import datetime
- from distutils.version import LooseVersion
- import dateutil
- import numpy as np
- import pytest
- from pandas.compat import lrange
- import pandas.util._test_decorators as td
- import pandas as pd
- from pandas import Categorical, DataFrame, Series, Timestamp, date_range
- from pandas.tests.frame.common import TestData, _check_mixed_float
- import pandas.util.testing as tm
- from pandas.util.testing import assert_frame_equal, assert_series_equal
# Record whether the installed scipy is at least 0.19.0.  The expected values
# for the 'cubic' and 'quadratic' interpolation tests further down branch on
# this flag.  When scipy cannot be imported the flag is simply False.
try:
    import scipy
except ImportError:
    _is_scipy_ge_0190 = False
else:
    _installed_scipy = LooseVersion(scipy.__version__)
    _is_scipy_ge_0190 = _installed_scipy >= LooseVersion('0.19.0')
def _skip_if_no_pchip():
    """Skip the calling test when ``scipy.interpolate.pchip_interpolate``
    cannot be imported.

    Uses the module-level ``pytest`` import; the function returns ``None``
    when the import succeeds.
    """
    try:
        from scipy.interpolate import pchip_interpolate  # noqa
    except ImportError:
        pytest.skip('scipy.interpolate.pchip missing')
class TestDataFrameMissingData(TestData):
    """Tests for ``DataFrame.dropna`` and ``DataFrame.fillna``.

    Several tests use the ``frame``, ``tsframe``, ``mixed_frame`` and
    ``mixed_float`` attributes inherited from ``TestData``.
    """

    def test_dropEmptyRows(self):
        """``dropna(how='all')`` drops all-NaN rows, with and without
        ``subset`` / ``inplace``, and leaves the source frame untouched."""
        N = len(self.frame.index)
        mat = np.random.randn(N)
        mat[:5] = np.nan

        frame = DataFrame({'foo': mat}, index=self.frame.index)
        original = Series(mat, index=self.frame.index, name='foo')
        expected = original.dropna()
        inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()

        smaller_frame = frame.dropna(how='all')
        # check that original was preserved
        assert_series_equal(frame['foo'], original)
        inplace_frame1.dropna(how='all', inplace=True)
        assert_series_equal(smaller_frame['foo'], expected)
        assert_series_equal(inplace_frame1['foo'], expected)

        smaller_frame = frame.dropna(how='all', subset=['foo'])
        inplace_frame2.dropna(how='all', subset=['foo'], inplace=True)
        assert_series_equal(smaller_frame['foo'], expected)
        assert_series_equal(inplace_frame2['foo'], expected)

    def test_dropIncompleteRows(self):
        """Default ``dropna`` drops rows with any NaN; restricting ``subset``
        to a fully populated column keeps every row."""
        N = len(self.frame.index)
        mat = np.random.randn(N)
        mat[:5] = np.nan

        frame = DataFrame({'foo': mat}, index=self.frame.index)
        frame['bar'] = 5
        original = Series(mat, index=self.frame.index, name='foo')
        inp_frame1, inp_frame2 = frame.copy(), frame.copy()

        smaller_frame = frame.dropna()
        assert_series_equal(frame['foo'], original)
        inp_frame1.dropna(inplace=True)

        exp = Series(mat[5:], index=self.frame.index[5:], name='foo')
        tm.assert_series_equal(smaller_frame['foo'], exp)
        tm.assert_series_equal(inp_frame1['foo'], exp)

        samesize_frame = frame.dropna(subset=['bar'])
        assert_series_equal(frame['foo'], original)
        assert (frame['bar'] == 5).all()
        inp_frame2.dropna(subset=['bar'], inplace=True)
        tm.assert_index_equal(samesize_frame.index, self.frame.index)
        tm.assert_index_equal(inp_frame2.index, self.frame.index)

    def test_dropna(self):
        """Exercise ``dropna`` along both axes with ``thresh``, ``subset``
        and ``how='all'``, plus an invalid ``axis``."""
        df = DataFrame(np.random.randn(6, 4))
        # Set the NaNs with a single positional indexer rather than chained
        # assignment, so the write cannot land on a temporary copy.
        df.iloc[:2, 2] = np.nan

        dropped = df.dropna(axis=1)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        inp.dropna(axis=1, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=0)
        expected = df.loc[lrange(2, 6)]
        inp = df.copy()
        inp.dropna(axis=0, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        # threshold
        dropped = df.dropna(axis=1, thresh=5)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        inp.dropna(axis=1, thresh=5, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=0, thresh=4)
        expected = df.loc[lrange(2, 6)]
        inp = df.copy()
        inp.dropna(axis=0, thresh=4, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=1, thresh=4)
        assert_frame_equal(dropped, df)

        dropped = df.dropna(axis=1, thresh=3)
        assert_frame_equal(dropped, df)

        # subset
        dropped = df.dropna(axis=0, subset=[0, 1, 3])
        inp = df.copy()
        inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
        assert_frame_equal(dropped, df)
        assert_frame_equal(inp, df)

        # all
        dropped = df.dropna(axis=1, how='all')
        assert_frame_equal(dropped, df)

        df[2] = np.nan
        dropped = df.dropna(axis=1, how='all')
        expected = df.loc[:, [0, 1, 3]]
        assert_frame_equal(dropped, expected)

        # bad input
        with pytest.raises(ValueError):
            df.dropna(axis=3)

    def test_drop_and_dropna_caching(self):
        """In-place ``dropna`` / ``drop`` on a column obtained via
        ``df['A']`` must be visible on the next ``df['A']`` access."""
        # tst that cacher updates
        original = Series([1, 2, np.nan], name='A')
        expected = Series([1, 2], dtype=original.dtype, name='A')
        df = pd.DataFrame({'A': original.values.copy()})
        df2 = df.copy()

        # The chained ``df['A'].<op>`` calls below are deliberate: they are
        # the behaviour under test.
        df['A'].dropna()
        assert_series_equal(df['A'], original)
        df['A'].dropna(inplace=True)
        assert_series_equal(df['A'], expected)

        df2['A'].drop([1])
        assert_series_equal(df2['A'], original)
        df2['A'].drop([1], inplace=True)
        assert_series_equal(df2['A'], original.drop([1]))

    def test_dropna_corner(self):
        """Invalid ``how`` values and unknown ``subset`` labels raise."""
        # bad input
        with pytest.raises(ValueError):
            self.frame.dropna(how='foo')
        with pytest.raises(TypeError):
            self.frame.dropna(how=None)
        # non-existent column - 8303
        with pytest.raises(KeyError):
            self.frame.dropna(subset=['A', 'X'])

    def test_dropna_multiple_axes(self):
        """Passing a list/tuple of axes emits ``FutureWarning`` and matches
        dropping along each axis in turn."""
        df = DataFrame([[1, np.nan, 2, 3],
                        [4, np.nan, 5, 6],
                        [np.nan, np.nan, np.nan, np.nan],
                        [7, np.nan, 8, 9]])
        cp = df.copy()

        # GH20987
        with tm.assert_produces_warning(FutureWarning):
            result = df.dropna(how='all', axis=[0, 1])
        with tm.assert_produces_warning(FutureWarning):
            result2 = df.dropna(how='all', axis=(0, 1))
        expected = df.dropna(how='all').dropna(how='all', axis=1)

        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)
        assert_frame_equal(df, cp)

        inp = df.copy()
        with tm.assert_produces_warning(FutureWarning):
            inp.dropna(how='all', axis=(0, 1), inplace=True)
        assert_frame_equal(inp, expected)

    def test_dropna_tz_aware_datetime(self):
        """``dropna`` on a column of tz-aware datetimes."""
        # GH13407
        df = DataFrame()
        dt1 = datetime.datetime(2015, 1, 1,
                                tzinfo=dateutil.tz.tzutc())
        dt2 = datetime.datetime(2015, 2, 2,
                                tzinfo=dateutil.tz.tzutc())
        df['Time'] = [dt1]
        result = df.dropna(axis=0)
        expected = DataFrame({'Time': [dt1]})
        assert_frame_equal(result, expected)

        # Ex2
        df = DataFrame({'Time': [dt1, None, np.nan, dt2]})
        result = df.dropna(axis=0)
        expected = DataFrame([dt1, dt2],
                             columns=['Time'],
                             index=[0, 3])
        assert_frame_equal(result, expected)

    def test_fillna(self):
        """Broad ``fillna`` coverage: scalar / method fills, mixed dtypes,
        empty frames, dict values, ``limit``, datelike and tz-aware data."""
        tf = self.tsframe
        tf.loc[tf.index[:5], 'A'] = np.nan
        tf.loc[tf.index[-5:], 'A'] = np.nan

        zero_filled = self.tsframe.fillna(0)
        assert (zero_filled.loc[zero_filled.index[:5], 'A'] == 0).all()

        padded = self.tsframe.fillna(method='pad')
        assert np.isnan(padded.loc[padded.index[:5], 'A']).all()
        assert (padded.loc[padded.index[-5:], 'A'] ==
                padded.loc[padded.index[-5], 'A']).all()

        # mixed type
        mf = self.mixed_frame
        mf.loc[mf.index[5:20], 'foo'] = np.nan
        mf.loc[mf.index[-10:], 'A'] = np.nan
        # smoke tests only: the results are not inspected
        self.mixed_frame.fillna(value=0)
        self.mixed_frame.fillna(method='pad')

        with pytest.raises(ValueError):
            self.tsframe.fillna()
        with pytest.raises(ValueError):
            self.tsframe.fillna(5, method='ffill')

        # mixed numeric (but no float16)
        mf = self.mixed_float.reindex(columns=['A', 'B', 'D'])
        mf.loc[mf.index[-10:], 'A'] = np.nan
        result = mf.fillna(value=0)
        _check_mixed_float(result, dtype=dict(C=None))

        result = mf.fillna(method='pad')
        _check_mixed_float(result, dtype=dict(C=None))

        # empty frame (GH #2778)
        df = DataFrame(columns=['x'])
        for m in ['pad', 'backfill']:
            df.x.fillna(method=m, inplace=True)
            df.x.fillna(method=m)

        # with different dtype (GH3386)
        df = DataFrame([['a', 'a', np.nan, 'a'], [
            'b', 'b', np.nan, 'b'], ['c', 'c', np.nan, 'c']])

        result = df.fillna({2: 'foo'})
        expected = DataFrame([['a', 'a', 'foo', 'a'],
                              ['b', 'b', 'foo', 'b'],
                              ['c', 'c', 'foo', 'c']])
        assert_frame_equal(result, expected)

        df.fillna({2: 'foo'}, inplace=True)
        assert_frame_equal(df, expected)

        # limit and value
        df = DataFrame(np.random.randn(10, 3))
        df.iloc[2:7, 0] = np.nan
        df.iloc[3:5, 2] = np.nan

        expected = df.copy()
        expected.iloc[2, 0] = 999
        expected.iloc[3, 2] = 999
        result = df.fillna(999, limit=1)
        assert_frame_equal(result, expected)

        # with datelike
        # GH 6344
        df = DataFrame({
            'Date': [pd.NaT, Timestamp("2014-1-1")],
            'Date2': [Timestamp("2013-1-1"), pd.NaT]
        })

        expected = df.copy()
        expected['Date'] = expected['Date'].fillna(
            df.loc[df.index[0], 'Date2'])
        result = df.fillna(value={'Date': df['Date2']})
        assert_frame_equal(result, expected)

        # with timezone
        # GH 15855
        df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
                                 pd.NaT]})
        exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
                                  pd.Timestamp('2012-11-11 00:00:00+01:00')]})
        assert_frame_equal(df.fillna(method='pad'), exp)

        df = pd.DataFrame({'A': [pd.NaT,
                                 pd.Timestamp('2012-11-11 00:00:00+01:00')]})
        exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
                                  pd.Timestamp('2012-11-11 00:00:00+01:00')]})
        assert_frame_equal(df.fillna(method='bfill'), exp)

        # with timezone in another column
        # GH 15522
        df = pd.DataFrame({'A': pd.date_range('20130101', periods=4,
                                              tz='US/Eastern'),
                           'B': [1, 2, np.nan, np.nan]})
        result = df.fillna(method='pad')
        expected = pd.DataFrame({'A': pd.date_range('20130101', periods=4,
                                                    tz='US/Eastern'),
                                 'B': [1., 2., 2., 2.]})
        assert_frame_equal(result, expected)

    def test_na_actions_categorical(self):
        """``fillna`` and ``dropna`` on a frame holding a categorical
        column, including a fill value outside the categories."""
        cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
        vals = ["a", "b", np.nan, "d"]
        df = DataFrame({"cats": cat, "vals": vals})
        cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
        vals2 = ["a", "b", "b", "d"]
        df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
        cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
        vals3 = ["a", "b", np.nan]
        df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
        cat4 = Categorical([1, 2], categories=[1, 2, 3])
        vals4 = ["a", "b"]
        df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})

        # fillna
        res = df.fillna(value={"cats": 3, "vals": "b"})
        tm.assert_frame_equal(res, df_exp_fill)

        with pytest.raises(ValueError, match=("fill value must "
                                              "be in categories")):
            df.fillna(value={"cats": 4, "vals": "c"})

        res = df.fillna(method='pad')
        tm.assert_frame_equal(res, df_exp_fill)

        # dropna
        res = df.dropna(subset=["cats"])
        tm.assert_frame_equal(res, df_exp_drop_cats)

        res = df.dropna()
        tm.assert_frame_equal(res, df_exp_drop_all)

        # make sure that fillna takes missing values into account
        c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
        df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})

        cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
        df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})

        res = df.fillna("a")
        tm.assert_frame_equal(res, df_exp)

    def test_fillna_categorical_nan(self):
        """NaN / NaT are accepted as fill values for categorical columns."""
        # GH 14021
        # np.nan should always be a valid filler
        cat = Categorical([np.nan, 2, np.nan])
        val = Categorical([np.nan, np.nan, np.nan])
        df = DataFrame({"cats": cat, "vals": val})
        res = df.fillna(df.median())
        v_exp = [np.nan, np.nan, np.nan]
        df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
                           dtype='category')
        tm.assert_frame_equal(res, df_exp)

        result = df.cats.fillna(np.nan)
        tm.assert_series_equal(result, df.cats)
        result = df.vals.fillna(np.nan)
        tm.assert_series_equal(result, df.vals)

        idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
                                '2011-01-01 09:00', pd.NaT, pd.NaT])
        df = DataFrame({'a': Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

        idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
                              pd.NaT, pd.NaT], freq='M')
        df = DataFrame({'a': Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

        idx = pd.TimedeltaIndex(['1 days', '2 days',
                                 '1 days', pd.NaT, pd.NaT])
        df = DataFrame({'a': Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

    def test_fillna_downcast(self):
        """``downcast='infer'`` yields an integer column after filling,
        for both scalar and dict fill values."""
        # GH 15277
        # infer int64 from float64
        df = pd.DataFrame({'a': [1., np.nan]})
        result = df.fillna(0, downcast='infer')
        expected = pd.DataFrame({'a': [1, 0]})
        assert_frame_equal(result, expected)

        # infer int64 from float64 when fillna value is a dict
        df = pd.DataFrame({'a': [1., np.nan]})
        result = df.fillna({'a': 0}, downcast='infer')
        expected = pd.DataFrame({'a': [1, 0]})
        assert_frame_equal(result, expected)

    def test_fillna_dtype_conversion(self):
        """Dtypes produced by ``fillna`` on empty / all-NaN frames, and
        equivalence with ``replace(np.nan, v)``."""
        # make sure that fillna on an empty frame works
        df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
        result = df.get_dtype_counts().sort_values()
        expected = Series({'object': 5})
        assert_series_equal(result, expected)

        result = df.fillna(1)
        expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
        # Compare the filled frame itself; previously this expected frame
        # was built and then overwritten without ever being checked.
        assert_frame_equal(result, expected)
        result = result.get_dtype_counts().sort_values()
        expected = Series({'int64': 5})
        assert_series_equal(result, expected)

        # empty block
        df = DataFrame(index=lrange(3), columns=['A', 'B'], dtype='float64')
        result = df.fillna('nan')
        expected = DataFrame('nan', index=lrange(3), columns=['A', 'B'])
        assert_frame_equal(result, expected)

        # equiv of replace
        df = DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
        for v in ['', 1, np.nan, 1.0]:
            expected = df.replace(np.nan, v)
            result = df.fillna(v)
            assert_frame_equal(result, expected)

    def test_fillna_datetime_columns(self):
        """Filling with a string on frames that contain datetime columns."""
        # GH 7095
        df = pd.DataFrame({'A': [-1, -2, np.nan],
                           'B': date_range('20130101', periods=3),
                           'C': ['foo', 'bar', None],
                           'D': ['foo2', 'bar2', None]},
                          index=date_range('20130110', periods=3))
        result = df.fillna('?')
        expected = pd.DataFrame({'A': [-1, -2, '?'],
                                 'B': date_range('20130101', periods=3),
                                 'C': ['foo', 'bar', '?'],
                                 'D': ['foo2', 'bar2', '?']},
                                index=date_range('20130110', periods=3))
        tm.assert_frame_equal(result, expected)

        df = pd.DataFrame({'A': [-1, -2, np.nan],
                           'B': [pd.Timestamp('2013-01-01'),
                                 pd.Timestamp('2013-01-02'), pd.NaT],
                           'C': ['foo', 'bar', None],
                           'D': ['foo2', 'bar2', None]},
                          index=date_range('20130110', periods=3))
        result = df.fillna('?')
        expected = pd.DataFrame({'A': [-1, -2, '?'],
                                 'B': [pd.Timestamp('2013-01-01'),
                                       pd.Timestamp('2013-01-02'), '?'],
                                 'C': ['foo', 'bar', '?'],
                                 'D': ['foo2', 'bar2', '?']},
                                index=pd.date_range('20130110', periods=3))
        tm.assert_frame_equal(result, expected)

    def test_ffill(self):
        """``ffill()`` matches ``fillna(method='ffill')``."""
        tf = self.tsframe
        # single .loc assignment instead of chained ``tf['A'][:5] = ...``
        tf.loc[tf.index[:5], 'A'] = np.nan
        tf.loc[tf.index[-5:], 'A'] = np.nan

        assert_frame_equal(self.tsframe.ffill(),
                           self.tsframe.fillna(method='ffill'))

    def test_bfill(self):
        """``bfill()`` matches ``fillna(method='bfill')``."""
        tf = self.tsframe
        # single .loc assignment instead of chained ``tf['A'][:5] = ...``
        tf.loc[tf.index[:5], 'A'] = np.nan
        tf.loc[tf.index[-5:], 'A'] = np.nan

        assert_frame_equal(self.tsframe.bfill(),
                           self.tsframe.fillna(method='bfill'))

    def test_frame_pad_backfill_limit(self):
        """``reindex(..., method=..., limit=5)`` fills at most five rows."""
        index = np.arange(10)
        df = DataFrame(np.random.randn(10, 4), index=index)

        result = df[:2].reindex(index, method='pad', limit=5)

        expected = df[:2].reindex(index).fillna(method='pad')
        # assign through the frame rather than through ``.values``
        expected.iloc[-3:] = np.nan
        tm.assert_frame_equal(result, expected)

        result = df[-2:].reindex(index, method='backfill', limit=5)

        expected = df[-2:].reindex(index).fillna(method='backfill')
        expected.iloc[:3] = np.nan
        tm.assert_frame_equal(result, expected)

    def test_frame_fillna_limit(self):
        """``fillna(method=..., limit=5)`` fills at most five rows."""
        index = np.arange(10)
        df = DataFrame(np.random.randn(10, 4), index=index)

        result = df[:2].reindex(index)
        result = result.fillna(method='pad', limit=5)

        expected = df[:2].reindex(index).fillna(method='pad')
        # assign through the frame rather than through ``.values``
        expected.iloc[-3:] = np.nan
        tm.assert_frame_equal(result, expected)

        result = df[-2:].reindex(index)
        result = result.fillna(method='backfill', limit=5)

        expected = df[-2:].reindex(index).fillna(method='backfill')
        expected.iloc[:3] = np.nan
        tm.assert_frame_equal(result, expected)

    def test_fillna_skip_certain_blocks(self):
        """Smoke test: ``fillna`` on an all-integer frame does not raise."""
        # don't try to fill boolean, int blocks

        df = DataFrame(np.random.randn(10, 4).astype(int))

        # it works!
        df.fillna(np.nan)

    def test_fillna_inplace(self):
        """``inplace=True`` mutates the frame and returns ``None``;
        the default returns a new object."""
        df = DataFrame(np.random.randn(10, 4))
        # positional assignment instead of chained ``df[1][:4] = ...``
        df.iloc[:4, 1] = np.nan
        df.iloc[-4:, 3] = np.nan

        expected = df.fillna(value=0)
        assert expected is not df

        df.fillna(value=0, inplace=True)
        tm.assert_frame_equal(df, expected)

        expected = df.fillna(value={0: 0}, inplace=True)
        assert expected is None

        df.iloc[:4, 1] = np.nan
        df.iloc[-4:, 3] = np.nan
        expected = df.fillna(method='ffill')
        assert expected is not df

        df.fillna(method='ffill', inplace=True)
        tm.assert_frame_equal(df, expected)

    def test_fillna_dict_series(self):
        """Dict and Series fill values are applied column by column;
        a Series with ``axis=1`` raises ``NotImplementedError``."""
        df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan],
                        'b': [1, 2, 3, np.nan, np.nan],
                        'c': [np.nan, 1, 2, 3, 4]})

        result = df.fillna({'a': 0, 'b': 5})

        expected = df.copy()
        expected['a'] = expected['a'].fillna(0)
        expected['b'] = expected['b'].fillna(5)
        assert_frame_equal(result, expected)

        # it works (smoke test: a key that is not a column is accepted)
        df.fillna({'a': 0, 'b': 5, 'd': 7})

        # Series treated same as dict
        result = df.fillna(df.max())
        expected = df.fillna(df.max().to_dict())
        assert_frame_equal(result, expected)

        # disable this for now
        with pytest.raises(NotImplementedError, match='column by column'):
            df.fillna(df.max(1), axis=1)

    def test_fillna_dataframe(self):
        """Filling from another DataFrame only touches shared labels."""
        # GH 8377
        df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan],
                        'b': [1, 2, 3, np.nan, np.nan],
                        'c': [np.nan, 1, 2, 3, 4]},
                       index=list('VWXYZ'))

        # df2 may have different index and columns
        df2 = DataFrame({'a': [np.nan, 10, 20, 30, 40],
                         'b': [50, 60, 70, 80, 90],
                         'foo': ['bar'] * 5},
                        index=list('VWXuZ'))

        result = df.fillna(df2)

        # only those columns and indices which are shared get filled
        expected = DataFrame({'a': [np.nan, 1, 2, np.nan, 40],
                              'b': [1, 2, 3, np.nan, 90],
                              'c': [np.nan, 1, 2, 3, 4]},
                             index=list('VWXYZ'))

        assert_frame_equal(result, expected)

    def test_fillna_columns(self):
        """``fillna(method='ffill', axis=1)`` fills along columns."""
        df = DataFrame(np.random.randn(10, 10))
        # assign through the frame rather than through ``.values``
        df.iloc[:, ::2] = np.nan

        result = df.fillna(method='ffill', axis=1)
        expected = df.T.fillna(method='pad').T
        assert_frame_equal(result, expected)

        df.insert(6, 'foo', 5)
        result = df.fillna(method='ffill', axis=1)
        expected = df.astype(float).fillna(method='ffill', axis=1)
        assert_frame_equal(result, expected)

    def test_fillna_invalid_method(self):
        """An unknown ``method`` raises ``ValueError`` naming it."""
        with pytest.raises(ValueError, match='ffil'):
            self.frame.fillna(method='ffil')

    def test_fillna_invalid_value(self):
        """List, tuple and (for a Series) DataFrame fill values raise
        ``TypeError``."""
        # list
        with pytest.raises(TypeError):
            self.frame.fillna([1, 2])
        # tuple
        with pytest.raises(TypeError):
            self.frame.fillna((1, 2))
        # frame with series
        first_column = self.frame.iloc[:, 0]
        with pytest.raises(TypeError):
            first_column.fillna(self.frame)

    def test_fillna_col_reordering(self):
        """``fillna`` keeps the original column order."""
        cols = ["COL." + str(i) for i in range(5, 0, -1)]
        data = np.random.rand(20, 5)
        df = DataFrame(index=lrange(20), columns=cols, data=data)
        filled = df.fillna(method='ffill')
        assert df.columns.tolist() == filled.columns.tolist()

    def test_fill_corner(self):
        """Scalar fill on the mixed-dtype frame, plus a smoke test on a
        frame with no columns."""
        mf = self.mixed_frame
        mf.loc[mf.index[5:20], 'foo'] = np.nan
        mf.loc[mf.index[-10:], 'A'] = np.nan

        filled = self.mixed_frame.fillna(value=0)
        assert (filled.loc[filled.index[5:20], 'foo'] == 0).all()
        del self.mixed_frame['foo']

        empty_float = self.frame.reindex(columns=[])

        # TODO(wesm): unused?
        # smoke test only: the result is not inspected
        empty_float.fillna(value=0)

    def test_fill_value_when_combine_const(self):
        """``add(2, fill_value=0)`` equals ``fillna(0).add(2)``."""
        # GH12723
        dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
        df = DataFrame({'foo': dat}, index=range(6))

        exp = df.fillna(0).add(2)
        res = df.add(2, fill_value=0)
        assert_frame_equal(res, exp)
class TestDataFrameInterpolate(TestData):
    """Tests for ``DataFrame.interpolate`` and ``Series.interpolate``."""

    def test_interp_basic(self):
        """Default interpolation fills numeric NaNs and leaves the int and
        string columns unchanged, with and without a custom index."""
        df = DataFrame({'A': [1, 2, np.nan, 4],
                        'B': [1, 4, 9, np.nan],
                        'C': [1, 2, 3, 5],
                        'D': list('abcd')})
        expected = DataFrame({'A': [1., 2., 3., 4.],
                              'B': [1., 4., 9., 9.],
                              'C': [1, 2, 3, 5],
                              'D': list('abcd')})
        result = df.interpolate()
        assert_frame_equal(result, expected)

        result = df.set_index('C').interpolate()
        expected = df.set_index('C')
        expected.loc[3, 'A'] = 3
        expected.loc[5, 'B'] = 9
        assert_frame_equal(result, expected)

    def test_interp_bad_method(self):
        """An unknown ``method`` raises ``ValueError``."""
        df = DataFrame({'A': [1, 2, np.nan, 4],
                        'B': [1, 4, 9, np.nan],
                        'C': [1, 2, 3, 5],
                        'D': list('abcd')})
        with pytest.raises(ValueError):
            df.interpolate(method='not_a_method')

    def test_interp_combo(self):
        """Series interpolation, with and without ``downcast='infer'``."""
        df = DataFrame({'A': [1., 2., np.nan, 4.],
                        'B': [1, 4, 9, np.nan],
                        'C': [1, 2, 3, 5],
                        'D': list('abcd')})

        result = df['A'].interpolate()
        expected = Series([1., 2., 3., 4.], name='A')
        assert_series_equal(result, expected)

        result = df['A'].interpolate(downcast='infer')
        expected = Series([1, 2, 3, 4], name='A')
        assert_series_equal(result, expected)

    def test_interp_nan_idx(self):
        """``method='values'`` with NaN in the index raises
        ``NotImplementedError``."""
        df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]})
        df = df.set_index('A')
        with pytest.raises(NotImplementedError):
            df.interpolate(method='values')

    @td.skip_if_no_scipy
    def test_interp_various(self):
        """Check several scipy-backed interpolation methods against
        hard-coded expected values."""
        df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                        'C': [1, 2, 3, 5, 8, 13, 21]})
        df = df.set_index('C')
        expected = df.copy()
        result = df.interpolate(method='polynomial', order=1)

        # Expected values are set with a single ``.loc[row, col]`` call
        # rather than chained ``expected.A.loc[row] = ...`` so the write
        # always reaches ``expected`` itself.
        expected.loc[3, 'A'] = 2.66666667
        expected.loc[13, 'A'] = 5.76923076
        assert_frame_equal(result, expected)

        result = df.interpolate(method='cubic')
        # GH #15662.
        # new cubic and quadratic interpolation algorithms from scipy 0.19.0.
        # previously `splmake` was used. See scipy/scipy#6710
        if _is_scipy_ge_0190:
            expected.loc[3, 'A'] = 2.81547781
            expected.loc[13, 'A'] = 5.52964175
        else:
            expected.loc[3, 'A'] = 2.81621174
            expected.loc[13, 'A'] = 5.64146581
        assert_frame_equal(result, expected)

        result = df.interpolate(method='nearest')
        expected.loc[3, 'A'] = 2
        expected.loc[13, 'A'] = 5
        assert_frame_equal(result, expected, check_dtype=False)

        result = df.interpolate(method='quadratic')
        if _is_scipy_ge_0190:
            expected.loc[3, 'A'] = 2.82150771
            expected.loc[13, 'A'] = 6.12648668
        else:
            expected.loc[3, 'A'] = 2.82533638
            expected.loc[13, 'A'] = 6.02817974
        assert_frame_equal(result, expected)

        result = df.interpolate(method='slinear')
        expected.loc[3, 'A'] = 2.66666667
        expected.loc[13, 'A'] = 5.76923077
        assert_frame_equal(result, expected)

        result = df.interpolate(method='zero')
        expected.loc[3, 'A'] = 2.
        expected.loc[13, 'A'] = 5
        assert_frame_equal(result, expected, check_dtype=False)

    @td.skip_if_no_scipy
    def test_interp_alt_scipy(self):
        """Check the 'barycentric', 'krogh' and 'pchip' methods."""
        df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                        'C': [1, 2, 3, 5, 8, 13, 21]})
        result = df.interpolate(method='barycentric')
        expected = df.copy()
        expected.loc[2, 'A'] = 3
        expected.loc[5, 'A'] = 6
        assert_frame_equal(result, expected)

        result = df.interpolate(method='barycentric', downcast='infer')
        assert_frame_equal(result, expected.astype(np.int64))

        result = df.interpolate(method='krogh')
        expectedk = df.copy()
        expectedk['A'] = expected['A']
        assert_frame_equal(result, expectedk)

        # the remainder of the test needs scipy.interpolate.pchip_interpolate
        _skip_if_no_pchip()
        import scipy
        result = df.interpolate(method='pchip')
        expected.loc[2, 'A'] = 3

        if LooseVersion(scipy.__version__) >= LooseVersion('0.17.0'):
            expected.loc[5, 'A'] = 6.0
        else:
            expected.loc[5, 'A'] = 6.125

        assert_frame_equal(result, expected)

    def test_interp_rowwise(self):
        """Interpolation along ``axis=1``, and ``axis=0`` matching the
        default."""
        df = DataFrame({0: [1, 2, np.nan, 4],
                        1: [2, 3, 4, np.nan],
                        2: [np.nan, 4, 5, 6],
                        3: [4, np.nan, 6, 7],
                        4: [1, 2, 3, 4]})
        result = df.interpolate(axis=1)
        expected = df.copy()
        expected.loc[3, 1] = 5
        expected.loc[0, 2] = 3
        expected.loc[1, 3] = 3
        expected[4] = expected[4].astype(np.float64)
        assert_frame_equal(result, expected)

        result = df.interpolate(axis=1, method='values')
        assert_frame_equal(result, expected)

        result = df.interpolate(axis=0)
        expected = df.interpolate()
        assert_frame_equal(result, expected)

    def test_rowwise_alt(self):
        """Smoke test: ``interpolate(axis=0)`` runs on this frame."""
        df = DataFrame({0: [0, .5, 1., np.nan, 4, 8, np.nan, np.nan, 64],
                        1: [1, 2, 3, 4, 3, 2, 1, 0, -1]})
        df.interpolate(axis=0)

    @pytest.mark.parametrize("check_scipy", [
        False, pytest.param(True, marks=td.skip_if_no_scipy)
    ])
    def test_interp_leading_nans(self, check_scipy):
        """Leading NaNs are left as NaN; interior NaNs are interpolated."""
        df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0],
                        "B": [np.nan, -3, -3.5, np.nan, -4]})
        result = df.interpolate()
        expected = df.copy()
        # single .loc assignment instead of chained ``expected['B'].loc[3]``
        expected.loc[3, 'B'] = -3.75
        assert_frame_equal(result, expected)

        if check_scipy:
            result = df.interpolate(method='polynomial', order=1)
            assert_frame_equal(result, expected)

    def test_interp_raise_on_only_mixed(self):
        """Row-wise interpolation on a frame with a string column raises
        ``TypeError``."""
        df = DataFrame({'A': [1, 2, np.nan, 4],
                        'B': ['a', 'b', 'c', 'd'],
                        'C': [np.nan, 2, 5, 7],
                        'D': [np.nan, np.nan, 9, 9],
                        'E': [1, 2, 3, 4]})
        with pytest.raises(TypeError):
            df.interpolate(axis=1)

    def test_interp_raise_on_all_object_dtype(self):
        """An all-object-dtype frame raises ``TypeError`` with a specific
        message."""
        # GH 22985
        df = DataFrame({
            'A': [1, 2, 3],
            'B': [4, 5, 6]},
            dtype='object')
        msg = ("Cannot interpolate with all object-dtype columns "
               "in the DataFrame. Try setting at least one "
               "column to a numeric dtype.")
        with pytest.raises(TypeError, match=msg):
            df.interpolate()

    def test_interp_inplace(self):
        """In-place interpolation of a column obtained via ``result['a']``
        is reflected in the parent frame."""
        df = DataFrame({'a': [1., 2., np.nan, 4.]})
        expected = DataFrame({'a': [1., 2., 3., 4.]})
        result = df.copy()
        # the chained ``result['a'].interpolate(inplace=True)`` is the
        # behaviour under test
        result['a'].interpolate(inplace=True)
        assert_frame_equal(result, expected)

        result = df.copy()
        result['a'].interpolate(inplace=True, downcast='infer')
        assert_frame_equal(result, expected.astype('int64'))

    def test_interp_inplace_row(self):
        """Row-wise interpolation gives the same result with
        ``inplace=True`` and ``inplace=False``."""
        # GH 10395
        result = DataFrame({'a': [1., 2., 3., 4.],
                            'b': [np.nan, 2., 3., 4.],
                            'c': [3, 2, 2, 2]})
        expected = result.interpolate(method='linear', axis=1, inplace=False)
        result.interpolate(method='linear', axis=1, inplace=True)
        assert_frame_equal(result, expected)

    def test_interp_ignore_all_good(self):
        """Columns without NaN keep their dtype; a frame without any NaN is
        returned unchanged."""
        df = DataFrame({'A': [1, 2, np.nan, 4],
                        'B': [1, 2, 3, 4],
                        'C': [1., 2., np.nan, 4.],
                        'D': [1., 2., 3., 4.]})
        expected = DataFrame({'A': np.array(
            [1, 2, 3, 4], dtype='float64'),
            'B': np.array(
            [1, 2, 3, 4], dtype='int64'),
            'C': np.array(
            [1., 2., 3, 4.], dtype='float64'),
            'D': np.array(
            [1., 2., 3., 4.], dtype='float64')})

        result = df.interpolate(downcast=None)
        assert_frame_equal(result, expected)

        # all good
        result = df[['B', 'D']].interpolate(downcast=None)
        assert_frame_equal(result, df[['B', 'D']])
|