# test_missing.py (31 KB)
  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function
  3. import datetime
  4. from distutils.version import LooseVersion
  5. import dateutil
  6. import numpy as np
  7. import pytest
  8. from pandas.compat import lrange
  9. import pandas.util._test_decorators as td
  10. import pandas as pd
  11. from pandas import Categorical, DataFrame, Series, Timestamp, date_range
  12. from pandas.tests.frame.common import TestData, _check_mixed_float
  13. import pandas.util.testing as tm
  14. from pandas.util.testing import assert_frame_equal, assert_series_equal
  15. try:
  16. import scipy
  17. _is_scipy_ge_0190 = (LooseVersion(scipy.__version__) >=
  18. LooseVersion('0.19.0'))
  19. except ImportError:
  20. _is_scipy_ge_0190 = False
  21. def _skip_if_no_pchip():
  22. try:
  23. from scipy.interpolate import pchip_interpolate # noqa
  24. except ImportError:
  25. import pytest
  26. pytest.skip('scipy.interpolate.pchip missing')
  27. class TestDataFrameMissingData(TestData):
  28. def test_dropEmptyRows(self):
  29. N = len(self.frame.index)
  30. mat = np.random.randn(N)
  31. mat[:5] = np.nan
  32. frame = DataFrame({'foo': mat}, index=self.frame.index)
  33. original = Series(mat, index=self.frame.index, name='foo')
  34. expected = original.dropna()
  35. inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()
  36. smaller_frame = frame.dropna(how='all')
  37. # check that original was preserved
  38. assert_series_equal(frame['foo'], original)
  39. inplace_frame1.dropna(how='all', inplace=True)
  40. assert_series_equal(smaller_frame['foo'], expected)
  41. assert_series_equal(inplace_frame1['foo'], expected)
  42. smaller_frame = frame.dropna(how='all', subset=['foo'])
  43. inplace_frame2.dropna(how='all', subset=['foo'], inplace=True)
  44. assert_series_equal(smaller_frame['foo'], expected)
  45. assert_series_equal(inplace_frame2['foo'], expected)
  46. def test_dropIncompleteRows(self):
  47. N = len(self.frame.index)
  48. mat = np.random.randn(N)
  49. mat[:5] = np.nan
  50. frame = DataFrame({'foo': mat}, index=self.frame.index)
  51. frame['bar'] = 5
  52. original = Series(mat, index=self.frame.index, name='foo')
  53. inp_frame1, inp_frame2 = frame.copy(), frame.copy()
  54. smaller_frame = frame.dropna()
  55. assert_series_equal(frame['foo'], original)
  56. inp_frame1.dropna(inplace=True)
  57. exp = Series(mat[5:], index=self.frame.index[5:], name='foo')
  58. tm.assert_series_equal(smaller_frame['foo'], exp)
  59. tm.assert_series_equal(inp_frame1['foo'], exp)
  60. samesize_frame = frame.dropna(subset=['bar'])
  61. assert_series_equal(frame['foo'], original)
  62. assert (frame['bar'] == 5).all()
  63. inp_frame2.dropna(subset=['bar'], inplace=True)
  64. tm.assert_index_equal(samesize_frame.index, self.frame.index)
  65. tm.assert_index_equal(inp_frame2.index, self.frame.index)
  66. def test_dropna(self):
  67. df = DataFrame(np.random.randn(6, 4))
  68. df[2][:2] = np.nan
  69. dropped = df.dropna(axis=1)
  70. expected = df.loc[:, [0, 1, 3]]
  71. inp = df.copy()
  72. inp.dropna(axis=1, inplace=True)
  73. assert_frame_equal(dropped, expected)
  74. assert_frame_equal(inp, expected)
  75. dropped = df.dropna(axis=0)
  76. expected = df.loc[lrange(2, 6)]
  77. inp = df.copy()
  78. inp.dropna(axis=0, inplace=True)
  79. assert_frame_equal(dropped, expected)
  80. assert_frame_equal(inp, expected)
  81. # threshold
  82. dropped = df.dropna(axis=1, thresh=5)
  83. expected = df.loc[:, [0, 1, 3]]
  84. inp = df.copy()
  85. inp.dropna(axis=1, thresh=5, inplace=True)
  86. assert_frame_equal(dropped, expected)
  87. assert_frame_equal(inp, expected)
  88. dropped = df.dropna(axis=0, thresh=4)
  89. expected = df.loc[lrange(2, 6)]
  90. inp = df.copy()
  91. inp.dropna(axis=0, thresh=4, inplace=True)
  92. assert_frame_equal(dropped, expected)
  93. assert_frame_equal(inp, expected)
  94. dropped = df.dropna(axis=1, thresh=4)
  95. assert_frame_equal(dropped, df)
  96. dropped = df.dropna(axis=1, thresh=3)
  97. assert_frame_equal(dropped, df)
  98. # subset
  99. dropped = df.dropna(axis=0, subset=[0, 1, 3])
  100. inp = df.copy()
  101. inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
  102. assert_frame_equal(dropped, df)
  103. assert_frame_equal(inp, df)
  104. # all
  105. dropped = df.dropna(axis=1, how='all')
  106. assert_frame_equal(dropped, df)
  107. df[2] = np.nan
  108. dropped = df.dropna(axis=1, how='all')
  109. expected = df.loc[:, [0, 1, 3]]
  110. assert_frame_equal(dropped, expected)
  111. # bad input
  112. pytest.raises(ValueError, df.dropna, axis=3)
  113. def test_drop_and_dropna_caching(self):
  114. # tst that cacher updates
  115. original = Series([1, 2, np.nan], name='A')
  116. expected = Series([1, 2], dtype=original.dtype, name='A')
  117. df = pd.DataFrame({'A': original.values.copy()})
  118. df2 = df.copy()
  119. df['A'].dropna()
  120. assert_series_equal(df['A'], original)
  121. df['A'].dropna(inplace=True)
  122. assert_series_equal(df['A'], expected)
  123. df2['A'].drop([1])
  124. assert_series_equal(df2['A'], original)
  125. df2['A'].drop([1], inplace=True)
  126. assert_series_equal(df2['A'], original.drop([1]))
  127. def test_dropna_corner(self):
  128. # bad input
  129. pytest.raises(ValueError, self.frame.dropna, how='foo')
  130. pytest.raises(TypeError, self.frame.dropna, how=None)
  131. # non-existent column - 8303
  132. pytest.raises(KeyError, self.frame.dropna, subset=['A', 'X'])
  133. def test_dropna_multiple_axes(self):
  134. df = DataFrame([[1, np.nan, 2, 3],
  135. [4, np.nan, 5, 6],
  136. [np.nan, np.nan, np.nan, np.nan],
  137. [7, np.nan, 8, 9]])
  138. cp = df.copy()
  139. # GH20987
  140. with tm.assert_produces_warning(FutureWarning):
  141. result = df.dropna(how='all', axis=[0, 1])
  142. with tm.assert_produces_warning(FutureWarning):
  143. result2 = df.dropna(how='all', axis=(0, 1))
  144. expected = df.dropna(how='all').dropna(how='all', axis=1)
  145. assert_frame_equal(result, expected)
  146. assert_frame_equal(result2, expected)
  147. assert_frame_equal(df, cp)
  148. inp = df.copy()
  149. with tm.assert_produces_warning(FutureWarning):
  150. inp.dropna(how='all', axis=(0, 1), inplace=True)
  151. assert_frame_equal(inp, expected)
  152. def test_dropna_tz_aware_datetime(self):
  153. # GH13407
  154. df = DataFrame()
  155. dt1 = datetime.datetime(2015, 1, 1,
  156. tzinfo=dateutil.tz.tzutc())
  157. dt2 = datetime.datetime(2015, 2, 2,
  158. tzinfo=dateutil.tz.tzutc())
  159. df['Time'] = [dt1]
  160. result = df.dropna(axis=0)
  161. expected = DataFrame({'Time': [dt1]})
  162. assert_frame_equal(result, expected)
  163. # Ex2
  164. df = DataFrame({'Time': [dt1, None, np.nan, dt2]})
  165. result = df.dropna(axis=0)
  166. expected = DataFrame([dt1, dt2],
  167. columns=['Time'],
  168. index=[0, 3])
  169. assert_frame_equal(result, expected)
  170. def test_fillna(self):
  171. tf = self.tsframe
  172. tf.loc[tf.index[:5], 'A'] = np.nan
  173. tf.loc[tf.index[-5:], 'A'] = np.nan
  174. zero_filled = self.tsframe.fillna(0)
  175. assert (zero_filled.loc[zero_filled.index[:5], 'A'] == 0).all()
  176. padded = self.tsframe.fillna(method='pad')
  177. assert np.isnan(padded.loc[padded.index[:5], 'A']).all()
  178. assert (padded.loc[padded.index[-5:], 'A'] ==
  179. padded.loc[padded.index[-5], 'A']).all()
  180. # mixed type
  181. mf = self.mixed_frame
  182. mf.loc[mf.index[5:20], 'foo'] = np.nan
  183. mf.loc[mf.index[-10:], 'A'] = np.nan
  184. result = self.mixed_frame.fillna(value=0)
  185. result = self.mixed_frame.fillna(method='pad')
  186. pytest.raises(ValueError, self.tsframe.fillna)
  187. pytest.raises(ValueError, self.tsframe.fillna, 5, method='ffill')
  188. # mixed numeric (but no float16)
  189. mf = self.mixed_float.reindex(columns=['A', 'B', 'D'])
  190. mf.loc[mf.index[-10:], 'A'] = np.nan
  191. result = mf.fillna(value=0)
  192. _check_mixed_float(result, dtype=dict(C=None))
  193. result = mf.fillna(method='pad')
  194. _check_mixed_float(result, dtype=dict(C=None))
  195. # empty frame (GH #2778)
  196. df = DataFrame(columns=['x'])
  197. for m in ['pad', 'backfill']:
  198. df.x.fillna(method=m, inplace=True)
  199. df.x.fillna(method=m)
  200. # with different dtype (GH3386)
  201. df = DataFrame([['a', 'a', np.nan, 'a'], [
  202. 'b', 'b', np.nan, 'b'], ['c', 'c', np.nan, 'c']])
  203. result = df.fillna({2: 'foo'})
  204. expected = DataFrame([['a', 'a', 'foo', 'a'],
  205. ['b', 'b', 'foo', 'b'],
  206. ['c', 'c', 'foo', 'c']])
  207. assert_frame_equal(result, expected)
  208. df.fillna({2: 'foo'}, inplace=True)
  209. assert_frame_equal(df, expected)
  210. # limit and value
  211. df = DataFrame(np.random.randn(10, 3))
  212. df.iloc[2:7, 0] = np.nan
  213. df.iloc[3:5, 2] = np.nan
  214. expected = df.copy()
  215. expected.iloc[2, 0] = 999
  216. expected.iloc[3, 2] = 999
  217. result = df.fillna(999, limit=1)
  218. assert_frame_equal(result, expected)
  219. # with datelike
  220. # GH 6344
  221. df = DataFrame({
  222. 'Date': [pd.NaT, Timestamp("2014-1-1")],
  223. 'Date2': [Timestamp("2013-1-1"), pd.NaT]
  224. })
  225. expected = df.copy()
  226. expected['Date'] = expected['Date'].fillna(
  227. df.loc[df.index[0], 'Date2'])
  228. result = df.fillna(value={'Date': df['Date2']})
  229. assert_frame_equal(result, expected)
  230. # with timezone
  231. # GH 15855
  232. df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
  233. pd.NaT]})
  234. exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
  235. pd.Timestamp('2012-11-11 00:00:00+01:00')]})
  236. assert_frame_equal(df.fillna(method='pad'), exp)
  237. df = pd.DataFrame({'A': [pd.NaT,
  238. pd.Timestamp('2012-11-11 00:00:00+01:00')]})
  239. exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
  240. pd.Timestamp('2012-11-11 00:00:00+01:00')]})
  241. assert_frame_equal(df.fillna(method='bfill'), exp)
  242. # with timezone in another column
  243. # GH 15522
  244. df = pd.DataFrame({'A': pd.date_range('20130101', periods=4,
  245. tz='US/Eastern'),
  246. 'B': [1, 2, np.nan, np.nan]})
  247. result = df.fillna(method='pad')
  248. expected = pd.DataFrame({'A': pd.date_range('20130101', periods=4,
  249. tz='US/Eastern'),
  250. 'B': [1., 2., 2., 2.]})
  251. assert_frame_equal(result, expected)
  252. def test_na_actions_categorical(self):
  253. cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
  254. vals = ["a", "b", np.nan, "d"]
  255. df = DataFrame({"cats": cat, "vals": vals})
  256. cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
  257. vals2 = ["a", "b", "b", "d"]
  258. df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
  259. cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
  260. vals3 = ["a", "b", np.nan]
  261. df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
  262. cat4 = Categorical([1, 2], categories=[1, 2, 3])
  263. vals4 = ["a", "b"]
  264. df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})
  265. # fillna
  266. res = df.fillna(value={"cats": 3, "vals": "b"})
  267. tm.assert_frame_equal(res, df_exp_fill)
  268. with pytest.raises(ValueError, match=("fill value must "
  269. "be in categories")):
  270. df.fillna(value={"cats": 4, "vals": "c"})
  271. res = df.fillna(method='pad')
  272. tm.assert_frame_equal(res, df_exp_fill)
  273. # dropna
  274. res = df.dropna(subset=["cats"])
  275. tm.assert_frame_equal(res, df_exp_drop_cats)
  276. res = df.dropna()
  277. tm.assert_frame_equal(res, df_exp_drop_all)
  278. # make sure that fillna takes missing values into account
  279. c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
  280. df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})
  281. cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
  282. df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
  283. res = df.fillna("a")
  284. tm.assert_frame_equal(res, df_exp)
  285. def test_fillna_categorical_nan(self):
  286. # GH 14021
  287. # np.nan should always be a valid filler
  288. cat = Categorical([np.nan, 2, np.nan])
  289. val = Categorical([np.nan, np.nan, np.nan])
  290. df = DataFrame({"cats": cat, "vals": val})
  291. res = df.fillna(df.median())
  292. v_exp = [np.nan, np.nan, np.nan]
  293. df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
  294. dtype='category')
  295. tm.assert_frame_equal(res, df_exp)
  296. result = df.cats.fillna(np.nan)
  297. tm.assert_series_equal(result, df.cats)
  298. result = df.vals.fillna(np.nan)
  299. tm.assert_series_equal(result, df.vals)
  300. idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
  301. '2011-01-01 09:00', pd.NaT, pd.NaT])
  302. df = DataFrame({'a': Categorical(idx)})
  303. tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
  304. idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
  305. pd.NaT, pd.NaT], freq='M')
  306. df = DataFrame({'a': Categorical(idx)})
  307. tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
  308. idx = pd.TimedeltaIndex(['1 days', '2 days',
  309. '1 days', pd.NaT, pd.NaT])
  310. df = DataFrame({'a': Categorical(idx)})
  311. tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
  312. def test_fillna_downcast(self):
  313. # GH 15277
  314. # infer int64 from float64
  315. df = pd.DataFrame({'a': [1., np.nan]})
  316. result = df.fillna(0, downcast='infer')
  317. expected = pd.DataFrame({'a': [1, 0]})
  318. assert_frame_equal(result, expected)
  319. # infer int64 from float64 when fillna value is a dict
  320. df = pd.DataFrame({'a': [1., np.nan]})
  321. result = df.fillna({'a': 0}, downcast='infer')
  322. expected = pd.DataFrame({'a': [1, 0]})
  323. assert_frame_equal(result, expected)
  324. def test_fillna_dtype_conversion(self):
  325. # make sure that fillna on an empty frame works
  326. df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
  327. result = df.get_dtype_counts().sort_values()
  328. expected = Series({'object': 5})
  329. assert_series_equal(result, expected)
  330. result = df.fillna(1)
  331. expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
  332. result = result.get_dtype_counts().sort_values()
  333. expected = Series({'int64': 5})
  334. assert_series_equal(result, expected)
  335. # empty block
  336. df = DataFrame(index=lrange(3), columns=['A', 'B'], dtype='float64')
  337. result = df.fillna('nan')
  338. expected = DataFrame('nan', index=lrange(3), columns=['A', 'B'])
  339. assert_frame_equal(result, expected)
  340. # equiv of replace
  341. df = DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
  342. for v in ['', 1, np.nan, 1.0]:
  343. expected = df.replace(np.nan, v)
  344. result = df.fillna(v)
  345. assert_frame_equal(result, expected)
  346. def test_fillna_datetime_columns(self):
  347. # GH 7095
  348. df = pd.DataFrame({'A': [-1, -2, np.nan],
  349. 'B': date_range('20130101', periods=3),
  350. 'C': ['foo', 'bar', None],
  351. 'D': ['foo2', 'bar2', None]},
  352. index=date_range('20130110', periods=3))
  353. result = df.fillna('?')
  354. expected = pd.DataFrame({'A': [-1, -2, '?'],
  355. 'B': date_range('20130101', periods=3),
  356. 'C': ['foo', 'bar', '?'],
  357. 'D': ['foo2', 'bar2', '?']},
  358. index=date_range('20130110', periods=3))
  359. tm.assert_frame_equal(result, expected)
  360. df = pd.DataFrame({'A': [-1, -2, np.nan],
  361. 'B': [pd.Timestamp('2013-01-01'),
  362. pd.Timestamp('2013-01-02'), pd.NaT],
  363. 'C': ['foo', 'bar', None],
  364. 'D': ['foo2', 'bar2', None]},
  365. index=date_range('20130110', periods=3))
  366. result = df.fillna('?')
  367. expected = pd.DataFrame({'A': [-1, -2, '?'],
  368. 'B': [pd.Timestamp('2013-01-01'),
  369. pd.Timestamp('2013-01-02'), '?'],
  370. 'C': ['foo', 'bar', '?'],
  371. 'D': ['foo2', 'bar2', '?']},
  372. index=pd.date_range('20130110', periods=3))
  373. tm.assert_frame_equal(result, expected)
  374. def test_ffill(self):
  375. self.tsframe['A'][:5] = np.nan
  376. self.tsframe['A'][-5:] = np.nan
  377. assert_frame_equal(self.tsframe.ffill(),
  378. self.tsframe.fillna(method='ffill'))
  379. def test_bfill(self):
  380. self.tsframe['A'][:5] = np.nan
  381. self.tsframe['A'][-5:] = np.nan
  382. assert_frame_equal(self.tsframe.bfill(),
  383. self.tsframe.fillna(method='bfill'))
  384. def test_frame_pad_backfill_limit(self):
  385. index = np.arange(10)
  386. df = DataFrame(np.random.randn(10, 4), index=index)
  387. result = df[:2].reindex(index, method='pad', limit=5)
  388. expected = df[:2].reindex(index).fillna(method='pad')
  389. expected.values[-3:] = np.nan
  390. tm.assert_frame_equal(result, expected)
  391. result = df[-2:].reindex(index, method='backfill', limit=5)
  392. expected = df[-2:].reindex(index).fillna(method='backfill')
  393. expected.values[:3] = np.nan
  394. tm.assert_frame_equal(result, expected)
  395. def test_frame_fillna_limit(self):
  396. index = np.arange(10)
  397. df = DataFrame(np.random.randn(10, 4), index=index)
  398. result = df[:2].reindex(index)
  399. result = result.fillna(method='pad', limit=5)
  400. expected = df[:2].reindex(index).fillna(method='pad')
  401. expected.values[-3:] = np.nan
  402. tm.assert_frame_equal(result, expected)
  403. result = df[-2:].reindex(index)
  404. result = result.fillna(method='backfill', limit=5)
  405. expected = df[-2:].reindex(index).fillna(method='backfill')
  406. expected.values[:3] = np.nan
  407. tm.assert_frame_equal(result, expected)
  408. def test_fillna_skip_certain_blocks(self):
  409. # don't try to fill boolean, int blocks
  410. df = DataFrame(np.random.randn(10, 4).astype(int))
  411. # it works!
  412. df.fillna(np.nan)
  413. def test_fillna_inplace(self):
  414. df = DataFrame(np.random.randn(10, 4))
  415. df[1][:4] = np.nan
  416. df[3][-4:] = np.nan
  417. expected = df.fillna(value=0)
  418. assert expected is not df
  419. df.fillna(value=0, inplace=True)
  420. tm.assert_frame_equal(df, expected)
  421. expected = df.fillna(value={0: 0}, inplace=True)
  422. assert expected is None
  423. df[1][:4] = np.nan
  424. df[3][-4:] = np.nan
  425. expected = df.fillna(method='ffill')
  426. assert expected is not df
  427. df.fillna(method='ffill', inplace=True)
  428. tm.assert_frame_equal(df, expected)
  429. def test_fillna_dict_series(self):
  430. df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan],
  431. 'b': [1, 2, 3, np.nan, np.nan],
  432. 'c': [np.nan, 1, 2, 3, 4]})
  433. result = df.fillna({'a': 0, 'b': 5})
  434. expected = df.copy()
  435. expected['a'] = expected['a'].fillna(0)
  436. expected['b'] = expected['b'].fillna(5)
  437. assert_frame_equal(result, expected)
  438. # it works
  439. result = df.fillna({'a': 0, 'b': 5, 'd': 7})
  440. # Series treated same as dict
  441. result = df.fillna(df.max())
  442. expected = df.fillna(df.max().to_dict())
  443. assert_frame_equal(result, expected)
  444. # disable this for now
  445. with pytest.raises(NotImplementedError, match='column by column'):
  446. df.fillna(df.max(1), axis=1)
  447. def test_fillna_dataframe(self):
  448. # GH 8377
  449. df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan],
  450. 'b': [1, 2, 3, np.nan, np.nan],
  451. 'c': [np.nan, 1, 2, 3, 4]},
  452. index=list('VWXYZ'))
  453. # df2 may have different index and columns
  454. df2 = DataFrame({'a': [np.nan, 10, 20, 30, 40],
  455. 'b': [50, 60, 70, 80, 90],
  456. 'foo': ['bar'] * 5},
  457. index=list('VWXuZ'))
  458. result = df.fillna(df2)
  459. # only those columns and indices which are shared get filled
  460. expected = DataFrame({'a': [np.nan, 1, 2, np.nan, 40],
  461. 'b': [1, 2, 3, np.nan, 90],
  462. 'c': [np.nan, 1, 2, 3, 4]},
  463. index=list('VWXYZ'))
  464. assert_frame_equal(result, expected)
  465. def test_fillna_columns(self):
  466. df = DataFrame(np.random.randn(10, 10))
  467. df.values[:, ::2] = np.nan
  468. result = df.fillna(method='ffill', axis=1)
  469. expected = df.T.fillna(method='pad').T
  470. assert_frame_equal(result, expected)
  471. df.insert(6, 'foo', 5)
  472. result = df.fillna(method='ffill', axis=1)
  473. expected = df.astype(float).fillna(method='ffill', axis=1)
  474. assert_frame_equal(result, expected)
  475. def test_fillna_invalid_method(self):
  476. with pytest.raises(ValueError, match='ffil'):
  477. self.frame.fillna(method='ffil')
  478. def test_fillna_invalid_value(self):
  479. # list
  480. pytest.raises(TypeError, self.frame.fillna, [1, 2])
  481. # tuple
  482. pytest.raises(TypeError, self.frame.fillna, (1, 2))
  483. # frame with series
  484. pytest.raises(TypeError, self.frame.iloc[:, 0].fillna, self.frame)
  485. def test_fillna_col_reordering(self):
  486. cols = ["COL." + str(i) for i in range(5, 0, -1)]
  487. data = np.random.rand(20, 5)
  488. df = DataFrame(index=lrange(20), columns=cols, data=data)
  489. filled = df.fillna(method='ffill')
  490. assert df.columns.tolist() == filled.columns.tolist()
  491. def test_fill_corner(self):
  492. mf = self.mixed_frame
  493. mf.loc[mf.index[5:20], 'foo'] = np.nan
  494. mf.loc[mf.index[-10:], 'A'] = np.nan
  495. filled = self.mixed_frame.fillna(value=0)
  496. assert (filled.loc[filled.index[5:20], 'foo'] == 0).all()
  497. del self.mixed_frame['foo']
  498. empty_float = self.frame.reindex(columns=[])
  499. # TODO(wesm): unused?
  500. result = empty_float.fillna(value=0) # noqa
  501. def test_fill_value_when_combine_const(self):
  502. # GH12723
  503. dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
  504. df = DataFrame({'foo': dat}, index=range(6))
  505. exp = df.fillna(0).add(2)
  506. res = df.add(2, fill_value=0)
  507. assert_frame_equal(res, exp)
  508. class TestDataFrameInterpolate(TestData):
  509. def test_interp_basic(self):
  510. df = DataFrame({'A': [1, 2, np.nan, 4],
  511. 'B': [1, 4, 9, np.nan],
  512. 'C': [1, 2, 3, 5],
  513. 'D': list('abcd')})
  514. expected = DataFrame({'A': [1., 2., 3., 4.],
  515. 'B': [1., 4., 9., 9.],
  516. 'C': [1, 2, 3, 5],
  517. 'D': list('abcd')})
  518. result = df.interpolate()
  519. assert_frame_equal(result, expected)
  520. result = df.set_index('C').interpolate()
  521. expected = df.set_index('C')
  522. expected.loc[3, 'A'] = 3
  523. expected.loc[5, 'B'] = 9
  524. assert_frame_equal(result, expected)
  525. def test_interp_bad_method(self):
  526. df = DataFrame({'A': [1, 2, np.nan, 4],
  527. 'B': [1, 4, 9, np.nan],
  528. 'C': [1, 2, 3, 5],
  529. 'D': list('abcd')})
  530. with pytest.raises(ValueError):
  531. df.interpolate(method='not_a_method')
  532. def test_interp_combo(self):
  533. df = DataFrame({'A': [1., 2., np.nan, 4.],
  534. 'B': [1, 4, 9, np.nan],
  535. 'C': [1, 2, 3, 5],
  536. 'D': list('abcd')})
  537. result = df['A'].interpolate()
  538. expected = Series([1., 2., 3., 4.], name='A')
  539. assert_series_equal(result, expected)
  540. result = df['A'].interpolate(downcast='infer')
  541. expected = Series([1, 2, 3, 4], name='A')
  542. assert_series_equal(result, expected)
  543. def test_interp_nan_idx(self):
  544. df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]})
  545. df = df.set_index('A')
  546. with pytest.raises(NotImplementedError):
  547. df.interpolate(method='values')
  548. @td.skip_if_no_scipy
  549. def test_interp_various(self):
  550. df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
  551. 'C': [1, 2, 3, 5, 8, 13, 21]})
  552. df = df.set_index('C')
  553. expected = df.copy()
  554. result = df.interpolate(method='polynomial', order=1)
  555. expected.A.loc[3] = 2.66666667
  556. expected.A.loc[13] = 5.76923076
  557. assert_frame_equal(result, expected)
  558. result = df.interpolate(method='cubic')
  559. # GH #15662.
  560. # new cubic and quadratic interpolation algorithms from scipy 0.19.0.
  561. # previously `splmake` was used. See scipy/scipy#6710
  562. if _is_scipy_ge_0190:
  563. expected.A.loc[3] = 2.81547781
  564. expected.A.loc[13] = 5.52964175
  565. else:
  566. expected.A.loc[3] = 2.81621174
  567. expected.A.loc[13] = 5.64146581
  568. assert_frame_equal(result, expected)
  569. result = df.interpolate(method='nearest')
  570. expected.A.loc[3] = 2
  571. expected.A.loc[13] = 5
  572. assert_frame_equal(result, expected, check_dtype=False)
  573. result = df.interpolate(method='quadratic')
  574. if _is_scipy_ge_0190:
  575. expected.A.loc[3] = 2.82150771
  576. expected.A.loc[13] = 6.12648668
  577. else:
  578. expected.A.loc[3] = 2.82533638
  579. expected.A.loc[13] = 6.02817974
  580. assert_frame_equal(result, expected)
  581. result = df.interpolate(method='slinear')
  582. expected.A.loc[3] = 2.66666667
  583. expected.A.loc[13] = 5.76923077
  584. assert_frame_equal(result, expected)
  585. result = df.interpolate(method='zero')
  586. expected.A.loc[3] = 2.
  587. expected.A.loc[13] = 5
  588. assert_frame_equal(result, expected, check_dtype=False)
  589. @td.skip_if_no_scipy
  590. def test_interp_alt_scipy(self):
  591. df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
  592. 'C': [1, 2, 3, 5, 8, 13, 21]})
  593. result = df.interpolate(method='barycentric')
  594. expected = df.copy()
  595. expected.loc[2, 'A'] = 3
  596. expected.loc[5, 'A'] = 6
  597. assert_frame_equal(result, expected)
  598. result = df.interpolate(method='barycentric', downcast='infer')
  599. assert_frame_equal(result, expected.astype(np.int64))
  600. result = df.interpolate(method='krogh')
  601. expectedk = df.copy()
  602. expectedk['A'] = expected['A']
  603. assert_frame_equal(result, expectedk)
  604. _skip_if_no_pchip()
  605. import scipy
  606. result = df.interpolate(method='pchip')
  607. expected.loc[2, 'A'] = 3
  608. if LooseVersion(scipy.__version__) >= LooseVersion('0.17.0'):
  609. expected.loc[5, 'A'] = 6.0
  610. else:
  611. expected.loc[5, 'A'] = 6.125
  612. assert_frame_equal(result, expected)
  613. def test_interp_rowwise(self):
  614. df = DataFrame({0: [1, 2, np.nan, 4],
  615. 1: [2, 3, 4, np.nan],
  616. 2: [np.nan, 4, 5, 6],
  617. 3: [4, np.nan, 6, 7],
  618. 4: [1, 2, 3, 4]})
  619. result = df.interpolate(axis=1)
  620. expected = df.copy()
  621. expected.loc[3, 1] = 5
  622. expected.loc[0, 2] = 3
  623. expected.loc[1, 3] = 3
  624. expected[4] = expected[4].astype(np.float64)
  625. assert_frame_equal(result, expected)
  626. result = df.interpolate(axis=1, method='values')
  627. assert_frame_equal(result, expected)
  628. result = df.interpolate(axis=0)
  629. expected = df.interpolate()
  630. assert_frame_equal(result, expected)
  631. def test_rowwise_alt(self):
  632. df = DataFrame({0: [0, .5, 1., np.nan, 4, 8, np.nan, np.nan, 64],
  633. 1: [1, 2, 3, 4, 3, 2, 1, 0, -1]})
  634. df.interpolate(axis=0)
  635. @pytest.mark.parametrize("check_scipy", [
  636. False, pytest.param(True, marks=td.skip_if_no_scipy)
  637. ])
  638. def test_interp_leading_nans(self, check_scipy):
  639. df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0],
  640. "B": [np.nan, -3, -3.5, np.nan, -4]})
  641. result = df.interpolate()
  642. expected = df.copy()
  643. expected['B'].loc[3] = -3.75
  644. assert_frame_equal(result, expected)
  645. if check_scipy:
  646. result = df.interpolate(method='polynomial', order=1)
  647. assert_frame_equal(result, expected)
  648. def test_interp_raise_on_only_mixed(self):
  649. df = DataFrame({'A': [1, 2, np.nan, 4],
  650. 'B': ['a', 'b', 'c', 'd'],
  651. 'C': [np.nan, 2, 5, 7],
  652. 'D': [np.nan, np.nan, 9, 9],
  653. 'E': [1, 2, 3, 4]})
  654. with pytest.raises(TypeError):
  655. df.interpolate(axis=1)
  656. def test_interp_raise_on_all_object_dtype(self):
  657. # GH 22985
  658. df = DataFrame({
  659. 'A': [1, 2, 3],
  660. 'B': [4, 5, 6]},
  661. dtype='object')
  662. msg = ("Cannot interpolate with all object-dtype columns "
  663. "in the DataFrame. Try setting at least one "
  664. "column to a numeric dtype.")
  665. with pytest.raises(TypeError, match=msg):
  666. df.interpolate()
  667. def test_interp_inplace(self):
  668. df = DataFrame({'a': [1., 2., np.nan, 4.]})
  669. expected = DataFrame({'a': [1., 2., 3., 4.]})
  670. result = df.copy()
  671. result['a'].interpolate(inplace=True)
  672. assert_frame_equal(result, expected)
  673. result = df.copy()
  674. result['a'].interpolate(inplace=True, downcast='infer')
  675. assert_frame_equal(result, expected.astype('int64'))
  676. def test_interp_inplace_row(self):
  677. # GH 10395
  678. result = DataFrame({'a': [1., 2., 3., 4.],
  679. 'b': [np.nan, 2., 3., 4.],
  680. 'c': [3, 2, 2, 2]})
  681. expected = result.interpolate(method='linear', axis=1, inplace=False)
  682. result.interpolate(method='linear', axis=1, inplace=True)
  683. assert_frame_equal(result, expected)
  684. def test_interp_ignore_all_good(self):
  685. # GH
  686. df = DataFrame({'A': [1, 2, np.nan, 4],
  687. 'B': [1, 2, 3, 4],
  688. 'C': [1., 2., np.nan, 4.],
  689. 'D': [1., 2., 3., 4.]})
  690. expected = DataFrame({'A': np.array(
  691. [1, 2, 3, 4], dtype='float64'),
  692. 'B': np.array(
  693. [1, 2, 3, 4], dtype='int64'),
  694. 'C': np.array(
  695. [1., 2., 3, 4.], dtype='float64'),
  696. 'D': np.array(
  697. [1., 2., 3., 4.], dtype='float64')})
  698. result = df.interpolate(downcast=None)
  699. assert_frame_equal(result, expected)
  700. # all good
  701. result = df[['B', 'D']].interpolate(downcast=None)
  702. assert_frame_equal(result, df[['B', 'D']])