123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384 |
- # -*- coding: utf-8 -*-
- from __future__ import print_function
- import numpy as np
- import pytest
- import pandas as pd
- from pandas import DataFrame, Series, Timestamp
- from pandas.tests.frame.common import TestData
- import pandas.util.testing as tm
- from pandas.util.testing import assert_frame_equal, assert_series_equal
class TestDataFrameQuantile(TestData):
    """Tests for ``DataFrame.quantile``.

    Covers scalar and list-like ``q``, both axes, the ``interpolation``
    and ``numeric_only`` keywords, datetime-like columns, missing values
    and empty frames.

    The ``self.tsframe`` / ``self.intframe`` fixtures used below are not
    defined in this class; presumably they come from ``TestData``.
    """

    def test_quantile(self):
        """Scalar/list quantiles on both axes, empty and mixed frames."""
        q = self.tsframe.quantile(0.1, axis=0)
        assert q['A'] == np.percentile(self.tsframe['A'], 10)
        tm.assert_index_equal(q.index, self.tsframe.columns)

        q = self.tsframe.quantile(0.9, axis=1)
        assert (q['2000-01-17'] ==
                np.percentile(self.tsframe.loc['2000-01-17'], 90))
        tm.assert_index_equal(q.index, self.tsframe.index)

        # test degenerate case
        q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
        assert np.isnan(q['x']) and np.isnan(q['y'])

        # non-numeric exclusion
        df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]})
        rs = df.quantile(0.5)
        xp = df.median().rename(0.5)
        tm.assert_series_equal(rs, xp)

        # axis
        df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
        result = df.quantile(.5, axis=1)
        expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
        tm.assert_series_equal(result, expected)

        result = df.quantile([.5, .75], axis=1)
        expected = DataFrame({1: [1.5, 1.75], 2: [2.5, 2.75],
                              3: [3.5, 3.75]}, index=[0.5, 0.75])
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # We may want to break API in the future to change this
        # so that we exclude non-numeric along the same axis
        # See GH #7312
        df = DataFrame([[1, 2, 3],
                        ['a', 'b', 4]])
        result = df.quantile(.5, axis=1)
        expected = Series([3., 4.], index=[0, 1], name=0.5)
        tm.assert_series_equal(result, expected)

    def test_quantile_axis_mixed(self):
        """Row-wise quantile on mixed dtypes uses only numeric columns."""
        # mixed on axis=1
        df = DataFrame({"A": [1, 2, 3],
                        "B": [2., 3., 4.],
                        "C": pd.date_range('20130101', periods=3),
                        "D": ['foo', 'bar', 'baz']})
        result = df.quantile(.5, axis=1)
        expected = Series([1.5, 2.5, 3.5], name=0.5)
        tm.assert_series_equal(result, expected)

        # must raise
        with pytest.raises(TypeError):
            df.quantile(.5, axis=1, numeric_only=False)

    def test_quantile_axis_parameter(self):
        """``axis`` accepts 0/1 and 'index'/'columns'; others raise."""
        # GH 9543/9544
        df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])

        result = df.quantile(.5, axis=0)
        expected = Series([2., 3.], index=["A", "B"], name=0.5)
        tm.assert_series_equal(result, expected)

        expected = df.quantile(.5, axis="index")
        tm.assert_series_equal(result, expected)

        result = df.quantile(.5, axis=1)
        expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
        tm.assert_series_equal(result, expected)

        result = df.quantile(.5, axis="columns")
        tm.assert_series_equal(result, expected)

        # Context-manager form, matching the other tests in this class.
        with pytest.raises(ValueError):
            df.quantile(0.1, axis=-1)
        with pytest.raises(ValueError):
            df.quantile(0.1, axis="column")

    def test_quantile_interpolation(self):
        """The ``interpolation`` keyword: default and non-default methods."""
        # see gh-10174

        # interpolation = linear (default case)
        q = self.tsframe.quantile(0.1, axis=0, interpolation='linear')
        assert q['A'] == np.percentile(self.tsframe['A'], 10)
        q = self.intframe.quantile(0.1)
        assert q['A'] == np.percentile(self.intframe['A'], 10)

        # test with and without interpolation keyword: ``q`` above omits
        # the keyword, so pass it explicitly here to make the comparison
        # meaningful.
        q1 = self.intframe.quantile(0.1, interpolation='linear')
        assert q1['A'] == np.percentile(self.intframe['A'], 10)
        tm.assert_series_equal(q, q1)

        # interpolation method other than default linear
        df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
        result = df.quantile(.5, axis=1, interpolation='nearest')
        expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5)
        tm.assert_series_equal(result, expected)

        # cross-check interpolation=nearest results in original dtype.
        # np.percentile takes a percentage in [0, 100], so the 0.5
        # quantile corresponds to 50 here.
        exp = np.percentile(np.array([[1, 2, 3], [2, 3, 4]]), 50,
                            axis=0, interpolation='nearest')
        expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='int64')
        tm.assert_series_equal(result, expected)

        # float
        df = DataFrame({"A": [1., 2., 3.], "B": [2., 3., 4.]},
                       index=[1, 2, 3])
        result = df.quantile(.5, axis=1, interpolation='nearest')
        expected = Series([1., 2., 3.], index=[1, 2, 3], name=0.5)
        tm.assert_series_equal(result, expected)
        exp = np.percentile(np.array([[1., 2., 3.], [2., 3., 4.]]), 50,
                            axis=0, interpolation='nearest')
        expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='float64')
        tm.assert_series_equal(result, expected)

        # axis
        result = df.quantile([.5, .75], axis=1, interpolation='lower')
        expected = DataFrame({1: [1., 1.], 2: [2., 2.],
                              3: [3., 3.]}, index=[0.5, 0.75])
        tm.assert_frame_equal(result, expected)

        # test degenerate case
        df = DataFrame({'x': [], 'y': []})
        q = df.quantile(0.1, axis=0, interpolation='higher')
        assert np.isnan(q['x']) and np.isnan(q['y'])

        # multi
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['a', 'b', 'c'])
        result = df.quantile([.25, .5], interpolation='midpoint')

        # https://github.com/numpy/numpy/issues/7163
        expected = DataFrame([[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
                             index=[.25, .5], columns=['a', 'b', 'c'])
        tm.assert_frame_equal(result, expected)

    def test_quantile_multi(self):
        """List-like ``q`` returns a frame indexed by the quantiles."""
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['a', 'b', 'c'])
        result = df.quantile([.25, .5])
        expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
                             index=[.25, .5], columns=['a', 'b', 'c'])
        tm.assert_frame_equal(result, expected)

        # axis = 1
        # Every row holds a single repeated value, so each requested
        # quantile of row i is that value (1, 2, 3 respectively).
        result = df.quantile([.25, .5], axis=1)
        expected = DataFrame([[1., 2., 3.]] * 2,
                             index=[.25, .5], columns=[0, 1, 2])
        tm.assert_frame_equal(result, expected)

        # empty
        result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0)
        expected = DataFrame({'x': [np.nan, np.nan],
                              'y': [np.nan, np.nan]},
                             index=[.1, .9])
        tm.assert_frame_equal(result, expected)

    def test_quantile_datetime(self):
        """Datetime columns: skipped by default, used if requested."""
        df = DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]})

        # exclude datetime
        result = df.quantile(.5)
        expected = Series([2.5], index=['b'], name=0.5)
        tm.assert_series_equal(result, expected)

        # datetime
        result = df.quantile(.5, numeric_only=False)
        expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5],
                          index=['a', 'b'],
                          name=0.5)
        tm.assert_series_equal(result, expected)

        # datetime w/ multi
        result = df.quantile([.5], numeric_only=False)
        expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]],
                             index=[.5], columns=['a', 'b'])
        tm.assert_frame_equal(result, expected)

        # axis = 1
        df['c'] = pd.to_datetime(['2011', '2012'])
        result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False)
        expected = Series([Timestamp('2010-07-02 12:00:00'),
                           Timestamp('2011-07-02 12:00:00')],
                          index=[0, 1],
                          name=0.5)
        tm.assert_series_equal(result, expected)

        result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False)
        expected = DataFrame([[Timestamp('2010-07-02 12:00:00'),
                               Timestamp('2011-07-02 12:00:00')]],
                             index=[0.5], columns=[0, 1])
        tm.assert_frame_equal(result, expected)

        # empty when numeric_only=True
        # FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
        # result = df[['a', 'c']].quantile(.5)
        # result = df[['a', 'c']].quantile([.5])

    def test_quantile_invalid(self):
        """Quantiles outside [0, 1] raise ``ValueError``."""
        msg = r'percentiles should all be in the interval \[0, 1\]'
        for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
            with pytest.raises(ValueError, match=msg):
                self.tsframe.quantile(invalid)

    def test_quantile_box(self):
        """Datetime, tz-aware datetime and timedelta results keep type."""
        df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
                              pd.Timestamp('2011-01-02'),
                              pd.Timestamp('2011-01-03')],
                        'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
                              pd.Timestamp('2011-01-02', tz='US/Eastern'),
                              pd.Timestamp('2011-01-03', tz='US/Eastern')],
                        'C': [pd.Timedelta('1 days'),
                              pd.Timedelta('2 days'),
                              pd.Timedelta('3 days')]})

        res = df.quantile(0.5, numeric_only=False)

        exp = pd.Series([pd.Timestamp('2011-01-02'),
                         pd.Timestamp('2011-01-02', tz='US/Eastern'),
                         pd.Timedelta('2 days')],
                        name=0.5, index=['A', 'B', 'C'])
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5], numeric_only=False)
        exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
                             pd.Timestamp('2011-01-02', tz='US/Eastern'),
                             pd.Timedelta('2 days')]],
                           index=[0.5], columns=['A', 'B', 'C'])
        tm.assert_frame_equal(res, exp)

        # DatetimeBlock may be consolidated and contain NaT in different loc
        df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
                              pd.NaT,
                              pd.Timestamp('2011-01-02'),
                              pd.Timestamp('2011-01-03')],
                        'a': [pd.Timestamp('2011-01-01'),
                              pd.Timestamp('2011-01-02'),
                              pd.NaT,
                              pd.Timestamp('2011-01-03')],
                        'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
                              pd.NaT,
                              pd.Timestamp('2011-01-02', tz='US/Eastern'),
                              pd.Timestamp('2011-01-03', tz='US/Eastern')],
                        'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
                              pd.Timestamp('2011-01-02', tz='US/Eastern'),
                              pd.NaT,
                              pd.Timestamp('2011-01-03', tz='US/Eastern')],
                        'C': [pd.Timedelta('1 days'),
                              pd.Timedelta('2 days'),
                              pd.Timedelta('3 days'),
                              pd.NaT],
                        'c': [pd.NaT,
                              pd.Timedelta('1 days'),
                              pd.Timedelta('2 days'),
                              pd.Timedelta('3 days')]},
                       columns=list('AaBbCc'))

        res = df.quantile(0.5, numeric_only=False)
        exp = pd.Series([pd.Timestamp('2011-01-02'),
                         pd.Timestamp('2011-01-02'),
                         pd.Timestamp('2011-01-02', tz='US/Eastern'),
                         pd.Timestamp('2011-01-02', tz='US/Eastern'),
                         pd.Timedelta('2 days'),
                         pd.Timedelta('2 days')],
                        name=0.5, index=list('AaBbCc'))
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5], numeric_only=False)
        exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
                             pd.Timestamp('2011-01-02'),
                             pd.Timestamp('2011-01-02', tz='US/Eastern'),
                             pd.Timestamp('2011-01-02', tz='US/Eastern'),
                             pd.Timedelta('2 days'),
                             pd.Timedelta('2 days')]],
                           index=[0.5], columns=list('AaBbCc'))
        tm.assert_frame_equal(res, exp)

    def test_quantile_nan(self):
        """NaNs are skipped per column/row; all-NaN columns give NaN."""
        # GH 14357 - float block where some cols have missing values
        df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)})
        df.iloc[-1, 1] = np.nan

        res = df.quantile(0.5)
        exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5, 0.75])
        exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]},
                        index=[0.5, 0.75])
        tm.assert_frame_equal(res, exp)

        res = df.quantile(0.5, axis=1)
        exp = Series(np.arange(1.0, 6.0), name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5, 0.75], axis=1)
        exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
        tm.assert_frame_equal(res, exp)

        # full-nan column
        df['b'] = np.nan

        res = df.quantile(0.5)
        exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5, 0.75])
        exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]},
                        index=[0.5, 0.75])
        tm.assert_frame_equal(res, exp)

    def test_quantile_nat(self):
        """All-NaT datetime columns yield NaT quantiles."""
        # full NaT column
        df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]})

        res = df.quantile(0.5, numeric_only=False)
        exp = Series([pd.NaT], index=['a'], name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5], numeric_only=False)
        exp = DataFrame({'a': [pd.NaT]}, index=[0.5])
        tm.assert_frame_equal(res, exp)

        # mixed non-null / full null column
        df = DataFrame({'a': [pd.Timestamp('2012-01-01'),
                              pd.Timestamp('2012-01-02'),
                              pd.Timestamp('2012-01-03')],
                        'b': [pd.NaT, pd.NaT, pd.NaT]})

        res = df.quantile(0.5, numeric_only=False)
        exp = Series([pd.Timestamp('2012-01-02'), pd.NaT],
                     index=['a', 'b'], name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5], numeric_only=False)
        exp = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]],
                        index=[0.5], columns=['a', 'b'])
        tm.assert_frame_equal(res, exp)

    def test_quantile_empty(self):
        """Quantiles of empty float frames are NaN per column."""
        # floats
        df = DataFrame(columns=['a', 'b'], dtype='float64')

        res = df.quantile(0.5)
        exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5])
        exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'],
                        index=[0.5])
        tm.assert_frame_equal(res, exp)

        # FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
        # res = df.quantile(0.5, axis=1)
        # res = df.quantile([0.5], axis=1)

        # ints
        df = DataFrame(columns=['a', 'b'], dtype='int64')

        # FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
        # res = df.quantile(0.5)

        # datetimes
        df = DataFrame(columns=['a', 'b'], dtype='datetime64[ns]')

        # FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0)
        # res = df.quantile(0.5, numeric_only=False)
|