|
- # -*- coding: utf-8 -*-
- from datetime import timedelta
- import operator
- from string import ascii_lowercase
- import warnings
- import numpy as np
- import pytest
- from pandas.compat import PY35, lrange
- import pandas.util._test_decorators as td
- import pandas as pd
- from pandas import (
- Categorical, DataFrame, MultiIndex, Series, Timestamp, compat, date_range,
- isna, notna, to_datetime, to_timedelta)
- import pandas.core.algorithms as algorithms
- import pandas.core.nanops as nanops
- import pandas.util.testing as tm
def assert_stat_op_calc(opname, alternative, frame, has_skipna=True,
                        check_dtype=True, check_dates=False,
                        check_less_precise=False, skipna_alternative=None):
    """
    Check that operator opname works as advertised on frame

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    alternative : function
        Function that opname is tested against; i.e. "frame.opname()" should
        equal "alternative(frame)".
    frame : DataFrame
        The object that the tests are executed on
    has_skipna : bool, default True
        Whether the method "opname" has the kwarg "skipna"
    check_dtype : bool, default True
        Whether the dtypes of the result of "frame.opname()" and
        "alternative(frame)" should be checked.
    check_dates : bool, default false
        Whether opname should be tested on a Datetime Series
    check_less_precise : bool, default False
        Whether results should only be compared approximately;
        passed on to tm.assert_series_equal
    skipna_alternative : function, default None
        NaN-safe version of alternative
    """
    f = getattr(frame, opname)

    if check_dates:
        # smoke test: the reduction must return a Series on a frame that
        # holds datetimes, with and without an extra integer column
        df = DataFrame({'b': date_range('1/1/2001', periods=2)})
        result = getattr(df, opname)()
        assert isinstance(result, Series)

        df['a'] = lrange(len(df))
        result = getattr(df, opname)()
        assert isinstance(result, Series)
        assert len(result)

    if has_skipna:
        def wrapper(x):
            return alternative(x.values)

        skipna_wrapper = tm._make_skipna_wrapper(alternative,
                                                 skipna_alternative)
        result0 = f(axis=0, skipna=False)
        result1 = f(axis=1, skipna=False)
        tm.assert_series_equal(result0, frame.apply(wrapper),
                               check_dtype=check_dtype,
                               check_less_precise=check_less_precise)
        # HACK: win32
        tm.assert_series_equal(result1, frame.apply(wrapper, axis=1),
                               check_dtype=False,
                               check_less_precise=check_less_precise)
    else:
        skipna_wrapper = alternative

    # default call (skipna left at its default where it exists)
    result0 = f(axis=0)
    result1 = f(axis=1)
    tm.assert_series_equal(result0, frame.apply(skipna_wrapper),
                           check_dtype=check_dtype,
                           check_less_precise=check_less_precise)

    if opname in ['sum', 'prod']:
        expected = frame.apply(skipna_wrapper, axis=1)
        tm.assert_series_equal(result1, expected, check_dtype=False,
                               check_less_precise=check_less_precise)

    # check dtypes
    if check_dtype:
        lcd_dtype = frame.values.dtype
        assert lcd_dtype == result0.dtype
        assert lcd_dtype == result1.dtype

    # bad axis
    with pytest.raises(ValueError, match='No axis named 2'):
        f(axis=2)

    # all NA case
    if has_skipna:
        # np.nan is the canonical spelling; the np.NaN alias no longer
        # exists in NumPy 2.0
        all_na = frame * np.nan
        r0 = getattr(all_na, opname)(axis=0)
        r1 = getattr(all_na, opname)(axis=1)
        if opname in ['sum', 'prod']:
            unit = 1 if opname == 'prod' else 0  # result for empty sum/prod
            expected = pd.Series(unit, index=r0.index, dtype=r0.dtype)
            tm.assert_series_equal(r0, expected)
            expected = pd.Series(unit, index=r1.index, dtype=r1.dtype)
            tm.assert_series_equal(r1, expected)
def assert_stat_op_api(opname, float_frame, float_string_frame,
                       has_numeric_only=False):
    """
    Smoke-test the call signature of the reduction named opname

    Nothing is compared; every call only has to complete without raising.

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    float_frame : DataFrame
        DataFrame with columns of type float
    float_string_frame : DataFrame
        DataFrame with both float and string columns
    has_numeric_only : bool, default False
        Whether the method "opname" has the kwarg "numeric_only"
    """
    axes = (0, 1)

    # the reduction must cope with a mixed-type frame on either axis
    for axis in axes:
        getattr(float_string_frame, opname)(axis=axis)

    if not has_numeric_only:
        return

    # explicit numeric_only=True on the mixed frame ...
    for axis in axes:
        getattr(float_string_frame, opname)(axis=axis, numeric_only=True)
    # ... and explicit numeric_only=False on the all-float frame
    for axis in axes:
        getattr(float_frame, opname)(axis=axis, numeric_only=False)
def assert_bool_op_calc(opname, alternative, frame, has_skipna=True):
    """
    Check that bool operator opname works as advertised on frame

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    alternative : function
        Function that opname is tested against; i.e. "frame.opname()" should
        equal "alternative(frame)".
    frame : DataFrame
        The object that the tests are executed on
    has_skipna : bool, default True
        Whether the method "opname" has the kwarg "skipna"
    """
    f = getattr(frame, opname)

    if has_skipna:
        def skipna_wrapper(x):
            nona = x.dropna().values
            return alternative(nona)

        def wrapper(x):
            return alternative(x.values)

        result0 = f(axis=0, skipna=False)
        result1 = f(axis=1, skipna=False)

        tm.assert_series_equal(result0, frame.apply(wrapper))
        tm.assert_series_equal(result1, frame.apply(wrapper, axis=1),
                               check_dtype=False)  # HACK: win32
    else:
        # without a skipna kwarg there is only one code path to compare
        skipna_wrapper = alternative

    result0 = f(axis=0)
    result1 = f(axis=1)

    tm.assert_series_equal(result0, frame.apply(skipna_wrapper))
    tm.assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
                           check_dtype=False)

    # bad axis
    with pytest.raises(ValueError, match='No axis named 2'):
        f(axis=2)

    # all NA case
    if has_skipna:
        # np.nan is the canonical spelling; the np.NaN alias no longer
        # exists in NumPy 2.0
        all_na = frame * np.nan
        r0 = getattr(all_na, opname)(axis=0)
        r1 = getattr(all_na, opname)(axis=1)
        if opname == 'any':
            assert not r0.any()
            assert not r1.any()
        else:
            assert r0.all()
            assert r1.all()
def assert_bool_op_api(opname, bool_frame_with_na, float_string_frame,
                       has_bool_only=False):
    """
    Smoke-test the call signature of the boolean reduction named opname

    Nothing is compared; every call only has to complete without raising.
    Note that a random boolean column named '_bool_' is added to
    float_string_frame in place.

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    bool_frame_with_na : DataFrame
        Frame that is reduced with an explicit bool_only=False
    float_string_frame : DataFrame
        DataFrame with both float and string columns
    has_bool_only : bool, default False
        Whether the method "opname" has the kwarg "bool_only"
    """
    axes = (0, 1)

    # make sure op works on mixed-type frame
    mixed = float_string_frame
    mixed['_bool_'] = np.random.randn(len(mixed)) > 0.5
    for axis in axes:
        getattr(mixed, opname)(axis=axis)

    if not has_bool_only:
        return

    for axis in axes:
        getattr(mixed, opname)(axis=axis, bool_only=True)
    for axis in axes:
        getattr(bool_frame_with_na, opname)(axis=axis, bool_only=False)
- class TestDataFrameAnalytics():
- # ---------------------------------------------------------------------=
- # Correlation and covariance
@td.skip_if_no_scipy
def test_corr_pearson(self, float_frame):
    """Pearson ``corr`` on a frame with NaNs, checked via ``_check_method``."""
    # A single .loc assignment writes into ``float_frame`` itself; the
    # chained form ``float_frame['A'][:5] = ...`` can end up assigning into
    # a temporary copy and silently leave the frame unchanged.
    float_frame.loc[float_frame.index[:5], 'A'] = np.nan
    float_frame.loc[float_frame.index[5:10], 'B'] = np.nan

    self._check_method(float_frame, 'pearson')
@td.skip_if_no_scipy
def test_corr_kendall(self, float_frame):
    """Kendall ``corr`` on a frame with NaNs, checked via ``_check_method``."""
    # A single .loc assignment writes into ``float_frame`` itself; the
    # chained form ``float_frame['A'][:5] = ...`` can end up assigning into
    # a temporary copy and silently leave the frame unchanged.
    float_frame.loc[float_frame.index[:5], 'A'] = np.nan
    float_frame.loc[float_frame.index[5:10], 'B'] = np.nan

    self._check_method(float_frame, 'kendall')
@td.skip_if_no_scipy
def test_corr_spearman(self, float_frame):
    """Spearman ``corr`` on a frame with NaNs, checked via ``_check_method``."""
    # A single .loc assignment writes into ``float_frame`` itself; the
    # chained form ``float_frame['A'][:5] = ...`` can end up assigning into
    # a temporary copy and silently leave the frame unchanged.
    float_frame.loc[float_frame.index[:5], 'A'] = np.nan
    float_frame.loc[float_frame.index[5:10], 'B'] = np.nan

    self._check_method(float_frame, 'spearman')
def _check_method(self, frame, method='pearson'):
    """Compare the ('A', 'C') entry of ``frame.corr`` with ``Series.corr``.

    Parameters
    ----------
    frame : DataFrame
        Must contain columns 'A' and 'C'.
    method : string, default 'pearson'
        Correlation method forwarded to both ``corr`` calls.
    """
    corr_matrix = frame.corr(method=method)
    pairwise = frame['A'].corr(frame['C'], method=method)
    tm.assert_almost_equal(corr_matrix['A']['C'], pairwise)
@td.skip_if_no_scipy
def test_corr_non_numeric(self, float_frame, float_string_frame):
    """``corr`` on a mixed frame equals ``corr`` on its numeric columns."""
    # A single .loc assignment writes into ``float_frame`` itself; the
    # chained form ``float_frame['A'][:5] = ...`` can end up assigning into
    # a temporary copy and silently leave the frame unchanged.
    float_frame.loc[float_frame.index[:5], 'A'] = np.nan
    float_frame.loc[float_frame.index[5:10], 'B'] = np.nan

    # exclude non-numeric types
    result = float_string_frame.corr()
    expected = float_string_frame.loc[:, ['A', 'B', 'C', 'D']].corr()
    tm.assert_frame_equal(result, expected)
@td.skip_if_no_scipy
@pytest.mark.parametrize('meth', ['pearson', 'kendall', 'spearman'])
def test_corr_nooverlap(self, meth):
    """Columns that share no observed rows correlate to NaN."""
    nan = np.nan
    # nothing in common
    col_a = [1, 1.5, 1, nan, nan, nan]
    col_b = [nan, nan, nan, 1, 1.5, 1]
    col_c = [nan, nan, nan, nan, nan, nan]
    df = DataFrame({'A': col_a, 'B': col_b, 'C': col_c})

    rs = df.corr(meth)

    # off-diagonal A/B entries have no overlapping rows
    for row, col in (('A', 'B'), ('B', 'A')):
        assert isna(rs.loc[row, col])
    # a populated column correlates perfectly with itself
    for label in ('A', 'B'):
        assert rs.loc[label, label] == 1
    # the all-NaN column does not
    assert isna(rs.loc['C', 'C'])
@td.skip_if_no_scipy
@pytest.mark.parametrize('meth', ['pearson', 'spearman'])
def test_corr_constant(self, meth):
    """Constant columns produce an all-NaN correlation matrix."""
    nan = np.nan
    # constant --> all NA
    data = {'A': [1, 1, 1, nan, nan, nan],
            'B': [nan, nan, nan, 1, 1, 1]}
    corr_matrix = DataFrame(data).corr(meth)
    assert isna(corr_matrix.values).all()
- def test_corr_int(self):
- # dtypes other than float64 #1761
- df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]})
- df3.cov()
- df3.corr()
@td.skip_if_no_scipy
def test_corr_int_and_boolean(self):
    """A bool column and an int column correlate to 1 for every method."""
    # when dtypes of pandas series are different
    # then ndarray will have dtype=object,
    # so it need to be properly handled
    labels = ['a', 'b']
    df = DataFrame({"a": [True, False], "b": [1, 0]})
    expected = DataFrame(np.ones((2, 2)), index=labels, columns=labels)

    for meth in ('pearson', 'kendall', 'spearman'):
        with warnings.catch_warnings(record=True):
            warnings.simplefilter("ignore", RuntimeWarning)
            result = df.corr(meth)
        tm.assert_frame_equal(result, expected)
- def test_corr_cov_independent_index_column(self):
- # GH 14617
- df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4),
- columns=list("abcd"))
- for method in ['cov', 'corr']:
- result = getattr(df, method)()
- assert result.index is not result.columns
- assert result.index.equals(result.columns)
def test_corr_invalid_method(self):
    """An unrecognised ``method`` string makes ``corr`` raise ValueError."""
    # GH 22298
    frame = pd.DataFrame(np.random.normal(size=(10, 2)))
    expected_msg = ("method must be either 'pearson', 'spearman', "
                    "or 'kendall'")
    with pytest.raises(ValueError, match=expected_msg):
        frame.corr(method="____")
def test_cov(self, float_frame, float_string_frame):
    """Exercise ``DataFrame.cov``: min_periods, NaNs, mixed and 1-column."""
    # min_periods no NAs (corner case)
    expected = float_frame.cov()
    result = float_frame.cov(min_periods=len(float_frame))

    tm.assert_frame_equal(expected, result)

    result = float_frame.cov(min_periods=len(float_frame) + 1)
    assert isna(result.values).all()

    # with NAs
    # The NaNs live in the copy, so the covariance must be computed from
    # the copy as well. 'A' and 'B' share len(frame) - 10 complete rows,
    # which is below min_periods, so only that pair is expected to be NaN.
    frame = float_frame.copy()
    frame.loc[frame.index[:5], 'A'] = np.nan
    frame.loc[frame.index[5:10], 'B'] = np.nan
    result = frame.cov(min_periods=len(frame) - 8)
    expected = frame.cov()
    expected.loc['A', 'B'] = np.nan
    expected.loc['B', 'A'] = np.nan
    tm.assert_frame_equal(result, expected)

    # regular
    # .loc instead of chained ``frame[col][rows] = ...`` so the assignment
    # is guaranteed to land in ``float_frame`` itself
    float_frame.loc[float_frame.index[:5], 'A'] = np.nan
    float_frame.loc[float_frame.index[:10], 'B'] = np.nan
    cov = float_frame.cov()

    tm.assert_almost_equal(cov['A']['C'],
                           float_frame['A'].cov(float_frame['C']))

    # exclude non-numeric types
    result = float_string_frame.cov()
    expected = float_string_frame.loc[:, ['A', 'B', 'C', 'D']].cov()
    tm.assert_frame_equal(result, expected)

    # Single column frame
    df = DataFrame(np.linspace(0.0, 1.0, 10))
    result = df.cov()
    expected = DataFrame(np.cov(df.values.T).reshape((1, 1)),
                         index=df.columns, columns=df.columns)
    tm.assert_frame_equal(result, expected)

    df.loc[0] = np.nan
    result = df.cov()
    expected = DataFrame(np.cov(df.values[1:].T).reshape((1, 1)),
                         index=df.columns, columns=df.columns)
    tm.assert_frame_equal(result, expected)
def test_corrwith(self, datetime_frame):
    """``corrwith`` on both axes, with and without ``drop``."""
    left = datetime_frame
    noise = Series(np.random.randn(len(left)), index=left.index)

    right = datetime_frame.add(noise, axis=0)

    # make sure order does not matter
    reversed_cols = right.columns[::-1]
    reversed_rows = right.index[::-1][10:]
    right = right.reindex(columns=reversed_cols, index=reversed_rows)
    del right['B']

    by_column = left.corrwith(right, axis=0)
    tm.assert_almost_equal(by_column['A'], left['A'].corr(right['A']))

    by_row = left.corrwith(right, axis=1)
    tm.assert_series_equal(by_row, left.T.corrwith(right.T, axis=0))

    dropped = left.corrwith(right, axis=0, drop=True)
    tm.assert_almost_equal(dropped['A'], left['A'].corr(right['A']))
    assert 'B' not in dropped

    dropped = left.corrwith(right, axis=1, drop=True)
    assert left.index[-1] not in dropped.index

    # non time-series data
    index = ['a', 'b', 'c', 'd', 'e']
    columns = ['one', 'two', 'three', 'four']
    df1 = DataFrame(np.random.randn(5, 4), index=index, columns=columns)
    df2 = DataFrame(np.random.randn(4, 4),
                    index=index[:4], columns=columns)
    correls = df1.corrwith(df2, axis=1)
    for label in index[:4]:
        row_corr = df1.loc[label].corr(df2.loc[label])
        tm.assert_almost_equal(correls[label], row_corr)
def test_corrwith_with_objects(self):
    """An extra string column does not change the ``corrwith`` result."""
    df1 = tm.makeTimeDataFrame()
    df2 = tm.makeTimeDataFrame()
    cols = ['A', 'B', 'C', 'D']

    df1['obj'] = 'foo'
    df2['obj'] = 'bar'

    numeric1 = df1.loc[:, cols]
    numeric2 = df2.loc[:, cols]

    result = df1.corrwith(df2)
    expected = numeric1.corrwith(numeric2)
    tm.assert_series_equal(result, expected)

    result = df1.corrwith(df2, axis=1)
    expected = numeric1.corrwith(numeric2, axis=1)
    tm.assert_series_equal(result, expected)
def test_corrwith_series(self, datetime_frame):
    """``corrwith`` a Series equals applying ``Series.corr`` per column."""
    column_a = datetime_frame['A']
    result = datetime_frame.corrwith(column_a)
    expected = datetime_frame.apply(column_a.corr)

    tm.assert_series_equal(result, expected)
def test_corrwith_matches_corrcoef(self):
    """``corrwith`` agrees with ``np.corrcoef`` on a monotone pair."""
    base = np.arange(10000)
    df1 = DataFrame(base, columns=['a'])
    df2 = DataFrame(base ** 2, columns=['a'])

    from_pandas = df1.corrwith(df2)['a']
    from_numpy = np.corrcoef(df1['a'], df2['a'])[0][1]

    tm.assert_almost_equal(from_pandas, from_numpy)
    assert from_pandas < 1
def test_corrwith_mixed_dtypes(self):
    """``corrwith`` a Series skips the string column of a mixed frame."""
    # GH 18570
    df = pd.DataFrame({'a': [1, 4, 3, 2], 'b': [4, 6, 7, 3],
                       'c': ['a', 'b', 'c', 'd']})
    other = pd.Series([0, 6, 7, 3])

    result = df.corrwith(other)

    numeric_cols = ['a', 'b']
    corrs = [df[col].corr(other) for col in numeric_cols]
    expected = pd.Series(data=corrs, index=numeric_cols)
    tm.assert_series_equal(result, expected)
def test_corrwith_index_intersection(self):
    """With ``drop=True`` the result is indexed by the shared columns."""
    left = pd.DataFrame(np.random.random(size=(10, 2)),
                        columns=["a", "b"])
    right = pd.DataFrame(np.random.random(size=(10, 3)),
                         columns=["a", "b", "c"])

    correlations = left.corrwith(right, drop=True)
    result = correlations.index.sort_values()
    expected = left.columns.intersection(right.columns).sort_values()
    tm.assert_index_equal(result, expected)
def test_corrwith_index_union(self):
    """With ``drop=False`` the result is indexed by the union of columns."""
    left = pd.DataFrame(np.random.random(size=(10, 2)),
                        columns=["a", "b"])
    right = pd.DataFrame(np.random.random(size=(10, 3)),
                         columns=["a", "b", "c"])

    correlations = left.corrwith(right, drop=False)
    result = correlations.index.sort_values()
    expected = left.columns.union(right.columns).sort_values()
    tm.assert_index_equal(result, expected)
def test_corrwith_dup_cols(self):
    """``corrwith`` copes with a duplicated column label in ``other``."""
    # GH 21925
    ramp = np.arange(10)
    df1 = pd.DataFrame(np.vstack([ramp, ramp, ramp]).T)
    other = df1.copy()
    # append column 0 again so that label 0 appears twice
    other = pd.concat((other, other[0]), axis=1)

    result = df1.corrwith(other)
    expected = pd.Series(np.ones(4), index=[0, 0, 1, 2])
    tm.assert_series_equal(result, expected)
@td.skip_if_no_scipy
def test_corrwith_spearman(self):
    """Spearman ``corrwith`` of a frame and its square is all ones."""
    # GH 21925
    df = pd.DataFrame(np.random.random(size=(100, 3)))
    squared = df ** 2

    result = df.corrwith(squared, method="spearman")
    expected = Series(np.ones(len(result)))
    tm.assert_series_equal(result, expected)
@td.skip_if_no_scipy
def test_corrwith_kendall(self):
    """Kendall ``corrwith`` of a frame and its square is all ones."""
    # GH 21925
    df = pd.DataFrame(np.random.random(size=(100, 3)))
    squared = df ** 2

    result = df.corrwith(squared, method="kendall")
    expected = Series(np.ones(len(result)))
    tm.assert_series_equal(result, expected)
def test_bool_describe_in_mixed_frame(self):
    """``describe`` on a string/bool/int frame, default and bool-only."""
    df = DataFrame({
        'string_data': ['a', 'b', 'c', 'd', 'e'],
        'bool_data': [True, True, False, False, False],
        'int_data': [10, 20, 30, 40, 50],
    })
    numeric_stats = ['count', 'mean', 'std', 'min', '25%',
                     '50%', '75%', 'max']
    object_stats = ['count', 'unique', 'top', 'freq']

    # Integer data are included in .describe() output,
    # Boolean and string data are not.
    result = df.describe()
    int_summary = [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]
    expected = DataFrame({'int_data': int_summary}, index=numeric_stats)
    tm.assert_frame_equal(result, expected)

    # Top value is a boolean value that is False
    result = df.describe(include=['bool'])
    expected = DataFrame({'bool_data': [5, 2, False, 3]},
                         index=object_stats)
    tm.assert_frame_equal(result, expected)
def test_describe_bool_frame(self):
    """``describe`` for bool-only, bool+int and bool+string frames."""
    # GH 13891
    numeric_stats = ['count', 'mean', 'std', 'min', '25%',
                     '50%', '75%', 'max']
    object_stats = ['count', 'unique', 'top', 'freq']

    # only boolean columns
    df = pd.DataFrame({
        'bool_data_1': [False, False, True, True],
        'bool_data_2': [False, True, True, True]
    })
    result = df.describe()
    expected = DataFrame({'bool_data_1': [4, 2, True, 2],
                          'bool_data_2': [4, 2, True, 3]},
                         index=object_stats)
    tm.assert_frame_equal(result, expected)

    # boolean next to integer: only the integer column is summarised
    df = pd.DataFrame({
        'bool_data': [False, False, True, True, False],
        'int_data': [0, 1, 2, 3, 4]
    })
    result = df.describe()
    int_summary = [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]
    expected = DataFrame({'int_data': int_summary}, index=numeric_stats)
    tm.assert_frame_equal(result, expected)

    # boolean next to string: both columns are summarised
    df = pd.DataFrame({
        'bool_data': [False, False, True, True],
        'str_data': ['a', 'b', 'c', 'a']
    })
    result = df.describe()
    expected = DataFrame({'bool_data': [4, 2, True, 2],
                          'str_data': [4, 3, 'a', 2]},
                         index=object_stats)
    tm.assert_frame_equal(result, expected)
def test_describe_categorical(self):
    """``describe`` behaviour for categorical columns and Series."""
    df = DataFrame({'value': np.random.randint(0, 10000, 100)})
    labels = ["{0} - {1}".format(lo, lo + 499)
              for lo in range(0, 10000, 500)]
    cat_labels = Categorical(labels, labels)

    df = df.sort_values(by=['value'], ascending=True)
    bin_edges = range(0, 10500, 500)
    df['value_group'] = pd.cut(df.value, bin_edges,
                               right=False, labels=cat_labels)
    frame_with_cat = df

    # Categoricals should not show up together with numerical columns
    result = frame_with_cat.describe()
    assert len(result.columns) == 1

    # In a frame, describe() for the cat should be the same as for string
    # arrays (count, unique, top, freq)

    ordered_cat = Categorical(["a", "b", "b", "b"],
                              categories=['a', 'b', 'c'],
                              ordered=True)
    result = Series(ordered_cat).describe()
    expected = Series([4, 2, "b", 3],
                      index=['count', 'unique', 'top', 'freq'])
    tm.assert_series_equal(result, expected)

    cat_series = Series(Categorical(["a", "b", "c", "c"]))
    df3 = DataFrame({"cat": cat_series, "s": ["a", "b", "c", "c"]})
    result = df3.describe()
    tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
def test_describe_categorical_columns(self):
    """``describe`` keeps a CategoricalIndex as the column index."""
    # GH 11558
    columns = pd.CategoricalIndex(['int1', 'int2', 'obj'],
                                  ordered=True, name='XXX')
    df = DataFrame({'int1': [10, 20, 30, 40, 50],
                    'int2': [10, 20, 30, 40, 50],
                    'obj': ['A', 0, None, 'X', 1]},
                   columns=columns)
    result = df.describe()

    exp_columns = pd.CategoricalIndex(['int1', 'int2'],
                                      categories=['int1', 'int2', 'obj'],
                                      ordered=True, name='XXX')
    stats = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
    int1_summary = [5, 30, df.int1.std(), 10, 20, 30, 40, 50]
    int2_summary = [5, 30, df.int2.std(), 10, 20, 30, 40, 50]
    expected = DataFrame({'int1': int1_summary, 'int2': int2_summary},
                         index=stats, columns=exp_columns)

    tm.assert_frame_equal(result, expected)
    tm.assert_categorical_equal(result.columns.values,
                                expected.columns.values)
def test_describe_datetime_columns(self):
    """``describe`` keeps freq and tz of a DatetimeIndex column index."""
    columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'],
                               freq='MS', tz='US/Eastern', name='XXX')
    df = DataFrame({0: [10, 20, 30, 40, 50],
                    1: [10, 20, 30, 40, 50],
                    2: ['A', 0, None, 'X', 1]})
    df.columns = columns
    result = df.describe()

    exp_columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01'],
                                   freq='MS', tz='US/Eastern', name='XXX')
    stats = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
    first_summary = [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50]
    second_summary = [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50]
    expected = DataFrame({0: first_summary, 1: second_summary},
                         index=stats)
    expected.columns = exp_columns

    tm.assert_frame_equal(result, expected)
    assert result.columns.freq == 'MS'
    assert result.columns.tz == expected.columns.tz
def test_describe_timedelta_values(self):
    """``describe`` on timedelta columns: summary values and their repr."""
    # GH 6145
    t1 = pd.timedelta_range('1 days', freq='D', periods=5)
    t2 = pd.timedelta_range('1 hours', freq='H', periods=5)
    df = pd.DataFrame({'t1': t1, 't2': t2})

    expected = DataFrame({'t1': [5, pd.Timedelta('3 days'),
                                 df.iloc[:, 0].std(),
                                 pd.Timedelta('1 days'),
                                 pd.Timedelta('2 days'),
                                 pd.Timedelta('3 days'),
                                 pd.Timedelta('4 days'),
                                 pd.Timedelta('5 days')],
                          't2': [5, pd.Timedelta('3 hours'),
                                 df.iloc[:, 1].std(),
                                 pd.Timedelta('1 hours'),
                                 pd.Timedelta('2 hours'),
                                 pd.Timedelta('3 hours'),
                                 pd.Timedelta('4 hours'),
                                 pd.Timedelta('5 hours')]},
                         index=['count', 'mean', 'std', 'min', '25%',
                                '50%', '75%', 'max'])

    result = df.describe()
    tm.assert_frame_equal(result, expected)

    # A frame repr is column-aligned, so the expected text cannot be
    # written with single spaces between cells. Lay it out explicitly:
    # index labels left-justified to the widest label ('count', 5 chars),
    # each value column right-justified to its widest cell (the 22-char
    # 'std' entries), with two spaces between columns.
    rows = [
        ('', 't1', 't2'),
        ('count', '5', '5'),
        ('mean', '3 days 00:00:00', '0 days 03:00:00'),
        ('std', '1 days 13:56:50.394919', '0 days 01:34:52.099788'),
        ('min', '1 days 00:00:00', '0 days 01:00:00'),
        ('25%', '2 days 00:00:00', '0 days 02:00:00'),
        ('50%', '3 days 00:00:00', '0 days 03:00:00'),
        ('75%', '4 days 00:00:00', '0 days 04:00:00'),
        ('max', '5 days 00:00:00', '0 days 05:00:00'),
    ]
    exp_repr = "\n".join("{:<5}  {:>22}  {:>22}".format(*row)
                         for row in rows)
    assert repr(result) == exp_repr
def test_describe_tz_values(self, tz_naive_fixture):
    """``describe(include='all')`` on an int plus a datetime column."""
    # GH 21332
    tz = tz_naive_fixture
    start = Timestamp(2018, 1, 1)
    end = Timestamp(2018, 1, 5)
    s1 = Series(range(5))
    s2 = Series(date_range(start, end, tz=tz))
    df = pd.DataFrame({'s1': s1, 's2': s2})

    nan = np.nan
    stats = ['count', 'unique', 'top', 'freq', 'first',
             'last', 'mean', 'std', 'min', '25%', '50%',
             '75%', 'max']
    # numeric column: the object-style rows stay empty
    s1_summary = [5, nan, nan, nan, nan, nan,
                  2, 1.581139, 0, 1, 2, 3, 4]
    # datetime column: the numeric-style rows stay empty
    s2_summary = [5, 5, s2.value_counts().index[0], 1,
                  start.tz_localize(tz),
                  end.tz_localize(tz), nan, nan,
                  nan, nan, nan, nan, nan]
    expected = DataFrame({'s1': s1_summary, 's2': s2_summary},
                         index=stats)

    result = df.describe(include='all')
    tm.assert_frame_equal(result, expected)
def test_reduce_mixed_frame(self):
    """``sum`` over a bool/int/string frame reduces each column."""
    # GH 6806
    df = DataFrame({
        'bool_data': [True, True, False, False, False],
        'int_data': [10, 20, 30, 40, 50],
        'string_data': ['a', 'b', 'c', 'd', 'e'],
    })
    # reindex returns a new frame, so the result has to be kept for the
    # column order to be pinned before comparing positional values
    df = df.reindex(columns=['bool_data', 'int_data', 'string_data'])
    test = df.sum(axis=0)
    tm.assert_numpy_array_equal(test.values,
                                np.array([2, 150, 'abcde'], dtype=object))
    tm.assert_series_equal(test, df.T.sum(axis=1))
def test_count(self, float_frame_with_na, float_frame, float_string_frame):
    """``DataFrame.count`` against a notna-sum, plus empty-frame corners."""
    def count_notna(s):
        return notna(s).sum()

    assert_stat_op_calc('count', count_notna, float_frame_with_na,
                        has_skipna=False, check_dtype=False,
                        check_dates=True)
    assert_stat_op_api('count', float_frame, float_string_frame,
                       has_numeric_only=True)

    # corner case
    empty = DataFrame()
    by_row = empty.count(1)
    assert isinstance(by_row, Series)

    by_col = empty.count(0)
    assert isinstance(by_col, Series)

    # GH 423
    rows_only = DataFrame(index=lrange(10))
    result = rows_only.count(1)
    expected = Series(0, index=rows_only.index)
    tm.assert_series_equal(result, expected)

    cols_only = DataFrame(columns=lrange(10))
    result = cols_only.count(0)
    expected = Series(0, index=cols_only.columns)
    tm.assert_series_equal(result, expected)

    result = DataFrame().count()
    expected = Series(0, index=[])
    tm.assert_series_equal(result, expected)
def test_nunique(self, float_frame_with_na, float_frame,
                 float_string_frame):
    """``DataFrame.nunique`` on both axes, with and without ``dropna``."""
    def distinct_non_null(s):
        return len(algorithms.unique1d(s.dropna()))

    assert_stat_op_calc('nunique', distinct_non_null, float_frame_with_na,
                        has_skipna=False, check_dtype=False,
                        check_dates=True)
    assert_stat_op_api('nunique', float_frame, float_string_frame)

    df = DataFrame({'A': [1, 1, 1],
                    'B': [1, 2, 3],
                    'C': [1, np.nan, 3]})

    per_column = Series({'A': 1, 'B': 3, 'C': 2})
    tm.assert_series_equal(df.nunique(), per_column)

    per_column_with_na = Series({'A': 1, 'B': 3, 'C': 3})
    tm.assert_series_equal(df.nunique(dropna=False), per_column_with_na)

    per_row = Series({0: 1, 1: 2, 2: 2})
    tm.assert_series_equal(df.nunique(axis=1), per_row)

    per_row_with_na = Series({0: 1, 1: 3, 2: 2})
    tm.assert_series_equal(df.nunique(axis=1, dropna=False),
                           per_row_with_na)
def test_sum(self, float_frame_with_na, mixed_float_frame,
             float_frame, float_string_frame):
    """``DataFrame.sum``: API smoke test and comparison with numpy."""
    assert_stat_op_api('sum', float_frame, float_string_frame,
                       has_numeric_only=True)
    assert_stat_op_calc('sum', np.sum, float_frame_with_na,
                        skipna_alternative=np.nansum)

    # mixed types (with upcasting happening)
    as_float32 = mixed_float_frame.astype('float32')
    assert_stat_op_calc('sum', np.sum, as_float32,
                        check_dtype=False, check_less_precise=True)
@pytest.mark.parametrize('method', ['sum', 'mean', 'prod', 'var',
                                    'std', 'skew', 'min', 'max'])
def test_stat_operators_attempt_obj_array(self, method):
    """Row-wise reductions run on object-dtype frames holding numbers.

    Every method is run on the object frame and on its float64 cast; the
    two results are compared only for 'sum' and 'prod'.
    """
    # GH 676
    data = {
        'a': [-0.00049987540199591344, -0.0016467257772919831,
              0.00067695870775883013],
        'b': [-0, -0, 0.0],
        'c': [0.00031111847529610595, 0.0014902627951905339,
              -0.00094099200035979691]
    }
    df1 = DataFrame(data, index=['foo', 'bar', 'baz'], dtype='O')

    df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3],
                     2: [np.nan, 4]}, dtype=object)

    for obj_frame in (df1, df2):
        assert obj_frame.values.dtype == np.object_
        result = getattr(obj_frame, method)(1)
        expected = getattr(obj_frame.astype('f8'), method)(1)

        if method in ['sum', 'prod']:
            tm.assert_series_equal(result, expected)
def test_mean(self, float_frame_with_na, float_frame, float_string_frame):
    """``DataFrame.mean``: comparison with ``np.mean`` and API smoke test."""
    assert_stat_op_calc(
        'mean', np.mean, float_frame_with_na, check_dates=True)
    assert_stat_op_api(
        'mean', float_frame, float_string_frame)
@pytest.mark.parametrize('tz', [None, 'UTC'])
def test_mean_mixed_datetime_numeric(self, tz):
    """``mean`` of an int plus a datetime column reports only the int."""
    # https://github.com/pandas-dev/pandas/issues/24752
    stamps = [pd.Timestamp('2000', tz=tz)] * 2
    df = pd.DataFrame({"A": [1, 1], "B": stamps})

    result = df.mean()

    expected = pd.Series([1.0], index=['A'])
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize('tz', [None, 'UTC'])
def test_mean_excludeds_datetimes(self, tz):
    """``mean`` of a datetime-only frame is an empty Series."""
    # https://github.com/pandas-dev/pandas/issues/24752
    # Our long-term desired behavior is unclear, but the behavior in
    # 0.24.0rc1 was buggy.
    stamps = [pd.Timestamp('2000', tz=tz)] * 2
    df = pd.DataFrame({"A": stamps})

    result = df.mean()

    expected = pd.Series()
    tm.assert_series_equal(result, expected)
def test_product(self, float_frame_with_na, float_frame,
                 float_string_frame):
    """``DataFrame.product``: comparison with ``np.prod`` and API check."""
    assert_stat_op_calc(
        'product', np.prod, float_frame_with_na)
    assert_stat_op_api(
        'product', float_frame, float_string_frame)
# TODO: Ensure warning isn't emitted in the first place
@pytest.mark.filterwarnings("ignore:All-NaN:RuntimeWarning")
def test_median(self, float_frame_with_na, float_frame,
                float_string_frame):
    """``DataFrame.median`` against a NaN-propagating ``np.median``."""
    def nan_propagating_median(x):
        # any missing value makes the reference result NaN
        return np.nan if isna(x).any() else np.median(x)

    assert_stat_op_calc('median', nan_propagating_median,
                        float_frame_with_na, check_dates=True)
    assert_stat_op_api('median', float_frame, float_string_frame)
def test_min(self, float_frame_with_na, int_frame,
             float_frame, float_string_frame):
    """``DataFrame.min`` against ``np.min`` on float and int frames."""
    # RuntimeWarnings raised while reducing the NaN frame are ignored
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("ignore", RuntimeWarning)
        assert_stat_op_calc(
            'min', np.min, float_frame_with_na, check_dates=True)

    assert_stat_op_calc('min', np.min, int_frame)
    assert_stat_op_api('min', float_frame, float_string_frame)
def test_cummin(self, datetime_frame):
    """``DataFrame.cummin`` equals a per-Series ``cummin`` on both axes."""
    # The integers are positions, not labels (the fixture's columns are
    # addressed as 'A', 'B', ... elsewhere in this file), so the NaN
    # bands have to be written with .iloc rather than .loc.
    datetime_frame.iloc[5:10, 0] = np.nan
    datetime_frame.iloc[10:15, 1] = np.nan
    datetime_frame.iloc[15:, 2] = np.nan

    # axis = 0
    cummin = datetime_frame.cummin()
    expected = datetime_frame.apply(Series.cummin)
    tm.assert_frame_equal(cummin, expected)

    # axis = 1
    cummin = datetime_frame.cummin(axis=1)
    expected = datetime_frame.apply(Series.cummin, axis=1)
    tm.assert_frame_equal(cummin, expected)

    # it works
    df = DataFrame({'A': np.arange(20)}, index=np.arange(20))
    result = df.cummin()  # noqa

    # fix issue
    cummin_xs = datetime_frame.cummin(axis=1)
    assert np.shape(cummin_xs) == np.shape(datetime_frame)
def test_cummax(self, datetime_frame):
    """``DataFrame.cummax`` equals a per-Series ``cummax`` on both axes."""
    # The integers are positions, not labels (the fixture's columns are
    # addressed as 'A', 'B', ... elsewhere in this file), so the NaN
    # bands have to be written with .iloc rather than .loc.
    datetime_frame.iloc[5:10, 0] = np.nan
    datetime_frame.iloc[10:15, 1] = np.nan
    datetime_frame.iloc[15:, 2] = np.nan

    # axis = 0
    cummax = datetime_frame.cummax()
    expected = datetime_frame.apply(Series.cummax)
    tm.assert_frame_equal(cummax, expected)

    # axis = 1
    cummax = datetime_frame.cummax(axis=1)
    expected = datetime_frame.apply(Series.cummax, axis=1)
    tm.assert_frame_equal(cummax, expected)

    # it works
    df = DataFrame({'A': np.arange(20)}, index=np.arange(20))
    result = df.cummax()  # noqa

    # fix issue
    cummax_xs = datetime_frame.cummax(axis=1)
    assert np.shape(cummax_xs) == np.shape(datetime_frame)
def test_max(self, float_frame_with_na, int_frame,
             float_frame, float_string_frame):
    """``DataFrame.max`` against ``np.max`` on float and int frames."""
    # RuntimeWarnings raised while reducing the NaN frame are ignored
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("ignore", RuntimeWarning)
        assert_stat_op_calc(
            'max', np.max, float_frame_with_na, check_dates=True)

    assert_stat_op_calc('max', np.max, int_frame)
    assert_stat_op_api('max', float_frame, float_string_frame)
def test_mad(self, float_frame_with_na, float_frame, float_string_frame):
    """``DataFrame.mad`` against a hand-written mean absolute deviation."""
    def mean_abs_deviation(x):
        return np.abs(x - x.mean()).mean()

    assert_stat_op_calc('mad', mean_abs_deviation, float_frame_with_na)
    assert_stat_op_api('mad', float_frame, float_string_frame)
def test_var_std(self, float_frame_with_na, datetime_frame, float_frame,
                 float_string_frame):
    """``var``/``std`` against numpy, custom ddof, and non-negativity."""
    def sample_var(x):
        return np.var(x, ddof=1)

    def sample_std(x):
        return np.std(x, ddof=1)

    assert_stat_op_calc('var', sample_var, float_frame_with_na)
    assert_stat_op_api('var', float_frame, float_string_frame)

    assert_stat_op_calc('std', sample_std, float_frame_with_na)
    assert_stat_op_api('std', float_frame, float_string_frame)

    # a non-default ddof must match the column-wise Series result
    result = datetime_frame.std(ddof=4)
    expected = datetime_frame.apply(lambda col: col.std(ddof=4))
    tm.assert_almost_equal(result, expected)

    result = datetime_frame.var(ddof=4)
    expected = datetime_frame.apply(lambda col: col.var(ddof=4))
    tm.assert_almost_equal(result, expected)

    # 1000 copies of one random row: the variance must never be negative
    arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
    result = nanops.nanvar(arr, axis=0)
    assert not (result < 0).any()

    with pd.option_context('use_bottleneck', False):
        result = nanops.nanvar(arr, axis=0)
        assert not (result < 0).any()
@pytest.mark.parametrize(
    "meth", ['sem', 'var', 'std'])
def test_numeric_only_flag(self, meth):
    """``numeric_only`` drops a string-tainted column, or raises if False."""
    # GH 9201
    names = ['foo', 'bar', 'baz']

    df1 = DataFrame(np.random.randn(5, 3), columns=names)
    # set one entry to a number in str format
    df1.loc[0, 'foo'] = '100'

    df2 = DataFrame(np.random.randn(5, 3), columns=names)
    # set one entry to a non-number str
    df2.loc[0, 'foo'] = 'a'

    for frame in (df1, df2):
        result = getattr(frame, meth)(axis=1, numeric_only=True)
        expected = getattr(frame[['bar', 'baz']], meth)(axis=1)
        tm.assert_series_equal(expected, result)

    # df1 has all numbers, df2 has a letter inside
    for frame in (df1, df2):
        with pytest.raises(TypeError):
            getattr(frame, meth)(axis=1, numeric_only=False)
@pytest.mark.parametrize('op', ['mean', 'std', 'var',
                                'skew', 'kurt', 'sem'])
def test_mixed_ops(self, op):
    """Reductions on an int/float/str frame return two entries."""
    # GH 16116
    df = DataFrame({'int': [1, 2, 3, 4],
                    'float': [1., 2., 3., 4.],
                    'str': ['a', 'b', 'c', 'd']})

    reduced = getattr(df, op)()
    assert len(reduced) == 2

    # same expectation with bottleneck switched off
    with pd.option_context('use_bottleneck', False):
        reduced = getattr(df, op)()
        assert len(reduced) == 2
def test_cumsum(self, datetime_frame):
    """``DataFrame.cumsum`` equals a per-Series ``cumsum`` on both axes."""
    # The integers are positions, not labels (the fixture's columns are
    # addressed as 'A', 'B', ... elsewhere in this file), so the NaN
    # bands have to be written with .iloc rather than .loc.
    datetime_frame.iloc[5:10, 0] = np.nan
    datetime_frame.iloc[10:15, 1] = np.nan
    datetime_frame.iloc[15:, 2] = np.nan

    # axis = 0
    cumsum = datetime_frame.cumsum()
    expected = datetime_frame.apply(Series.cumsum)
    tm.assert_frame_equal(cumsum, expected)

    # axis = 1
    cumsum = datetime_frame.cumsum(axis=1)
    expected = datetime_frame.apply(Series.cumsum, axis=1)
    tm.assert_frame_equal(cumsum, expected)

    # works
    df = DataFrame({'A': np.arange(20)}, index=np.arange(20))
    result = df.cumsum()  # noqa

    # fix issue
    cumsum_xs = datetime_frame.cumsum(axis=1)
    assert np.shape(cumsum_xs) == np.shape(datetime_frame)
def test_cumprod(self, datetime_frame):
    """``DataFrame.cumprod`` equals a per-Series ``cumprod`` on both axes."""
    # The integers are positions, not labels (the fixture's columns are
    # addressed as 'A', 'B', ... elsewhere in this file), so the NaN
    # bands have to be written with .iloc rather than .loc.
    datetime_frame.iloc[5:10, 0] = np.nan
    datetime_frame.iloc[10:15, 1] = np.nan
    datetime_frame.iloc[15:, 2] = np.nan

    # axis = 0
    cumprod = datetime_frame.cumprod()
    expected = datetime_frame.apply(Series.cumprod)
    tm.assert_frame_equal(cumprod, expected)

    # axis = 1
    cumprod = datetime_frame.cumprod(axis=1)
    expected = datetime_frame.apply(Series.cumprod, axis=1)
    tm.assert_frame_equal(cumprod, expected)

    # fix issue
    cumprod_xs = datetime_frame.cumprod(axis=1)
    assert np.shape(cumprod_xs) == np.shape(datetime_frame)

    # ints (smoke test only)
    df = datetime_frame.fillna(0).astype(int)
    df.cumprod(0)
    df.cumprod(1)

    # ints32 (smoke test only)
    df = datetime_frame.fillna(0).astype(np.int32)
    df.cumprod(0)
    df.cumprod(1)
def test_sem(self, float_frame_with_na, datetime_frame,
             float_frame, float_string_frame):
    """Check DataFrame.sem and that nanops.nansem is never negative."""
    alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
    assert_stat_op_calc('sem', alt, float_frame_with_na)
    assert_stat_op_api('sem', float_frame, float_string_frame)

    observed = datetime_frame.sem(ddof=4)
    reference = datetime_frame.apply(
        lambda col: col.std(ddof=4) / np.sqrt(len(col)))
    tm.assert_almost_equal(observed, reference)

    # one random row repeated 1000 times: each column is constant
    constant_cols = np.repeat(np.random.random((1, 1000)), 1000, 0)
    sem_values = nanops.nansem(constant_cols, axis=0)
    assert not (sem_values < 0).any()

    with pd.option_context('use_bottleneck', False):
        sem_values = nanops.nansem(constant_cols, axis=0)
        assert not (sem_values < 0).any()
@td.skip_if_no_scipy
def test_skew(self, float_frame_with_na, float_frame, float_string_frame):
    """Check DataFrame.skew against scipy.stats.skew (bias=False)."""
    from scipy.stats import skew

    def alt(x):
        # fewer than three observations: the reference value is NaN
        return np.nan if len(x) < 3 else skew(x, bias=False)

    assert_stat_op_calc('skew', alt, float_frame_with_na)
    assert_stat_op_api('skew', float_frame, float_string_frame)
@td.skip_if_no_scipy
def test_kurt(self, float_frame_with_na, float_frame, float_string_frame):
    """Check DataFrame.kurt against scipy and across a MultiIndex level."""
    from scipy.stats import kurtosis

    def alt(x):
        # fewer than four observations: the reference value is NaN
        return np.nan if len(x) < 4 else kurtosis(x, bias=False)

    assert_stat_op_calc('kurt', alt, float_frame_with_na)
    assert_stat_op_api('kurt', float_frame, float_string_frame)

    levels = [['bar'], ['one', 'two', 'three'], [0, 1]]
    codes = [[0, 0, 0, 0, 0, 0],
             [0, 1, 2, 0, 1, 2],
             [0, 1, 0, 1, 0, 1]]
    index = MultiIndex(levels=levels, codes=codes)
    df = DataFrame(np.random.randn(6, 3), index=index)

    # level 0 has the single label 'bar', so both results cover all rows
    overall = df.kurt()
    by_level = df.kurt(level=0).xs('bar')
    tm.assert_series_equal(overall, by_level, check_names=False)
    assert overall.name is None
    assert by_level.name == 'bar'
@pytest.mark.parametrize("dropna, expected", [
    (True, {'A': [12],
            'B': [10.0],
            'C': [1.0],
            'D': ['a'],
            'E': Categorical(['a'], categories=['a']),
            'F': to_datetime(['2000-1-2']),
            'G': to_timedelta(['1 days'])}),
    (False, {'A': [12],
             'B': [10.0],
             'C': [np.nan],
             'D': np.array([np.nan], dtype=object),
             'E': Categorical([np.nan], categories=['a']),
             'F': [pd.NaT],
             'G': to_timedelta([pd.NaT])}),
    (True, {'H': [8, 9, np.nan, np.nan],
            'I': [8, 9, np.nan, np.nan],
            'J': [1, np.nan, np.nan, np.nan],
            'K': Categorical(['a', np.nan, np.nan, np.nan],
                             categories=['a']),
            'L': to_datetime(['2000-1-2', 'NaT', 'NaT', 'NaT']),
            'M': to_timedelta(['1 days', 'nan', 'nan', 'nan']),
            'N': [0, 1, 2, 3]}),
    (False, {'H': [8, 9, np.nan, np.nan],
             'I': [8, 9, np.nan, np.nan],
             'J': [1, np.nan, np.nan, np.nan],
             'K': Categorical([np.nan, 'a', np.nan, np.nan],
                              categories=['a']),
             'L': to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
             'M': to_timedelta(['nan', '1 days', 'nan', 'nan']),
             'N': [0, 1, 2, 3]})
])
def test_mode_dropna(self, dropna, expected):
    """Check DataFrame.mode(dropna=...) on the columns named in expected."""
    source = DataFrame({"A": [12, 12, 19, 11],
                        "B": [10, 10, np.nan, 3],
                        "C": [1, np.nan, np.nan, np.nan],
                        "D": [np.nan, np.nan, 'a', np.nan],
                        "E": Categorical([np.nan, np.nan, 'a', np.nan]),
                        "F": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
                        "G": to_timedelta(['1 days', 'nan', 'nan', 'nan']),
                        "H": [8, 8, 9, 9],
                        "I": [9, 9, 8, 8],
                        "J": [1, 1, np.nan, np.nan],
                        "K": Categorical(['a', np.nan, 'a', np.nan]),
                        "L": to_datetime(['2000-1-2', '2000-1-2',
                                          'NaT', 'NaT']),
                        "M": to_timedelta(['1 days', 'nan',
                                           '1 days', 'nan']),
                        "N": np.arange(4, dtype='int64')})

    # only the columns this parametrization has expectations for
    wanted_columns = sorted(expected)
    result = source[wanted_columns].mode(dropna=dropna)
    tm.assert_frame_equal(result, DataFrame(expected))
@pytest.mark.skipif(not compat.PY3, reason="only PY3")
def test_mode_sortwarning(self):
    """Expect a UserWarning from mode() when the modes cannot be sorted."""
    frame = DataFrame({"A": [np.nan, np.nan, 'a', 'a']})
    expected = DataFrame({'A': ['a', np.nan]})

    with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
        modes = frame.mode(dropna=False)
        # put the rows into a fixed order before comparing
        modes = modes.sort_values(by='A').reset_index(drop=True)

    tm.assert_frame_equal(modes, expected)
def test_operators_timedelta64(self):
    """Exercise min / max / abs on timedelta64 columns, alone and mixed."""
    dates = DataFrame(dict(A=date_range('2012-1-1', periods=3, freq='D'),
                           B=date_range('2012-1-2', periods=3, freq='D'),
                           C=Timestamp('20120101') -
                           timedelta(minutes=5, seconds=5)))

    diffs = DataFrame(dict(A=dates['A'] - dates['C'],
                           B=dates['A'] - dates['B']))

    # min
    col_min = diffs.min()
    assert col_min[0] == diffs.loc[0, 'A']
    assert col_min[1] == diffs.loc[0, 'B']

    row_min = diffs.min(axis=1)
    assert (row_min == diffs.loc[0, 'B']).all()

    # max
    col_max = diffs.max()
    assert col_max[0] == diffs.loc[2, 'A']
    assert col_max[1] == diffs.loc[2, 'B']

    row_max = diffs.max(axis=1)
    assert (row_max == diffs['A']).all()

    # abs: method and builtin must agree
    via_method = diffs.abs()
    via_builtin = abs(diffs)
    expected = DataFrame(dict(A=dates['A'] - dates['C'],
                              B=dates['B'] - dates['A']))
    tm.assert_frame_equal(via_method, expected)
    tm.assert_frame_equal(via_builtin, expected)

    # mixed frame
    mixed = diffs.copy()
    extra_columns = (('C', 'foo'), ('D', 1), ('E', 1.),
                     ('F', Timestamp('20130101')))
    for label, value in extra_columns:
        mixed[label] = value

    # results in an object array
    result = mixed.min()
    expected = Series([pd.Timedelta(timedelta(seconds=5 * 60 + 5)),
                       pd.Timedelta(timedelta(days=-1)),
                       'foo', 1, 1.0,
                       Timestamp('20130101')],
                      index=mixed.columns)
    tm.assert_series_equal(result, expected)

    # excludes numeric
    result = mixed.min(axis=1)
    expected = Series([1, 1, 1.], index=[0, 1, 2])
    tm.assert_series_equal(result, expected)

    # works when only those columns are selected
    timedelta_only = mixed[['A', 'B']]
    result = timedelta_only.min(1)
    expected = Series([timedelta(days=-1)] * 3)
    tm.assert_series_equal(result, expected)

    result = mixed[['A', 'B']].min()
    expected = Series([timedelta(seconds=5 * 60 + 5),
                       timedelta(days=-1)], index=['A', 'B'])
    tm.assert_series_equal(result, expected)

    # GH 3106
    times = DataFrame({'time': date_range('20130102', periods=5),
                       'time2': date_range('20130105', periods=5)})
    times['off1'] = times['time2'] - times['time']
    assert times['off1'].dtype == 'timedelta64[ns]'

    times['off2'] = times['time'] - times['time2']
    times._consolidate_inplace()
    # the dtypes must be unchanged after consolidation
    assert times['off1'].dtype == 'timedelta64[ns]'
    assert times['off2'].dtype == 'timedelta64[ns]'
- def test_sum_corner(self, empty_frame):
- axis0 = empty_frame.sum(0)
- axis1 = empty_frame.sum(1)
- assert isinstance(axis0, Series)
- assert isinstance(axis1, Series)
- assert len(axis0) == 0
- assert len(axis1) == 0
@pytest.mark.parametrize('method, unit', [
    ('sum', 0),
    ('prod', 1),
])
def test_sum_prod_nanops(self, method, unit):
    """Check ``min_count`` handling of DataFrame.sum / DataFrame.prod.

    Parameters
    ----------
    method : str
        Name of the reduction to call on the frame ('sum' or 'prod').
    unit : int
        Value used to fill the valid cells; every expectation below is
        either ``unit`` or NaN.
    """
    idx = ['a', 'b', 'c']
    df = pd.DataFrame({"a": [unit, unit],
                       "b": [unit, np.nan],
                       "c": [np.nan, np.nan]})

    # The default: expected to match the explicit min_count=0 case below
    result = getattr(df, method)()
    expected = pd.Series([unit, unit, unit], index=idx, dtype='float64')
    tm.assert_series_equal(result, expected)

    # min_count=1
    result = getattr(df, method)(min_count=1)
    expected = pd.Series([unit, unit, np.nan], index=idx)
    tm.assert_series_equal(result, expected)

    # min_count=0
    result = getattr(df, method)(min_count=0)
    expected = pd.Series([unit, unit, unit], index=idx, dtype='float64')
    tm.assert_series_equal(result, expected)

    result = getattr(df.iloc[1:], method)(min_count=1)
    expected = pd.Series([unit, np.nan, np.nan], index=idx)
    tm.assert_series_equal(result, expected)

    # min_count > 1: "A" has 10 valid values, "B" has 5 valid values
    df = pd.DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5})

    # both columns reach the threshold
    result = getattr(df, method)(min_count=5)
    expected = pd.Series([unit, unit], index=['A', 'B'], dtype='float64')
    tm.assert_series_equal(result, expected)

    # only "A" reaches the threshold
    result = getattr(df, method)(min_count=6)
    expected = pd.Series([unit, np.nan], index=['A', 'B'])
    tm.assert_series_equal(result, expected)
- def test_sum_nanops_timedelta(self):
- # prod isn't defined on timedeltas
- idx = ['a', 'b', 'c']
- df = pd.DataFrame({"a": [0, 0],
- "b": [0, np.nan],
- "c": [np.nan, np.nan]})
- df2 = df.apply(pd.to_timedelta)
- # 0 by default
- result = df2.sum()
- expected = pd.Series([0, 0, 0], dtype='m8[ns]', index=idx)
- tm.assert_series_equal(result, expected)
- # min_count=0
- result = df2.sum(min_count=0)
- tm.assert_series_equal(result, expected)
- # min_count=1
- result = df2.sum(min_count=1)
- expected = pd.Series([0, 0, np.nan], dtype='m8[ns]', index=idx)
- tm.assert_series_equal(result, expected)
- def test_sum_object(self, float_frame):
- values = float_frame.values.astype(int)
- frame = DataFrame(values, index=float_frame.index,
- columns=float_frame.columns)
- deltas = frame * timedelta(1)
- deltas.sum()
- def test_sum_bool(self, float_frame):
- # ensure this works, bug report
- bools = np.isnan(float_frame)
- bools.sum(1)
- bools.sum(0)
def test_mean_corner(self, float_frame, float_string_frame):
    """Check mean() on object-containing data and on a boolean column."""
    # unit test when have object data
    col_mean = float_string_frame.mean(axis=0)
    col_sum = float_string_frame.sum(axis=0, numeric_only=True)
    tm.assert_index_equal(col_sum.index, col_mean.index)
    assert len(col_mean.index) < len(float_string_frame.columns)

    # xs sum mixed type, just want to know it works...
    row_mean = float_string_frame.mean(axis=1)
    row_sum = float_string_frame.sum(axis=1, numeric_only=True)
    tm.assert_index_equal(row_sum.index, row_mean.index)

    # take mean of boolean column
    float_frame['bool'] = float_frame['A'] > 0
    col_means = float_frame.mean(0)
    bool_values = float_frame['bool'].values
    assert col_means['bool'] == bool_values.mean()
def test_stats_mixed_type(self, float_string_frame):
    """Row-wise statistics on a mixed-dtype frame must not raise."""
    # don't blow up
    for stat_name in ('std', 'var', 'mean', 'skew'):
        getattr(float_string_frame, stat_name)(1)
# TODO: Ensure warning isn't emitted in the first place
@pytest.mark.filterwarnings("ignore:All-NaN:RuntimeWarning")
def test_median_corner(self, int_frame, float_frame, float_string_frame):
    """Check DataFrame.median against a NaN-propagating np.median."""
    def wrapper(x):
        # any missing value makes the reference result NaN
        return np.nan if isna(x).any() else np.median(x)

    assert_stat_op_calc('median', wrapper, int_frame, check_dtype=False,
                        check_dates=True)
    assert_stat_op_api('median', float_frame, float_string_frame)
- # Miscellanea
def test_count_objects(self, float_string_frame):
    """Two frames built from the same ``_series`` data count identically."""
    dm, df = (DataFrame(float_string_frame._series) for _ in range(2))

    for count_args in ((), (1,)):
        tm.assert_series_equal(dm.count(*count_args),
                               df.count(*count_args))
def test_cumsum_corner(self):
    """cumsum on a 4x5 integer frame must run without raising."""
    values = np.arange(20).reshape(4, 5)
    dm = DataFrame(values, index=lrange(4), columns=lrange(5))
    # ?(wesm)
    dm.cumsum()
def test_sum_bools(self):
    """Row-wise sum of an isna mask counts the missing cells."""
    # one row, ten columns, no data
    empty_cells = DataFrame(index=lrange(1), columns=lrange(10))
    missing_mask = isna(empty_cells)
    row_counts = missing_mask.sum(axis=1)
    assert row_counts[0] == 10
- # Index of max / min
def test_idxmin(self, float_frame, int_frame):
    """Compare DataFrame.idxmin with Series.idxmin applied per axis."""
    frame = float_frame
    frame.loc[5:10] = np.nan
    frame.loc[15:20, -2:] = np.nan

    cases = [(skipna, axis, df)
             for skipna in [True, False]
             for axis in [0, 1]
             for df in [frame, int_frame]]
    for skipna, axis, df in cases:
        result = df.idxmin(axis=axis, skipna=skipna)
        expected = df.apply(Series.idxmin, axis=axis, skipna=skipna)
        tm.assert_series_equal(result, expected)

    # an axis number other than 0 or 1 is rejected
    with pytest.raises(ValueError):
        frame.idxmin(axis=2)
def test_idxmax(self, float_frame, int_frame):
    """Compare DataFrame.idxmax with Series.idxmax applied per axis."""
    frame = float_frame
    frame.loc[5:10] = np.nan
    frame.loc[15:20, -2:] = np.nan

    cases = [(skipna, axis, df)
             for skipna in [True, False]
             for axis in [0, 1]
             for df in [frame, int_frame]]
    for skipna, axis, df in cases:
        result = df.idxmax(axis=axis, skipna=skipna)
        expected = df.apply(Series.idxmax, axis=axis, skipna=skipna)
        tm.assert_series_equal(result, expected)

    # an axis number other than 0 or 1 is rejected
    with pytest.raises(ValueError):
        frame.idxmax(axis=2)
- # ----------------------------------------------------------------------
- # Logical reductions
@pytest.mark.parametrize('opname', ['any', 'all'])
def test_any_all(self, opname, bool_frame_with_na, float_string_frame):
    """Run the shared calc / api checks for the logical reductions."""
    # the NumPy function of the same name serves as the reference
    numpy_op = getattr(np, opname)
    assert_bool_op_calc(opname, numpy_op, bool_frame_with_na,
                        has_skipna=True)
    assert_bool_op_api(opname, bool_frame_with_na, float_string_frame,
                       has_bool_only=True)
- def test_any_all_extra(self):
- df = DataFrame({
- 'A': [True, False, False],
- 'B': [True, True, False],
- 'C': [True, True, True],
- }, index=['a', 'b', 'c'])
- result = df[['A', 'B']].any(1)
- expected = Series([True, True, False], index=['a', 'b', 'c'])
- tm.assert_series_equal(result, expected)
- result = df[['A', 'B']].any(1, bool_only=True)
- tm.assert_series_equal(result, expected)
- result = df.all(1)
- expected = Series([True, False, False], index=['a', 'b', 'c'])
- tm.assert_series_equal(result, expected)
- result = df.all(1, bool_only=True)
- tm.assert_series_equal(result, expected)
- # Axis is None
- result = df.all(axis=None).item()
- assert result is False
- result = df.any(axis=None).item()
- assert result is True
- result = df[['C']].all(axis=None).item()
- assert result is True
- def test_any_datetime(self):
- # GH 23070
- float_data = [1, np.nan, 3, np.nan]
- datetime_data = [pd.Timestamp('1960-02-15'),
- pd.Timestamp('1960-02-16'),
- pd.NaT,
- pd.NaT]
- df = DataFrame({
- "A": float_data,
- "B": datetime_data
- })
- result = df.any(1)
- expected = Series([True, True, True, False])
- tm.assert_series_equal(result, expected)
- def test_any_all_bool_only(self):
- # GH 25101
- df = DataFrame({"col1": [1, 2, 3],
- "col2": [4, 5, 6],
- "col3": [None, None, None]})
- result = df.all(bool_only=True)
- expected = Series(dtype=np.bool)
- tm.assert_series_equal(result, expected)
- df = DataFrame({"col1": [1, 2, 3],
- "col2": [4, 5, 6],
- "col3": [None, None, None],
- "col4": [False, False, True]})
- result = df.all(bool_only=True)
- expected = Series({"col4": False})
- tm.assert_series_equal(result, expected)
@pytest.mark.parametrize('func, data, expected', [
    (np.any, {}, False),
    (np.all, {}, True),
    (np.any, {'A': []}, False),
    (np.all, {'A': []}, True),
    (np.any, {'A': [False, False]}, False),
    (np.all, {'A': [False, False]}, False),
    (np.any, {'A': [True, False]}, True),
    (np.all, {'A': [True, False]}, False),
    (np.any, {'A': [True, True]}, True),
    (np.all, {'A': [True, True]}, True),

    (np.any, {'A': [False], 'B': [False]}, False),
    (np.all, {'A': [False], 'B': [False]}, False),

    (np.any, {'A': [False, False], 'B': [False, True]}, True),
    (np.all, {'A': [False, False], 'B': [False, True]}, False),

    # other types
    (np.all, {'A': pd.Series([0.0, 1.0], dtype='float')}, False),
    (np.any, {'A': pd.Series([0.0, 1.0], dtype='float')}, True),
    (np.all, {'A': pd.Series([0, 1], dtype=int)}, False),
    (np.any, {'A': pd.Series([0, 1], dtype=int)}, True),
    pytest.param(np.all, {'A': pd.Series([0, 1], dtype='M8[ns]')}, False,
                 marks=[td.skip_if_np_lt_115]),
    pytest.param(np.any, {'A': pd.Series([0, 1], dtype='M8[ns]')}, True,
                 marks=[td.skip_if_np_lt_115]),
    pytest.param(np.all, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True,
                 marks=[td.skip_if_np_lt_115]),
    pytest.param(np.any, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True,
                 marks=[td.skip_if_np_lt_115]),
    pytest.param(np.all, {'A': pd.Series([0, 1], dtype='m8[ns]')}, False,
                 marks=[td.skip_if_np_lt_115]),
    pytest.param(np.any, {'A': pd.Series([0, 1], dtype='m8[ns]')}, True,
                 marks=[td.skip_if_np_lt_115]),
    pytest.param(np.all, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True,
                 marks=[td.skip_if_np_lt_115]),
    pytest.param(np.any, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True,
                 marks=[td.skip_if_np_lt_115]),
    (np.all, {'A': pd.Series([0, 1], dtype='category')}, False),
    (np.any, {'A': pd.Series([0, 1], dtype='category')}, True),
    (np.all, {'A': pd.Series([1, 2], dtype='category')}, True),
    (np.any, {'A': pd.Series([1, 2], dtype='category')}, True),

    # # Mix
    # GH 21484
    # (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'),
    #           'B': pd.Series([10, 20], dtype='m8[ns]')}, True),
])
def test_any_all_np_func(self, func, data, expected):
    """np.any / np.all on a frame give one np.bool_ scalar (GH 19976)."""
    frame = DataFrame(data)

    # function version
    via_numpy = func(frame)
    assert isinstance(via_numpy, np.bool_)
    assert via_numpy.item() is expected

    # method version
    method = getattr(DataFrame(frame), func.__name__)
    via_method = method(axis=None)
    assert isinstance(via_method, np.bool_)
    assert via_method.item() is expected
- def test_any_all_object(self):
- # GH 19976
- result = np.all(DataFrame(columns=['a', 'b'])).item()
- assert result is True
- result = np.any(DataFrame(columns=['a', 'b'])).item()
- assert result is False
@pytest.mark.parametrize('method', ['any', 'all'])
def test_any_all_level_axis_none_raises(self, method):
    """Combining ``axis=None`` with ``level`` must raise ValueError."""
    two_level_index = MultiIndex.from_product([['A', 'B'], ['a', 'b']],
                                              names=['out', 'in'])
    df = DataFrame({"A": 1}, index=two_level_index)

    xpr = "Must specify 'axis' when aggregating by level."
    reduction = getattr(df, method)
    with pytest.raises(ValueError, match=xpr):
        reduction(axis=None, level='out')
- # ----------------------------------------------------------------------
- # Isin
- def test_isin(self):
- # GH 4211
- df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
- 'ids2': ['a', 'n', 'c', 'n']},
- index=['foo', 'bar', 'baz', 'qux'])
- other = ['a', 'b', 'c']
- result = df.isin(other)
- expected = DataFrame([df.loc[s].isin(other) for s in df.index])
- tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("empty", [[], Series(), np.array([])])
def test_isin_empty(self, empty):
    """isin against an empty container is False everywhere (GH 16991)."""
    frame = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
    all_false = DataFrame(False, frame.index, frame.columns)

    tm.assert_frame_equal(frame.isin(empty), all_false)
- def test_isin_dict(self):
- df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
- d = {'A': ['a']}
- expected = DataFrame(False, df.index, df.columns)
- expected.loc[0, 'A'] = True
- result = df.isin(d)
- tm.assert_frame_equal(result, expected)
- # non unique columns
- df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
- df.columns = ['A', 'A']
- expected = DataFrame(False, df.index, df.columns)
- expected.loc[0, 'A'] = True
- result = df.isin(d)
- tm.assert_frame_equal(result, expected)
def test_isin_with_string_scalar(self):
    """isin must reject a bare string argument with TypeError (GH 4763)."""
    frame = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
                       'ids2': ['a', 'n', 'c', 'n']},
                      index=['foo', 'bar', 'baz', 'qux'])

    # a one-character and a multi-character string
    for scalar in ('a', 'aaa'):
        with pytest.raises(TypeError):
            frame.isin(scalar)
- def test_isin_df(self):
- df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
- df2 = DataFrame({'A': [0, 2, 12, 4], 'B': [2, np.nan, 4, 5]})
- expected = DataFrame(False, df1.index, df1.columns)
- result = df1.isin(df2)
- expected['A'].loc[[1, 3]] = True
- expected['B'].loc[[0, 2]] = True
- tm.assert_frame_equal(result, expected)
- # partial overlapping columns
- df2.columns = ['A', 'C']
- result = df1.isin(df2)
- expected['B'] = False
- tm.assert_frame_equal(result, expected)
- def test_isin_tuples(self):
- # GH 16394
- df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
- df['C'] = list(zip(df['A'], df['B']))
- result = df['C'].isin([(1, 'a')])
- tm.assert_series_equal(result,
- Series([True, False, False], name="C"))
def test_isin_df_dupe_values(self):
    """isin raises ValueError when the other frame has duplicate labels."""
    df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
    rows = [[0, 2], [12, 4], [2, np.nan], [4, 5]]

    # just cols duped
    other = DataFrame(rows, columns=['B', 'B'])
    with pytest.raises(ValueError):
        df1.isin(other)

    # just index duped
    other = DataFrame(rows, columns=['A', 'B'], index=[0, 0, 1, 1])
    with pytest.raises(ValueError):
        df1.isin(other)

    # cols and index:
    other.columns = ['B', 'B']
    with pytest.raises(ValueError):
        df1.isin(other)
- def test_isin_dupe_self(self):
- other = DataFrame({'A': [1, 0, 1, 0], 'B': [1, 1, 0, 0]})
- df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=['A', 'A'])
- result = df.isin(other)
- expected = DataFrame(False, index=df.index, columns=df.columns)
- expected.loc[0] = True
- expected.iloc[1, 1] = True
- tm.assert_frame_equal(result, expected)
- def test_isin_against_series(self):
- df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]},
- index=['a', 'b', 'c', 'd'])
- s = pd.Series([1, 3, 11, 4], index=['a', 'b', 'c', 'd'])
- expected = DataFrame(False, index=df.index, columns=df.columns)
- expected['A'].loc['a'] = True
- expected.loc['d'] = True
- result = df.isin(s)
- tm.assert_frame_equal(result, expected)
- def test_isin_multiIndex(self):
- idx = MultiIndex.from_tuples([(0, 'a', 'foo'), (0, 'a', 'bar'),
- (0, 'b', 'bar'), (0, 'b', 'baz'),
- (2, 'a', 'foo'), (2, 'a', 'bar'),
- (2, 'c', 'bar'), (2, 'c', 'baz'),
- (1, 'b', 'foo'), (1, 'b', 'bar'),
- (1, 'c', 'bar'), (1, 'c', 'baz')])
- df1 = DataFrame({'A': np.ones(12),
- 'B': np.zeros(12)}, index=idx)
- df2 = DataFrame({'A': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
- 'B': [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1]})
- # against regular index
- expected = DataFrame(False, index=df1.index, columns=df1.columns)
- result = df1.isin(df2)
- tm.assert_frame_equal(result, expected)
- df2.index = idx
- expected = df2.values.astype(np.bool)
- expected[:, 1] = ~expected[:, 1]
- expected = DataFrame(expected, columns=['A', 'B'], index=idx)
- result = df1.isin(df2)
- tm.assert_frame_equal(result, expected)
- def test_isin_empty_datetimelike(self):
- # GH 15473
- df1_ts = DataFrame({'date':
- pd.to_datetime(['2014-01-01', '2014-01-02'])})
- df1_td = DataFrame({'date':
- [pd.Timedelta(1, 's'), pd.Timedelta(2, 's')]})
- df2 = DataFrame({'date': []})
- df3 = DataFrame()
- expected = DataFrame({'date': [False, False]})
- result = df1_ts.isin(df2)
- tm.assert_frame_equal(result, expected)
- result = df1_ts.isin(df3)
- tm.assert_frame_equal(result, expected)
- result = df1_td.isin(df2)
- tm.assert_frame_equal(result, expected)
- result = df1_td.isin(df3)
- tm.assert_frame_equal(result, expected)
- # Rounding
def test_round(self):
    """Check DataFrame.round with int, dict and Series ``decimals``.

    Also checks that invalid ``decimals`` (a list, or non-integer values
    in a dict or Series) raise TypeError, that negative decimals work,
    and that a named columns index survives rounding.
    """
    # GH 2665

    # Test that rounding an empty DataFrame does nothing
    df = DataFrame()
    tm.assert_frame_equal(df, df.round())

    # Here's the test frame we'll be working with
    df = DataFrame({'col1': [1.123, 2.123, 3.123],
                    'col2': [1.234, 2.234, 3.234]})

    # Default round to integer (i.e. decimals=0)
    expected_rounded = DataFrame(
        {'col1': [1., 2., 3.], 'col2': [1., 2., 3.]})
    tm.assert_frame_equal(df.round(), expected_rounded)

    # Round with an integer
    decimals = 2
    expected_rounded = DataFrame({'col1': [1.12, 2.12, 3.12],
                                  'col2': [1.23, 2.23, 3.23]})
    tm.assert_frame_equal(df.round(decimals), expected_rounded)

    # This should also work with np.round (since np.round dispatches to
    # df.round)
    tm.assert_frame_equal(np.round(df, decimals), expected_rounded)

    # Round with a list
    round_list = [1, 2]
    with pytest.raises(TypeError):
        df.round(round_list)

    # Round with a dictionary
    expected_rounded = DataFrame(
        {'col1': [1.1, 2.1, 3.1], 'col2': [1.23, 2.23, 3.23]})
    round_dict = {'col1': 1, 'col2': 2}
    tm.assert_frame_equal(df.round(round_dict), expected_rounded)

    # Incomplete dict
    expected_partially_rounded = DataFrame(
        {'col1': [1.123, 2.123, 3.123], 'col2': [1.2, 2.2, 3.2]})
    partial_round_dict = {'col2': 1}
    tm.assert_frame_equal(df.round(partial_round_dict),
                          expected_partially_rounded)

    # Dict with unknown elements
    wrong_round_dict = {'col3': 2, 'col2': 1}
    tm.assert_frame_equal(df.round(wrong_round_dict),
                          expected_partially_rounded)

    # float input to `decimals`
    non_int_round_dict = {'col1': 1, 'col2': 0.5}
    with pytest.raises(TypeError):
        df.round(non_int_round_dict)

    # String input, as a dict and as a Series
    non_int_round_dict = {'col1': 1, 'col2': 'foo'}
    with pytest.raises(TypeError):
        df.round(non_int_round_dict)

    non_int_round_Series = Series(non_int_round_dict)
    with pytest.raises(TypeError):
        df.round(non_int_round_Series)

    # List input, as a dict and as a (non integer) Series
    non_int_round_dict = {'col1': 1, 'col2': [1, 2]}
    with pytest.raises(TypeError):
        df.round(non_int_round_dict)

    non_int_round_Series = Series(non_int_round_dict)
    with pytest.raises(TypeError):
        df.round(non_int_round_Series)

    # Negative numbers
    negative_round_dict = {'col1': -1, 'col2': -2}
    big_df = df * 100
    expected_neg_rounded = DataFrame(
        {'col1': [110., 210, 310], 'col2': [100., 200, 300]})
    tm.assert_frame_equal(big_df.round(negative_round_dict),
                          expected_neg_rounded)

    # nan in Series round
    nan_round_Series = Series({'col1': np.nan, 'col2': 1})
    with pytest.raises(TypeError):
        df.round(nan_round_Series)

    # Make sure this doesn't break existing Series.round
    # (expected_rounded still holds the dictionary-rounded frame)
    tm.assert_series_equal(df['col1'].round(1), expected_rounded['col1'])

    # named columns
    # GH 11986
    decimals = 2
    expected_rounded = DataFrame(
        {'col1': [1.12, 2.12, 3.12], 'col2': [1.23, 2.23, 3.23]})
    df.columns.name = "cols"
    expected_rounded.columns.name = "cols"
    tm.assert_frame_equal(df.round(decimals), expected_rounded)

    # interaction of named columns & series
    tm.assert_series_equal(df['col1'].round(decimals),
                           expected_rounded['col1'])
    tm.assert_series_equal(df.round(decimals)['col1'],
                           expected_rounded['col1'])
def test_numpy_round(self):
    """np.round on a frame works but rejects ``out`` (GH 12600)."""
    frame = DataFrame([[1.53, 1.36], [0.06, 7.01]])

    rounded = np.round(frame, decimals=0)
    tm.assert_frame_equal(rounded, DataFrame([[2., 1.], [0., 7.]]))

    msg = "the 'out' parameter is not supported"
    with pytest.raises(ValueError, match=msg):
        np.round(frame, decimals=0, out=frame)
- def test_round_mixed_type(self):
- # GH 11885
- df = DataFrame({'col1': [1.1, 2.2, 3.3, 4.4],
- 'col2': ['1', 'a', 'c', 'f'],
- 'col3': date_range('20111111', periods=4)})
- round_0 = DataFrame({'col1': [1., 2., 3., 4.],
- 'col2': ['1', 'a', 'c', 'f'],
- 'col3': date_range('20111111', periods=4)})
- tm.assert_frame_equal(df.round(), round_0)
- tm.assert_frame_equal(df.round(1), df)
- tm.assert_frame_equal(df.round({'col1': 1}), df)
- tm.assert_frame_equal(df.round({'col1': 0}), round_0)
- tm.assert_frame_equal(df.round({'col1': 0, 'col2': 1}), round_0)
- tm.assert_frame_equal(df.round({'col3': 1}), df)
def test_round_issue(self):
    """round() with duplicate column labels (GH 11611)."""
    df = pd.DataFrame(np.random.random([3, 3]), columns=['A', 'B', 'C'],
                      index=['first', 'second', 'third'])

    # concatenating the frame with itself duplicates every column label
    doubled = pd.concat((df, df), axis=1)
    rounded = doubled.round()
    tm.assert_index_equal(rounded.index, doubled.index)

    # a decimals Series with a repeated label is rejected
    decimals = pd.Series([1, 0, 2], index=['A', 'B', 'A'])
    with pytest.raises(ValueError):
        df.round(decimals)
def test_built_in_round(self):
    """The builtin round() on a frame rounds to integers (GH 11763)."""
    if not compat.PY3:
        pytest.skip("build in round cannot be overridden "
                    "prior to Python 3")

    # Here's the test frame we'll be working with
    frame = DataFrame(
        {'col1': [1.123, 2.123, 3.123], 'col2': [1.234, 2.234, 3.234]})

    # Default round to integer (i.e. decimals=0)
    whole_numbers = DataFrame(
        {'col1': [1., 2., 3.], 'col2': [1., 2., 3.]})
    tm.assert_frame_equal(round(frame), whole_numbers)
- def test_round_nonunique_categorical(self):
- # See GH21809
- idx = pd.CategoricalIndex(['low'] * 3 + ['hi'] * 3)
- df = pd.DataFrame(np.random.rand(6, 3), columns=list('abc'))
- expected = df.round(3)
- expected.index = idx
- df_categorical = df.copy().set_index(idx)
- assert df_categorical.shape == (6, 3)
- result = df_categorical.round(3)
- assert result.shape == (6, 3)
- tm.assert_frame_equal(result, expected)
- def test_pct_change(self):
- # GH 11150
- pnl = DataFrame([np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(
- 0, 40, 10)]).astype(np.float64)
- pnl.iat[1, 0] = np.nan
- pnl.iat[1, 1] = np.nan
- pnl.iat[2, 3] = 60
- for axis in range(2):
- expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(
- axis=axis) - 1
- result = pnl.pct_change(axis=axis, fill_method='pad')
- tm.assert_frame_equal(result, expected)
- # Clip
def test_clip(self, float_frame):
    """Check clip_upper / clip_lower / clip and that none is in-place."""
    median = float_frame.median().median()
    untouched = float_frame.copy()

    with tm.assert_produces_warning(FutureWarning):
        capped = float_frame.clip_upper(median)
    assert not (capped.values > median).any()

    with tm.assert_produces_warning(FutureWarning):
        floored = float_frame.clip_lower(median)
    assert not (floored.values < median).any()

    # equal bounds force every value onto the median
    pinned = float_frame.clip(upper=median, lower=median)
    assert not (pinned.values != median).any()

    # Verify that float_frame was not changed inplace
    assert (float_frame.values == untouched.values).all()
def test_inplace_clip(self, float_frame):
    """Check the in-place variants of the clip methods (GH 15388)."""
    median = float_frame.median().median()

    # each variant works on its own fresh copy of the fixture
    work = float_frame.copy()
    with tm.assert_produces_warning(FutureWarning):
        work.clip_upper(median, inplace=True)
    assert not (work.values > median).any()

    work = float_frame.copy()
    with tm.assert_produces_warning(FutureWarning):
        work.clip_lower(median, inplace=True)
    assert not (work.values < median).any()

    work = float_frame.copy()
    work.clip(upper=median, lower=median, inplace=True)
    assert not (work.values != median).any()
- def test_dataframe_clip(self):
- # GH 2747
- df = DataFrame(np.random.randn(1000, 2))
- for lb, ub in [(-1, 1), (1, -1)]:
- clipped_df = df.clip(lb, ub)
- lb, ub = min(lb, ub), max(ub, lb)
- lb_mask = df.values <= lb
- ub_mask = df.values >= ub
- mask = ~lb_mask & ~ub_mask
- assert (clipped_df.values[lb_mask] == lb).all()
- assert (clipped_df.values[ub_mask] == ub).all()
- assert (clipped_df.values[mask] == df.values[mask]).all()
- def test_clip_mixed_numeric(self):
- # TODO(jreback)
- # clip on mixed integer or floats
- # with integer clippers coerces to float
- df = DataFrame({'A': [1, 2, 3],
- 'B': [1., np.nan, 3.]})
- result = df.clip(1, 2)
- expected = DataFrame({'A': [1, 2, 2],
- 'B': [1., np.nan, 2.]})
- tm.assert_frame_equal(result, expected, check_like=True)
- # GH 24162, clipping now preserves numeric types per column
- df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]],
- columns=['foo', 'bar', 'baz'])
- expected = df.dtypes
- result = df.clip(upper=3).dtypes
- tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("inplace", [True, False])
def test_clip_against_series(self, inplace):
    """clip with Series bounds aligned along axis 0 (GH 6966).

    Parameters
    ----------
    inplace : bool
        Whether clip modifies ``df`` itself or returns a new frame.
    """
    df = DataFrame(np.random.randn(1000, 2))
    lb = Series(np.random.randn(1000))
    ub = lb + 1

    # untouched reference: ``df`` itself is modified when inplace=True
    original = df.copy()
    clipped_df = df.clip(lb, ub, axis=0, inplace=inplace)

    if inplace:
        clipped_df = df

    for i in range(2):
        lb_mask = original.iloc[:, i] <= lb
        ub_mask = original.iloc[:, i] >= ub
        mask = ~lb_mask & ~ub_mask

        result = clipped_df.loc[lb_mask, i]
        tm.assert_series_equal(result, lb[lb_mask], check_names=False)
        assert result.name == i

        result = clipped_df.loc[ub_mask, i]
        tm.assert_series_equal(result, ub[ub_mask], check_names=False)
        assert result.name == i

        # values inside the bounds must equal the pre-clip data; comparing
        # with ``df`` would check the frame against itself when inplace
        tm.assert_series_equal(clipped_df.loc[mask, i],
                               original.loc[mask, i])
@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])])
@pytest.mark.parametrize("axis,res", [
    (0, [[2., 2., 3.], [4., 5., 6.], [7., 7., 7.]]),
    (1, [[2., 3., 4.], [4., 5., 6.], [5., 6., 7.]])
])
def test_clip_against_list_like(self, simple_frame,
                                inplace, lower, axis, res):
    """clip with list-like bounds along either axis (GH 15390)."""
    working = simple_frame.copy(deep=True)

    clipped = working.clip(lower=lower, upper=[5, 6, 7],
                           axis=axis, inplace=inplace)

    expected = pd.DataFrame(res,
                            columns=working.columns,
                            index=working.index)
    # when clipping in place, check the frame that was modified
    outcome = working if inplace else clipped
    tm.assert_frame_equal(outcome, expected, check_exact=True)
@pytest.mark.parametrize("axis", [0, 1, None])
def test_clip_against_frame(self, axis):
    """clip with DataFrame bounds, for every accepted ``axis`` value."""
    data = DataFrame(np.random.randn(1000, 2))
    lower = DataFrame(np.random.randn(1000, 2))
    upper = lower + 1

    clipped = data.clip(lower, upper, axis=axis)

    below = data <= lower
    above = data >= upper
    inside = ~below & ~above

    tm.assert_frame_equal(clipped[below], lower[below])
    tm.assert_frame_equal(clipped[above], upper[above])
    tm.assert_frame_equal(clipped[inside], data[inside])
def test_clip_against_unordered_columns(self):
    """
    Clip against bound frames whose columns are in a different order.

    Every clip against a shuffled-column bound must equal the clip
    against the same bound reordered to the clipped frame's columns.
    See GH 20911.
    """
    n_rows = 1000
    data = DataFrame(np.random.randn(n_rows, 4), columns=list('ABCD'))
    upper = DataFrame(np.random.randn(n_rows, 4), columns=list('DABC'))
    lower = DataFrame(upper.values - 1, columns=list('BDCA'))
    ordered = data.columns

    # (bounds as given, same bounds with columns put in ``data`` order)
    scenarios = [
        (dict(lower=0, upper=upper),
         dict(lower=0, upper=upper[ordered])),
        (dict(lower=lower, upper=3),
         dict(lower=lower[ordered], upper=3)),
        (dict(lower=lower, upper=upper),
         dict(lower=lower[ordered], upper=upper[ordered])),
    ]

    outcomes = [(data.clip(**given), data.clip(**aligned))
                for given, aligned in scenarios]
    for result, expected in outcomes:
        tm.assert_frame_equal(result, expected)
def test_clip_with_na_args(self, float_frame):
    """
    Clip with NaN thresholds.

    Scalar NaN bounds must leave the frame unchanged (GH 17276); a NaN
    inside a list-like lower bound is expected to produce NaN in the
    matching row (axis=0) or column (axis=1) of the result (GH 19992).
    """
    nan = np.nan

    # GH 17276
    scalar_calls = [
        ((nan,), {}),
        ((), {'upper': nan, 'lower': nan}),
    ]
    for args, kwargs in scalar_calls:
        tm.assert_frame_equal(float_frame.clip(*args, **kwargs),
                              float_frame)

    # GH 19992
    frame = DataFrame({'col_0': [1, 2, 3], 'col_1': [4, 5, 6],
                       'col_2': [7, 8, 9]})
    wanted_by_axis = [
        (0, {'col_0': [4, 5, nan], 'col_1': [4, 5, nan],
             'col_2': [7, 8, nan]}),
        (1, {'col_0': [4, 4, 4], 'col_1': [5, 5, 6],
             'col_2': [nan] * 3}),
    ]
    for axis, wanted in wanted_by_axis:
        tm.assert_frame_equal(frame.clip(lower=[4, 5, nan], axis=axis),
                              DataFrame(wanted))
- # Matrix-like
def test_dot(self):
    """
    Check ``DataFrame.dot`` against ``np.dot`` for several operands.

    Covers frame and Series operands (in matching order and with a
    reversed index), plain ndarrays, and the ``ValueError`` raised for
    a wrong-length array and for operands whose labels do not line up.
    """
    a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'],
                  columns=['p', 'q', 'r', 's'])
    b = DataFrame(np.random.randn(4, 2), index=['p', 'q', 'r', 's'],
                  columns=['one', 'two'])

    result = a.dot(b)
    expected = DataFrame(np.dot(a.values, b.values),
                         index=['a', 'b', 'c'],
                         columns=['one', 'two'])
    tm.assert_frame_equal(result, expected)

    # Check alignment: the operand with the reversed index must give
    # the same product as the one in matching order.
    b1 = b.reindex(index=reversed(b.index))
    result = a.dot(b1)
    tm.assert_frame_equal(result, expected)

    # Check series argument
    result = a.dot(b['one'])
    tm.assert_series_equal(result, expected['one'], check_names=False)
    assert result.name is None

    result = a.dot(b1['one'])
    tm.assert_series_equal(result, expected['one'], check_names=False)
    assert result.name is None

    # can pass correct-length arrays
    row = a.iloc[0].values

    result = a.dot(row)
    expected = a.dot(a.iloc[0])
    tm.assert_series_equal(result, expected)

    with pytest.raises(ValueError, match='Dot product shape mismatch'):
        a.dot(row[:-1])

    # 2-D ndarray operand
    left = np.random.rand(1, 5)
    right = np.random.rand(5, 1)
    result = DataFrame(left).dot(right)
    tm.assert_frame_equal(result, DataFrame(np.dot(left, right)))

    # unaligned
    df = DataFrame(np.random.randn(3, 4),
                   index=[1, 2, 3], columns=lrange(4))
    df2 = DataFrame(np.random.randn(5, 3),
                    index=lrange(5), columns=[1, 2, 3])

    with pytest.raises(ValueError, match='aligned'):
        df.dot(df2)
@pytest.mark.skipif(not PY35,
                    reason='matmul supported for Python>=3.5')
def test_matmul(self):
    """
    Matrix multiplication through ``operator.matmul`` (GH 10259).

    Exercises frame @ frame, frame @ Series, ndarray @ frame,
    nested list @ frame, mixed / integer dtypes and the error raised
    for operands whose labels do not line up.
    """
    rows = ['a', 'b', 'c']
    inner = ['p', 'q', 'r', 's']
    outer = ['one', 'two']

    def labelled_product(lhs, rhs):
        # Reference result: product of the raw values, carrying the
        # row / column labels expected on the output.
        return DataFrame(np.dot(lhs.values, rhs.values),
                         index=rows, columns=outer)

    left = DataFrame(np.random.randn(3, 4), index=rows, columns=inner)
    right = DataFrame(np.random.randn(4, 2), index=inner, columns=outer)

    # DataFrame @ DataFrame
    tm.assert_frame_equal(operator.matmul(left, right),
                          labelled_product(left, right))

    # DataFrame @ Series
    tm.assert_series_equal(
        operator.matmul(left, right['one']),
        Series(np.dot(left.values, right['one'].values), index=rows))

    # np.array @ DataFrame
    from_array = operator.matmul(left.values, right)
    assert isinstance(from_array, DataFrame)
    assert from_array.columns.equals(right.columns)
    assert from_array.index.equals(pd.Index(range(3)))
    tm.assert_almost_equal(from_array.values,
                           np.dot(left.values, right.values))

    # nested list @ DataFrame (__rmatmul__)
    from_lists = operator.matmul(left.values.tolist(), right)
    tm.assert_almost_equal(from_lists.values,
                           labelled_product(left, right).values)

    # mixed dtype DataFrame @ DataFrame
    left['q'] = left['q'].round().astype(int)
    tm.assert_frame_equal(operator.matmul(left, right),
                          labelled_product(left, right))

    # different dtypes DataFrame @ DataFrame
    as_int = left.astype(int)
    tm.assert_frame_equal(operator.matmul(as_int, right),
                          labelled_product(as_int, right))

    # unaligned
    short = DataFrame(np.random.randn(3, 4),
                      index=[1, 2, 3], columns=lrange(4))
    tall = DataFrame(np.random.randn(5, 3),
                     index=lrange(5), columns=[1, 2, 3])
    with pytest.raises(ValueError, match='aligned'):
        operator.matmul(short, tall)
@pytest.fixture
def df_duplicates():
    """Five-row integer frame whose index labels repeat (0, 0, 1, 1, 1)."""
    data = {'a': [1, 2, 3, 4, 4],
            'b': [1] * 5,
            'c': [0, 1, 2, 5, 4]}
    labels = [0, 0, 1, 1, 1]
    return pd.DataFrame(data, index=labels)
@pytest.fixture
def df_strings():
    """
    Ten-row frame: shuffled ints ('a'), letters ('b'), shuffled floats ('c').
    """
    # The two permutations are drawn in the same order as the columns
    # are listed: 'a' first, then 'c'.
    shuffled_ints = np.random.permutation(10)
    letters = list(ascii_lowercase[:10])
    shuffled_floats = np.random.permutation(10).astype('float64')
    return pd.DataFrame({'a': shuffled_ints,
                         'b': letters,
                         'c': shuffled_floats})
@pytest.fixture
def df_main_dtypes():
    """Three-row frame with one column per commonly used dtype."""
    column_order = ['group', 'int', 'float', 'string',
                    'category_string', 'category_int',
                    'datetime', 'datetimetz',
                    'timedelta']

    naive_dates = pd.date_range('20130101', periods=3)
    aware_dates = pd.date_range('20130101', periods=3, tz='US/Eastern')
    deltas = pd.timedelta_range('1 s', periods=3, freq='s')

    data = {
        'group': [1, 1, 2],
        'int': [1, 2, 3],
        'float': [4., 5., 6.],
        'string': list('abc'),
        'category_string': pd.Series(list('abc')).astype('category'),
        'category_int': [7, 8, 9],
        'datetime': naive_dates,
        'datetimetz': aware_dates,
        'timedelta': deltas,
    }
    return pd.DataFrame(data, columns=column_order)
class TestNLargestNSmallest(object):
    """
    Tests for ``DataFrame.nlargest`` and ``DataFrame.nsmallest``.

    The class also holds a comparison-broadcasting smoke test and a
    check that ``rank`` leaves its input untouched.
    """

    # Error text expected when a column's dtype cannot be used for the
    # selection; filled in with ``str.format`` and matched as a regex.
    dtype_error_msg_template = ("Column {column!r} has dtype {dtype}, cannot "
                                "use method {method!r} with this dtype")

    # ----------------------------------------------------------------------
    # Top / bottom
    @pytest.mark.parametrize('order', [
        ['a'],
        ['c'],
        ['a', 'b'],
        ['a', 'c'],
        ['b', 'a'],
        ['b', 'c'],
        ['a', 'b', 'c'],
        ['c', 'a', 'b'],
        ['c', 'b', 'a'],
        ['b', 'c', 'a'],
        ['b', 'a', 'c'],

        # dups!
        ['b', 'c', 'c']])
    @pytest.mark.parametrize('n', range(1, 11))
    def test_n(self, df_strings, nselect_method, n, order):
        """
        Compare the selection with ``sort_values(...).head(n)``.

        Any ``order`` that includes the object column 'b' must raise
        ``TypeError`` instead.  See GH 10393.
        """
        df = df_strings
        if 'b' in order:
            error_msg = self.dtype_error_msg_template.format(
                column='b', method=nselect_method, dtype='object')
            with pytest.raises(TypeError, match=error_msg):
                getattr(df, nselect_method)(n, order)
        else:
            ascending = nselect_method == 'nsmallest'
            result = getattr(df, nselect_method)(n, order)
            expected = df.sort_values(order, ascending=ascending).head(n)
            tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize('columns', [
        ['group', 'category_string'], ['group', 'string']])
    def test_n_error(self, df_main_dtypes, nselect_method, columns):
        """
        The second column of ``columns`` must be rejected with TypeError.

        The expected message names that column and its dtype.
        """
        # Function-scope import: ``re`` is only needed here.
        import re

        df = df_main_dtypes
        col = columns[1]
        error_msg = self.dtype_error_msg_template.format(
            column=col, method=nselect_method, dtype=df[col].dtype)
        # The message is used as a regex, so escape every metacharacter
        # that may appear in the dtype repr.
        with pytest.raises(TypeError, match=re.escape(error_msg)):
            getattr(df, nselect_method)(2, columns)

    def test_n_all_dtypes(self, df_main_dtypes):
        """Smoke test: select on every column except the two string ones."""
        df = df_main_dtypes
        excluded = {'category_string', 'string'}
        # Keep the frame's own column order so every run exercises the
        # same call (a set difference would vary between runs).
        columns = [col for col in df.columns if col not in excluded]
        df.nsmallest(2, columns)
        df.nlargest(2, columns)

    @pytest.mark.parametrize('method,expected', [
        ('nlargest',
         pd.DataFrame({'a': [2, 2, 2, 1], 'b': [3, 2, 1, 3]},
                      index=[2, 1, 0, 3])),
        ('nsmallest',
         pd.DataFrame({'a': [1, 1, 1, 2], 'b': [1, 2, 3, 1]},
                      index=[5, 4, 3, 0]))])
    def test_duplicates_on_starter_columns(self, method, expected):
        """Ties in the first column are ordered by the second one."""
        # regression test for #22752

        df = pd.DataFrame({
            'a': [2, 2, 2, 1, 1, 1],
            'b': [1, 2, 3, 3, 2, 1]
        })

        result = getattr(df, method)(4, columns=['a', 'b'])
        tm.assert_frame_equal(result, expected)

    def test_n_identical_values(self):
        """With a constant sort column the first ``n`` rows are returned."""
        # GH 15297
        df = pd.DataFrame({'a': [1] * 5, 'b': [1, 2, 3, 4, 5]})

        result = df.nlargest(3, 'a')
        expected = pd.DataFrame(
            {'a': [1] * 3, 'b': [1, 2, 3]}, index=[0, 1, 2]
        )
        tm.assert_frame_equal(result, expected)

        result = df.nsmallest(3, 'a')
        expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize('order', [
        ['a', 'b', 'c'],
        ['c', 'b', 'a'],
        ['a'],
        ['b'],
        ['a', 'b'],
        ['c', 'b']])
    @pytest.mark.parametrize('n', range(1, 6))
    def test_n_duplicate_index(self, df_duplicates, n, order):
        """Selection on a frame with repeated index labels; GH 13412."""
        df = df_duplicates

        result = df.nsmallest(n, order)
        expected = df.sort_values(order).head(n)
        tm.assert_frame_equal(result, expected)

        result = df.nlargest(n, order)
        expected = df.sort_values(order, ascending=False).head(n)
        tm.assert_frame_equal(result, expected)

    def test_duplicate_keep_all_ties(self):
        """``keep='all'`` returns every row tied with the last pick."""
        # GH 16818
        df = pd.DataFrame({'a': [5, 4, 4, 2, 3, 3, 3, 3],
                           'b': [10, 9, 8, 7, 5, 50, 10, 20]})
        result = df.nlargest(4, 'a', keep='all')
        expected = pd.DataFrame({'a': {0: 5, 1: 4, 2: 4, 4: 3,
                                       5: 3, 6: 3, 7: 3},
                                 'b': {0: 10, 1: 9, 2: 8, 4: 5,
                                       5: 50, 6: 10, 7: 20}})
        tm.assert_frame_equal(result, expected)

        result = df.nsmallest(2, 'a', keep='all')
        expected = pd.DataFrame({'a': {3: 2, 4: 3, 5: 3, 6: 3, 7: 3},
                                 'b': {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}})
        tm.assert_frame_equal(result, expected)

    def test_series_broadcasting(self):
        """Broadcasting a Series with NaN must not emit warnings."""
        # smoke test for numpy warnings
        # GH 16378, GH 16306
        df = DataFrame([1.0, 1.0, 1.0])
        df_nan = DataFrame({'A': [np.nan, 2.0, np.nan]})
        s = Series([1, 1, 1])
        s_nan = Series([np.nan, np.nan, 1])

        # The outer context asserts that nothing warns; the inner one
        # consumes the FutureWarning expected from ``clip_lower``.
        with tm.assert_produces_warning(None):
            with tm.assert_produces_warning(FutureWarning):
                df_nan.clip_lower(s, axis=0)
            for op in ['lt', 'le', 'gt', 'ge', 'eq', 'ne']:
                getattr(df, op)(s_nan, axis=0)

    def test_series_nat_conversion(self):
        """Despite its name, checks that ``rank`` leaves the frame as is."""
        # GH 18521
        # Check rank does not mutate DataFrame
        df = DataFrame(np.random.randn(10, 3), dtype='float64')
        expected = df.copy()
        df.rank()
        result = df
        tm.assert_frame_equal(result, expected)

    def test_multiindex_column_lookup(self):
        """A tuple names a single MultiIndex column, not two columns."""
        # Check whether tuples are correctly treated as multi-level lookups.
        # GH 23033
        df = pd.DataFrame(
            columns=pd.MultiIndex.from_product([['x'], ['a', 'b']]),
            data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]])

        # nsmallest
        result = df.nsmallest(3, ('x', 'a'))
        expected = df.iloc[[2, 0, 3]]
        tm.assert_frame_equal(result, expected)

        # nlargest
        result = df.nlargest(3, ('x', 'b'))
        expected = df.iloc[[3, 2, 1]]
        tm.assert_frame_equal(result, expected)
|