test_stat_reductions.py 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tests for statistical reductions of 2nd moment or higher: var, skew, kurt, ...
  4. """
  5. import numpy as np
  6. import pytest
  7. from pandas.compat import lrange
  8. import pandas.util._test_decorators as td
  9. import pandas as pd
  10. from pandas import DataFrame, Series, compat
  11. import pandas.util.testing as tm
  12. class TestSeriesStatReductions(object):
  13. # Note: the name TestSeriesStatReductions indicates these tests
  14. # were moved from a series-specific test file, _not_ that these tests are
  15. # intended long-term to be series-specific
  16. def _check_stat_op(self, name, alternate, string_series_,
  17. check_objects=False, check_allna=False):
  18. with pd.option_context('use_bottleneck', False):
  19. f = getattr(Series, name)
  20. # add some NaNs
  21. string_series_[5:15] = np.NaN
  22. # mean, idxmax, idxmin, min, and max are valid for dates
  23. if name not in ['max', 'min', 'mean']:
  24. ds = Series(pd.date_range('1/1/2001', periods=10))
  25. with pytest.raises(TypeError):
  26. f(ds)
  27. # skipna or no
  28. assert pd.notna(f(string_series_))
  29. assert pd.isna(f(string_series_, skipna=False))
  30. # check the result is correct
  31. nona = string_series_.dropna()
  32. tm.assert_almost_equal(f(nona), alternate(nona.values))
  33. tm.assert_almost_equal(f(string_series_), alternate(nona.values))
  34. allna = string_series_ * np.nan
  35. if check_allna:
  36. assert np.isnan(f(allna))
  37. # dtype=object with None, it works!
  38. s = Series([1, 2, 3, None, 5])
  39. f(s)
  40. # GH#2888
  41. items = [0]
  42. items.extend(lrange(2 ** 40, 2 ** 40 + 1000))
  43. s = Series(items, dtype='int64')
  44. tm.assert_almost_equal(float(f(s)), float(alternate(s.values)))
  45. # check date range
  46. if check_objects:
  47. s = Series(pd.bdate_range('1/1/2000', periods=10))
  48. res = f(s)
  49. exp = alternate(s)
  50. assert res == exp
  51. # check on string data
  52. if name not in ['sum', 'min', 'max']:
  53. with pytest.raises(TypeError):
  54. f(Series(list('abc')))
  55. # Invalid axis.
  56. with pytest.raises(ValueError):
  57. f(string_series_, axis=1)
  58. # Unimplemented numeric_only parameter.
  59. if 'numeric_only' in compat.signature(f).args:
  60. with pytest.raises(NotImplementedError, match=name):
  61. f(string_series_, numeric_only=True)
  62. def test_sum(self):
  63. string_series = tm.makeStringSeries().rename('series')
  64. self._check_stat_op('sum', np.sum, string_series, check_allna=False)
  65. def test_mean(self):
  66. string_series = tm.makeStringSeries().rename('series')
  67. self._check_stat_op('mean', np.mean, string_series)
  68. def test_median(self):
  69. string_series = tm.makeStringSeries().rename('series')
  70. self._check_stat_op('median', np.median, string_series)
  71. # test with integers, test failure
  72. int_ts = Series(np.ones(10, dtype=int), index=lrange(10))
  73. tm.assert_almost_equal(np.median(int_ts), int_ts.median())
  74. def test_prod(self):
  75. string_series = tm.makeStringSeries().rename('series')
  76. self._check_stat_op('prod', np.prod, string_series)
  77. def test_min(self):
  78. string_series = tm.makeStringSeries().rename('series')
  79. self._check_stat_op('min', np.min, string_series, check_objects=True)
  80. def test_max(self):
  81. string_series = tm.makeStringSeries().rename('series')
  82. self._check_stat_op('max', np.max, string_series, check_objects=True)
  83. def test_var_std(self):
  84. string_series = tm.makeStringSeries().rename('series')
  85. datetime_series = tm.makeTimeSeries().rename('ts')
  86. alt = lambda x: np.std(x, ddof=1)
  87. self._check_stat_op('std', alt, string_series)
  88. alt = lambda x: np.var(x, ddof=1)
  89. self._check_stat_op('var', alt, string_series)
  90. result = datetime_series.std(ddof=4)
  91. expected = np.std(datetime_series.values, ddof=4)
  92. tm.assert_almost_equal(result, expected)
  93. result = datetime_series.var(ddof=4)
  94. expected = np.var(datetime_series.values, ddof=4)
  95. tm.assert_almost_equal(result, expected)
  96. # 1 - element series with ddof=1
  97. s = datetime_series.iloc[[0]]
  98. result = s.var(ddof=1)
  99. assert pd.isna(result)
  100. result = s.std(ddof=1)
  101. assert pd.isna(result)
  102. def test_sem(self):
  103. string_series = tm.makeStringSeries().rename('series')
  104. datetime_series = tm.makeTimeSeries().rename('ts')
  105. alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
  106. self._check_stat_op('sem', alt, string_series)
  107. result = datetime_series.sem(ddof=4)
  108. expected = np.std(datetime_series.values,
  109. ddof=4) / np.sqrt(len(datetime_series.values))
  110. tm.assert_almost_equal(result, expected)
  111. # 1 - element series with ddof=1
  112. s = datetime_series.iloc[[0]]
  113. result = s.sem(ddof=1)
  114. assert pd.isna(result)
  115. @td.skip_if_no_scipy
  116. def test_skew(self):
  117. from scipy.stats import skew
  118. string_series = tm.makeStringSeries().rename('series')
  119. alt = lambda x: skew(x, bias=False)
  120. self._check_stat_op('skew', alt, string_series)
  121. # test corner cases, skew() returns NaN unless there's at least 3
  122. # values
  123. min_N = 3
  124. for i in range(1, min_N + 1):
  125. s = Series(np.ones(i))
  126. df = DataFrame(np.ones((i, i)))
  127. if i < min_N:
  128. assert np.isnan(s.skew())
  129. assert np.isnan(df.skew()).all()
  130. else:
  131. assert 0 == s.skew()
  132. assert (df.skew() == 0).all()
  133. @td.skip_if_no_scipy
  134. def test_kurt(self):
  135. from scipy.stats import kurtosis
  136. string_series = tm.makeStringSeries().rename('series')
  137. alt = lambda x: kurtosis(x, bias=False)
  138. self._check_stat_op('kurt', alt, string_series)
  139. index = pd.MultiIndex(
  140. levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
  141. codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]]
  142. )
  143. s = Series(np.random.randn(6), index=index)
  144. tm.assert_almost_equal(s.kurt(), s.kurt(level=0)['bar'])
  145. # test corner cases, kurt() returns NaN unless there's at least 4
  146. # values
  147. min_N = 4
  148. for i in range(1, min_N + 1):
  149. s = Series(np.ones(i))
  150. df = DataFrame(np.ones((i, i)))
  151. if i < min_N:
  152. assert np.isnan(s.kurt())
  153. assert np.isnan(df.kurt()).all()
  154. else:
  155. assert 0 == s.kurt()
  156. assert (df.kurt() == 0).all()