test_qcut.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
  1. import os
  2. import numpy as np
  3. import pytest
  4. from pandas.compat import zip
  5. from pandas import (
  6. Categorical, DatetimeIndex, Interval, IntervalIndex, NaT, Series,
  7. TimedeltaIndex, Timestamp, cut, date_range, isna, qcut, timedelta_range)
  8. from pandas.api.types import CategoricalDtype as CDT
  9. from pandas.core.algorithms import quantile
  10. import pandas.util.testing as tm
  11. from pandas.tseries.offsets import Day, Nano
  12. def test_qcut():
  13. arr = np.random.randn(1000)
  14. # We store the bins as Index that have been
  15. # rounded to comparisons are a bit tricky.
  16. labels, bins = qcut(arr, 4, retbins=True)
  17. ex_bins = quantile(arr, [0, .25, .5, .75, 1.])
  18. result = labels.categories.left.values
  19. assert np.allclose(result, ex_bins[:-1], atol=1e-2)
  20. result = labels.categories.right.values
  21. assert np.allclose(result, ex_bins[1:], atol=1e-2)
  22. ex_levels = cut(arr, ex_bins, include_lowest=True)
  23. tm.assert_categorical_equal(labels, ex_levels)
  24. def test_qcut_bounds():
  25. arr = np.random.randn(1000)
  26. factor = qcut(arr, 10, labels=False)
  27. assert len(np.unique(factor)) == 10
  28. def test_qcut_specify_quantiles():
  29. arr = np.random.randn(100)
  30. factor = qcut(arr, [0, .25, .5, .75, 1.])
  31. expected = qcut(arr, 4)
  32. tm.assert_categorical_equal(factor, expected)
  33. def test_qcut_all_bins_same():
  34. with pytest.raises(ValueError, match="edges.*unique"):
  35. qcut([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3)
  36. def test_qcut_include_lowest():
  37. values = np.arange(10)
  38. ii = qcut(values, 4)
  39. ex_levels = IntervalIndex([Interval(-0.001, 2.25), Interval(2.25, 4.5),
  40. Interval(4.5, 6.75), Interval(6.75, 9)])
  41. tm.assert_index_equal(ii.categories, ex_levels)
  42. def test_qcut_nas():
  43. arr = np.random.randn(100)
  44. arr[:20] = np.nan
  45. result = qcut(arr, 4)
  46. assert isna(result[:20]).all()
  47. def test_qcut_index():
  48. result = qcut([0, 2], 2)
  49. intervals = [Interval(-0.001, 1), Interval(1, 2)]
  50. expected = Categorical(intervals, ordered=True)
  51. tm.assert_categorical_equal(result, expected)
  52. def test_qcut_binning_issues(datapath):
  53. # see gh-1978, gh-1979
  54. cut_file = datapath(os.path.join("reshape", "data", "cut_data.csv"))
  55. arr = np.loadtxt(cut_file)
  56. result = qcut(arr, 20)
  57. starts = []
  58. ends = []
  59. for lev in np.unique(result):
  60. s = lev.left
  61. e = lev.right
  62. assert s != e
  63. starts.append(float(s))
  64. ends.append(float(e))
  65. for (sp, sn), (ep, en) in zip(zip(starts[:-1], starts[1:]),
  66. zip(ends[:-1], ends[1:])):
  67. assert sp < sn
  68. assert ep < en
  69. assert ep <= sn
  70. def test_qcut_return_intervals():
  71. ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
  72. res = qcut(ser, [0, 0.333, 0.666, 1])
  73. exp_levels = np.array([Interval(-0.001, 2.664),
  74. Interval(2.664, 5.328), Interval(5.328, 8)])
  75. exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(
  76. CDT(ordered=True))
  77. tm.assert_series_equal(res, exp)
  78. @pytest.mark.parametrize("kwargs,msg", [
  79. (dict(duplicates="drop"), None),
  80. (dict(), "Bin edges must be unique"),
  81. (dict(duplicates="raise"), "Bin edges must be unique"),
  82. (dict(duplicates="foo"), "invalid value for 'duplicates' parameter")
  83. ])
  84. def test_qcut_duplicates_bin(kwargs, msg):
  85. # see gh-7751
  86. values = [0, 0, 0, 0, 1, 2, 3]
  87. if msg is not None:
  88. with pytest.raises(ValueError, match=msg):
  89. qcut(values, 3, **kwargs)
  90. else:
  91. result = qcut(values, 3, **kwargs)
  92. expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
  93. tm.assert_index_equal(result.categories, expected)
  94. @pytest.mark.parametrize("data,start,end", [
  95. (9.0, 8.999, 9.0),
  96. (0.0, -0.001, 0.0),
  97. (-9.0, -9.001, -9.0),
  98. ])
  99. @pytest.mark.parametrize("length", [1, 2])
  100. @pytest.mark.parametrize("labels", [None, False])
  101. def test_single_quantile(data, start, end, length, labels):
  102. # see gh-15431
  103. ser = Series([data] * length)
  104. result = qcut(ser, 1, labels=labels)
  105. if labels is None:
  106. intervals = IntervalIndex([Interval(start, end)] *
  107. length, closed="right")
  108. expected = Series(intervals).astype(CDT(ordered=True))
  109. else:
  110. expected = Series([0] * length)
  111. tm.assert_series_equal(result, expected)
  112. @pytest.mark.parametrize("ser", [
  113. Series(DatetimeIndex(["20180101", NaT, "20180103"])),
  114. Series(TimedeltaIndex(["0 days", NaT, "2 days"]))],
  115. ids=lambda x: str(x.dtype))
  116. def test_qcut_nat(ser):
  117. # see gh-19768
  118. intervals = IntervalIndex.from_tuples([
  119. (ser[0] - Nano(), ser[2] - Day()),
  120. np.nan, (ser[2] - Day(), ser[2])])
  121. expected = Series(Categorical(intervals, ordered=True))
  122. result = qcut(ser, 2)
  123. tm.assert_series_equal(result, expected)
  124. @pytest.mark.parametrize("bins", [3, np.linspace(0, 1, 4)])
  125. def test_datetime_tz_qcut(bins):
  126. # see gh-19872
  127. tz = "US/Eastern"
  128. ser = Series(date_range("20130101", periods=3, tz=tz))
  129. result = qcut(ser, bins)
  130. expected = Series(IntervalIndex([
  131. Interval(Timestamp("2012-12-31 23:59:59.999999999", tz=tz),
  132. Timestamp("2013-01-01 16:00:00", tz=tz)),
  133. Interval(Timestamp("2013-01-01 16:00:00", tz=tz),
  134. Timestamp("2013-01-02 08:00:00", tz=tz)),
  135. Interval(Timestamp("2013-01-02 08:00:00", tz=tz),
  136. Timestamp("2013-01-03 00:00:00", tz=tz))])).astype(
  137. CDT(ordered=True))
  138. tm.assert_series_equal(result, expected)
  139. @pytest.mark.parametrize("arg,expected_bins", [
  140. [timedelta_range("1day", periods=3),
  141. TimedeltaIndex(["1 days", "2 days", "3 days"])],
  142. [date_range("20180101", periods=3),
  143. DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"])]])
  144. def test_date_like_qcut_bins(arg, expected_bins):
  145. # see gh-19891
  146. ser = Series(arg)
  147. result, result_bins = qcut(ser, 2, retbins=True)
  148. tm.assert_index_equal(result_bins, expected_bins)