# test_cut.py: unit tests for pandas.cut
import numpy as np
import pytest

import pandas as pd
from pandas import (
    Categorical, DataFrame, DatetimeIndex, Index, Interval, IntervalIndex,
    Series, TimedeltaIndex, Timestamp, cut, date_range, isna, qcut,
    timedelta_range, to_datetime)
from pandas.api.types import CategoricalDtype as CDT
import pandas.core.reshape.tile as tmod

try:
    import pandas._testing as tm
except ImportError:  # older pandas: helpers only live at the legacy path
    import pandas.util.testing as tm
  11. def test_simple():
  12. data = np.ones(5, dtype="int64")
  13. result = cut(data, 4, labels=False)
  14. expected = np.array([1, 1, 1, 1, 1])
  15. tm.assert_numpy_array_equal(result, expected, check_dtype=False)
  16. def test_bins():
  17. data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1])
  18. result, bins = cut(data, 3, retbins=True)
  19. intervals = IntervalIndex.from_breaks(bins.round(3))
  20. intervals = intervals.take([0, 0, 0, 1, 2, 0])
  21. expected = Categorical(intervals, ordered=True)
  22. tm.assert_categorical_equal(result, expected)
  23. tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
  24. 6.53333333, 9.7]))
  25. def test_right():
  26. data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
  27. result, bins = cut(data, 4, right=True, retbins=True)
  28. intervals = IntervalIndex.from_breaks(bins.round(3))
  29. expected = Categorical(intervals, ordered=True)
  30. expected = expected.take([0, 0, 0, 2, 3, 0, 0])
  31. tm.assert_categorical_equal(result, expected)
  32. tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95, 7.325, 9.7]))
  33. def test_no_right():
  34. data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
  35. result, bins = cut(data, 4, right=False, retbins=True)
  36. intervals = IntervalIndex.from_breaks(bins.round(3), closed="left")
  37. intervals = intervals.take([0, 0, 0, 2, 3, 0, 1])
  38. expected = Categorical(intervals, ordered=True)
  39. tm.assert_categorical_equal(result, expected)
  40. tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95, 7.325, 9.7095]))
  41. def test_array_like():
  42. data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
  43. result, bins = cut(data, 3, retbins=True)
  44. intervals = IntervalIndex.from_breaks(bins.round(3))
  45. intervals = intervals.take([0, 0, 0, 1, 2, 0])
  46. expected = Categorical(intervals, ordered=True)
  47. tm.assert_categorical_equal(result, expected)
  48. tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
  49. 6.53333333, 9.7]))
  50. def test_bins_from_interval_index():
  51. c = cut(range(5), 3)
  52. expected = c
  53. result = cut(range(5), bins=expected.categories)
  54. tm.assert_categorical_equal(result, expected)
  55. expected = Categorical.from_codes(np.append(c.codes, -1),
  56. categories=c.categories,
  57. ordered=True)
  58. result = cut(range(6), bins=expected.categories)
  59. tm.assert_categorical_equal(result, expected)
  60. def test_bins_from_interval_index_doc_example():
  61. # Make sure we preserve the bins.
  62. ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])
  63. c = cut(ages, bins=[0, 18, 35, 70])
  64. expected = IntervalIndex.from_tuples([(0, 18), (18, 35), (35, 70)])
  65. tm.assert_index_equal(c.categories, expected)
  66. result = cut([25, 20, 50], bins=c.categories)
  67. tm.assert_index_equal(result.categories, expected)
  68. tm.assert_numpy_array_equal(result.codes,
  69. np.array([1, 1, 2], dtype="int8"))
  70. def test_bins_not_overlapping_from_interval_index():
  71. # see gh-23980
  72. msg = "Overlapping IntervalIndex is not accepted"
  73. ii = IntervalIndex.from_tuples([(0, 10), (2, 12), (4, 14)])
  74. with pytest.raises(ValueError, match=msg):
  75. cut([5, 6], bins=ii)
  76. def test_bins_not_monotonic():
  77. msg = "bins must increase monotonically"
  78. data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
  79. with pytest.raises(ValueError, match=msg):
  80. cut(data, [0.1, 1.5, 1, 10])
  81. def test_wrong_num_labels():
  82. msg = "Bin labels must be one fewer than the number of bin edges"
  83. data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
  84. with pytest.raises(ValueError, match=msg):
  85. cut(data, [0, 1, 10], labels=["foo", "bar", "baz"])
  86. @pytest.mark.parametrize("x,bins,msg", [
  87. ([], 2, "Cannot cut empty array"),
  88. ([1, 2, 3], 0.5, "`bins` should be a positive integer")
  89. ])
  90. def test_cut_corner(x, bins, msg):
  91. with pytest.raises(ValueError, match=msg):
  92. cut(x, bins)
  93. @pytest.mark.parametrize("arg", [2, np.eye(2), DataFrame(np.eye(2))])
  94. @pytest.mark.parametrize("cut_func", [cut, qcut])
  95. def test_cut_not_1d_arg(arg, cut_func):
  96. msg = "Input array must be 1 dimensional"
  97. with pytest.raises(ValueError, match=msg):
  98. cut_func(arg, 2)
  99. @pytest.mark.parametrize('data', [
  100. [0, 1, 2, 3, 4, np.inf],
  101. [-np.inf, 0, 1, 2, 3, 4],
  102. [-np.inf, 0, 1, 2, 3, 4, np.inf]])
  103. def test_int_bins_with_inf(data):
  104. # GH 24314
  105. msg = 'cannot specify integer `bins` when input data contains infinity'
  106. with pytest.raises(ValueError, match=msg):
  107. cut(data, bins=3)
  108. def test_cut_out_of_range_more():
  109. # see gh-1511
  110. name = "x"
  111. ser = Series([0, -1, 0, 1, -3], name=name)
  112. ind = cut(ser, [0, 1], labels=False)
  113. exp = Series([np.nan, np.nan, np.nan, 0, np.nan], name=name)
  114. tm.assert_series_equal(ind, exp)
  115. @pytest.mark.parametrize("right,breaks,closed", [
  116. (True, [-1e-3, 0.25, 0.5, 0.75, 1], "right"),
  117. (False, [0, 0.25, 0.5, 0.75, 1 + 1e-3], "left")
  118. ])
  119. def test_labels(right, breaks, closed):
  120. arr = np.tile(np.arange(0, 1.01, 0.1), 4)
  121. result, bins = cut(arr, 4, retbins=True, right=right)
  122. ex_levels = IntervalIndex.from_breaks(breaks, closed=closed)
  123. tm.assert_index_equal(result.categories, ex_levels)
  124. def test_cut_pass_series_name_to_factor():
  125. name = "foo"
  126. ser = Series(np.random.randn(100), name=name)
  127. factor = cut(ser, 4)
  128. assert factor.name == name
  129. def test_label_precision():
  130. arr = np.arange(0, 0.73, 0.01)
  131. result = cut(arr, 4, precision=2)
  132. ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, 0.54, 0.72])
  133. tm.assert_index_equal(result.categories, ex_levels)
  134. @pytest.mark.parametrize("labels", [None, False])
  135. def test_na_handling(labels):
  136. arr = np.arange(0, 0.75, 0.01)
  137. arr[::3] = np.nan
  138. result = cut(arr, 4, labels=labels)
  139. result = np.asarray(result)
  140. expected = np.where(isna(arr), np.nan, result)
  141. tm.assert_almost_equal(result, expected)
  142. def test_inf_handling():
  143. data = np.arange(6)
  144. data_ser = Series(data, dtype="int64")
  145. bins = [-np.inf, 2, 4, np.inf]
  146. result = cut(data, bins)
  147. result_ser = cut(data_ser, bins)
  148. ex_uniques = IntervalIndex.from_breaks(bins)
  149. tm.assert_index_equal(result.categories, ex_uniques)
  150. assert result[5] == Interval(4, np.inf)
  151. assert result[0] == Interval(-np.inf, 2)
  152. assert result_ser[5] == Interval(4, np.inf)
  153. assert result_ser[0] == Interval(-np.inf, 2)
  154. def test_cut_out_of_bounds():
  155. arr = np.random.randn(100)
  156. result = cut(arr, [-1, 0, 1])
  157. mask = isna(result)
  158. ex_mask = (arr < -1) | (arr > 1)
  159. tm.assert_numpy_array_equal(mask, ex_mask)
  160. @pytest.mark.parametrize("get_labels,get_expected", [
  161. (lambda labels: labels,
  162. lambda labels: Categorical(["Medium"] + 4 * ["Small"] +
  163. ["Medium", "Large"],
  164. categories=labels, ordered=True)),
  165. (lambda labels: Categorical.from_codes([0, 1, 2], labels),
  166. lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels))
  167. ])
  168. def test_cut_pass_labels(get_labels, get_expected):
  169. bins = [0, 25, 50, 100]
  170. arr = [50, 5, 10, 15, 20, 30, 70]
  171. labels = ["Small", "Medium", "Large"]
  172. result = cut(arr, bins, labels=get_labels(labels))
  173. tm.assert_categorical_equal(result, get_expected(labels))
  174. def test_cut_pass_labels_compat():
  175. # see gh-16459
  176. arr = [50, 5, 10, 15, 20, 30, 70]
  177. labels = ["Good", "Medium", "Bad"]
  178. result = cut(arr, 3, labels=labels)
  179. exp = cut(arr, 3, labels=Categorical(labels, categories=labels,
  180. ordered=True))
  181. tm.assert_categorical_equal(result, exp)
  182. @pytest.mark.parametrize("x", [np.arange(11.), np.arange(11.) / 1e10])
  183. def test_round_frac_just_works(x):
  184. # It works.
  185. cut(x, 2)
  186. @pytest.mark.parametrize("val,precision,expected", [
  187. (-117.9998, 3, -118),
  188. (117.9998, 3, 118),
  189. (117.9998, 2, 118),
  190. (0.000123456, 2, 0.00012)
  191. ])
  192. def test_round_frac(val, precision, expected):
  193. # see gh-1979
  194. result = tmod._round_frac(val, precision=precision)
  195. assert result == expected
  196. def test_cut_return_intervals():
  197. ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
  198. result = cut(ser, 3)
  199. exp_bins = np.linspace(0, 8, num=4).round(3)
  200. exp_bins[0] -= 0.008
  201. expected = Series(IntervalIndex.from_breaks(exp_bins, closed="right").take(
  202. [0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True))
  203. tm.assert_series_equal(result, expected)
  204. def test_series_ret_bins():
  205. # see gh-8589
  206. ser = Series(np.arange(4))
  207. result, bins = cut(ser, 2, retbins=True)
  208. expected = Series(IntervalIndex.from_breaks(
  209. [-0.003, 1.5, 3], closed="right").repeat(2)).astype(CDT(ordered=True))
  210. tm.assert_series_equal(result, expected)
  211. @pytest.mark.parametrize("kwargs,msg", [
  212. (dict(duplicates="drop"), None),
  213. (dict(), "Bin edges must be unique"),
  214. (dict(duplicates="raise"), "Bin edges must be unique"),
  215. (dict(duplicates="foo"), "invalid value for 'duplicates' parameter")
  216. ])
  217. def test_cut_duplicates_bin(kwargs, msg):
  218. # see gh-20947
  219. bins = [0, 2, 4, 6, 10, 10]
  220. values = Series(np.array([1, 3, 5, 7, 9]), index=["a", "b", "c", "d", "e"])
  221. if msg is not None:
  222. with pytest.raises(ValueError, match=msg):
  223. cut(values, bins, **kwargs)
  224. else:
  225. result = cut(values, bins, **kwargs)
  226. expected = cut(values, pd.unique(bins))
  227. tm.assert_series_equal(result, expected)
  228. @pytest.mark.parametrize("data", [9.0, -9.0, 0.0])
  229. @pytest.mark.parametrize("length", [1, 2])
  230. def test_single_bin(data, length):
  231. # see gh-14652, gh-15428
  232. ser = Series([data] * length)
  233. result = cut(ser, 1, labels=False)
  234. expected = Series([0] * length)
  235. tm.assert_series_equal(result, expected)
  236. @pytest.mark.parametrize(
  237. "array_1_writeable,array_2_writeable",
  238. [(True, True), (True, False), (False, False)])
  239. def test_cut_read_only(array_1_writeable, array_2_writeable):
  240. # issue 18773
  241. array_1 = np.arange(0, 100, 10)
  242. array_1.flags.writeable = array_1_writeable
  243. array_2 = np.arange(0, 100, 10)
  244. array_2.flags.writeable = array_2_writeable
  245. hundred_elements = np.arange(100)
  246. tm.assert_categorical_equal(cut(hundred_elements, array_1),
  247. cut(hundred_elements, array_2))
  248. @pytest.mark.parametrize("conv", [
  249. lambda v: Timestamp(v),
  250. lambda v: to_datetime(v),
  251. lambda v: np.datetime64(v),
  252. lambda v: Timestamp(v).to_pydatetime(),
  253. ])
  254. def test_datetime_bin(conv):
  255. data = [np.datetime64("2012-12-13"), np.datetime64("2012-12-15")]
  256. bin_data = ["2012-12-12", "2012-12-14", "2012-12-16"]
  257. expected = Series(IntervalIndex([
  258. Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])),
  259. Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2]))])).astype(
  260. CDT(ordered=True))
  261. bins = [conv(v) for v in bin_data]
  262. result = Series(cut(data, bins=bins))
  263. tm.assert_series_equal(result, expected)
  264. @pytest.mark.parametrize("data", [
  265. to_datetime(Series(["2013-01-01", "2013-01-02", "2013-01-03"])),
  266. [np.datetime64("2013-01-01"), np.datetime64("2013-01-02"),
  267. np.datetime64("2013-01-03")],
  268. np.array([np.datetime64("2013-01-01"), np.datetime64("2013-01-02"),
  269. np.datetime64("2013-01-03")]),
  270. DatetimeIndex(["2013-01-01", "2013-01-02", "2013-01-03"])
  271. ])
  272. def test_datetime_cut(data):
  273. # see gh-14714
  274. #
  275. # Testing time data when it comes in various collection types.
  276. result, _ = cut(data, 3, retbins=True)
  277. expected = Series(IntervalIndex([
  278. Interval(Timestamp("2012-12-31 23:57:07.200000"),
  279. Timestamp("2013-01-01 16:00:00")),
  280. Interval(Timestamp("2013-01-01 16:00:00"),
  281. Timestamp("2013-01-02 08:00:00")),
  282. Interval(Timestamp("2013-01-02 08:00:00"),
  283. Timestamp("2013-01-03 00:00:00"))])).astype(CDT(ordered=True))
  284. tm.assert_series_equal(Series(result), expected)
  285. @pytest.mark.parametrize("bins", [
  286. 3, [Timestamp("2013-01-01 04:57:07.200000"),
  287. Timestamp("2013-01-01 21:00:00"),
  288. Timestamp("2013-01-02 13:00:00"),
  289. Timestamp("2013-01-03 05:00:00")]])
  290. @pytest.mark.parametrize("box", [list, np.array, Index, Series])
  291. def test_datetime_tz_cut(bins, box):
  292. # see gh-19872
  293. tz = "US/Eastern"
  294. s = Series(date_range("20130101", periods=3, tz=tz))
  295. if not isinstance(bins, int):
  296. bins = box(bins)
  297. result = cut(s, bins)
  298. expected = Series(IntervalIndex([
  299. Interval(Timestamp("2012-12-31 23:57:07.200000", tz=tz),
  300. Timestamp("2013-01-01 16:00:00", tz=tz)),
  301. Interval(Timestamp("2013-01-01 16:00:00", tz=tz),
  302. Timestamp("2013-01-02 08:00:00", tz=tz)),
  303. Interval(Timestamp("2013-01-02 08:00:00", tz=tz),
  304. Timestamp("2013-01-03 00:00:00", tz=tz))])).astype(
  305. CDT(ordered=True))
  306. tm.assert_series_equal(result, expected)
  307. def test_datetime_nan_error():
  308. msg = "bins must be of datetime64 dtype"
  309. with pytest.raises(ValueError, match=msg):
  310. cut(date_range("20130101", periods=3), bins=[0, 2, 4])
  311. def test_datetime_nan_mask():
  312. result = cut(date_range("20130102", periods=5),
  313. bins=date_range("20130101", periods=2))
  314. mask = result.categories.isna()
  315. tm.assert_numpy_array_equal(mask, np.array([False]))
  316. mask = result.isna()
  317. tm.assert_numpy_array_equal(mask, np.array([False, True, True,
  318. True, True]))
  319. @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
  320. def test_datetime_cut_roundtrip(tz):
  321. # see gh-19891
  322. ser = Series(date_range("20180101", periods=3, tz=tz))
  323. result, result_bins = cut(ser, 2, retbins=True)
  324. expected = cut(ser, result_bins)
  325. tm.assert_series_equal(result, expected)
  326. expected_bins = DatetimeIndex(["2017-12-31 23:57:07.200000",
  327. "2018-01-02 00:00:00",
  328. "2018-01-03 00:00:00"])
  329. expected_bins = expected_bins.tz_localize(tz)
  330. tm.assert_index_equal(result_bins, expected_bins)
  331. def test_timedelta_cut_roundtrip():
  332. # see gh-19891
  333. ser = Series(timedelta_range("1day", periods=3))
  334. result, result_bins = cut(ser, 2, retbins=True)
  335. expected = cut(ser, result_bins)
  336. tm.assert_series_equal(result, expected)
  337. expected_bins = TimedeltaIndex(["0 days 23:57:07.200000",
  338. "2 days 00:00:00",
  339. "3 days 00:00:00"])
  340. tm.assert_index_equal(result_bins, expected_bins)