test_api.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508
  1. # -*- coding: utf-8 -*-
  2. import numpy as np
  3. import pytest
  4. from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series
  5. from pandas.core.arrays.categorical import _recode_for_categories
  6. from pandas.tests.arrays.categorical.common import TestCategorical
  7. import pandas.util.testing as tm
  8. class TestCategoricalAPI(object):
  9. def test_ordered_api(self):
  10. # GH 9347
  11. cat1 = Categorical(list('acb'), ordered=False)
  12. tm.assert_index_equal(cat1.categories, Index(['a', 'b', 'c']))
  13. assert not cat1.ordered
  14. cat2 = Categorical(list('acb'), categories=list('bca'), ordered=False)
  15. tm.assert_index_equal(cat2.categories, Index(['b', 'c', 'a']))
  16. assert not cat2.ordered
  17. cat3 = Categorical(list('acb'), ordered=True)
  18. tm.assert_index_equal(cat3.categories, Index(['a', 'b', 'c']))
  19. assert cat3.ordered
  20. cat4 = Categorical(list('acb'), categories=list('bca'), ordered=True)
  21. tm.assert_index_equal(cat4.categories, Index(['b', 'c', 'a']))
  22. assert cat4.ordered
  def test_set_ordered(self):
      """Toggle the ``ordered`` flag through every public entry point."""
      base = Categorical(["a", "b", "c", "a"], ordered=True)

      # non-inplace converters hand back an object with the requested flag
      derived = base.as_unordered()
      assert not derived.ordered
      derived = base.as_ordered()
      assert derived.ordered

      # inplace converters flip the flag on the object itself
      derived.as_unordered(inplace=True)
      assert not derived.ordered
      derived.as_ordered(inplace=True)
      assert derived.ordered

      # set_ordered, first returning a new object ...
      assert derived.set_ordered(True).ordered
      assert not derived.set_ordered(False).ordered

      # ... then modifying in place
      for flag in (True, False):
          derived.set_ordered(flag, inplace=True)
          assert bool(derived.ordered) == flag

      # removed in 0.19.0
      expected_msg = "can\'t set attribute"
      for flag in (True, False):
          with pytest.raises(AttributeError, match=expected_msg):
              base.ordered = flag
  def test_rename_categories(self):
      """Rename categories with a list and a callable, copying and in place."""
      cat = Categorical(["a", "b", "c", "a"])
      int_values = np.array([1, 2, 3, 1], dtype=np.int64)
      int_categories = Index([1, 2, 3])

      # inplace=False: the result carries the new labels ...
      renamed = cat.rename_categories([1, 2, 3])
      tm.assert_numpy_array_equal(renamed.__array__(), int_values)
      tm.assert_index_equal(renamed.categories, int_categories)

      # ... and the old one must not be changed
      tm.assert_numpy_array_equal(
          cat.__array__(),
          np.array(["a", "b", "c", "a"], dtype=np.object_))
      tm.assert_index_equal(cat.categories, Index(["a", "b", "c"]))

      # GH18862 (let rename_categories take callables)
      upper = cat.rename_categories(lambda x: x.upper())
      tm.assert_categorical_equal(upper,
                                  Categorical(["A", "B", "C", "A"]))

      # and now inplace
      outcome = cat.rename_categories([1, 2, 3], inplace=True)
      assert outcome is None
      tm.assert_numpy_array_equal(cat.__array__(), int_values)
      tm.assert_index_equal(cat.categories, int_categories)

      # Lengthen, then shorten: both must be rejected
      for wrong_length in ([1, 2, 3, 4], [1, 2]):
          with pytest.raises(ValueError):
              cat.rename_categories(wrong_length)
  72. def test_rename_categories_series(self):
  73. # https://github.com/pandas-dev/pandas/issues/17981
  74. c = Categorical(['a', 'b'])
  75. xpr = "Treating Series 'new_categories' as a list-like "
  76. with tm.assert_produces_warning(FutureWarning) as rec:
  77. result = c.rename_categories(Series([0, 1]))
  78. assert len(rec) == 1
  79. assert xpr in str(rec[0].message)
  80. expected = Categorical([0, 1])
  81. tm.assert_categorical_equal(result, expected)
  82. def test_rename_categories_dict(self):
  83. # GH 17336
  84. cat = Categorical(['a', 'b', 'c', 'd'])
  85. res = cat.rename_categories({'a': 4, 'b': 3, 'c': 2, 'd': 1})
  86. expected = Index([4, 3, 2, 1])
  87. tm.assert_index_equal(res.categories, expected)
  88. # Test for inplace
  89. res = cat.rename_categories({'a': 4, 'b': 3, 'c': 2, 'd': 1},
  90. inplace=True)
  91. assert res is None
  92. tm.assert_index_equal(cat.categories, expected)
  93. # Test for dicts of smaller length
  94. cat = Categorical(['a', 'b', 'c', 'd'])
  95. res = cat.rename_categories({'a': 1, 'c': 3})
  96. expected = Index([1, 'b', 3, 'd'])
  97. tm.assert_index_equal(res.categories, expected)
  98. # Test for dicts with bigger length
  99. cat = Categorical(['a', 'b', 'c', 'd'])
  100. res = cat.rename_categories({'a': 1, 'b': 2, 'c': 3,
  101. 'd': 4, 'e': 5, 'f': 6})
  102. expected = Index([1, 2, 3, 4])
  103. tm.assert_index_equal(res.categories, expected)
  104. # Test for dicts with no items from old categories
  105. cat = Categorical(['a', 'b', 'c', 'd'])
  106. res = cat.rename_categories({'f': 1, 'g': 3})
  107. expected = Index(['a', 'b', 'c', 'd'])
  108. tm.assert_index_equal(res.categories, expected)
  def test_reorder_categories(self):
      """Reorder categories as a copy and in place; reject bad orders."""
      cat = Categorical(["a", "b", "c", "a"], ordered=True)
      snapshot = cat.copy()
      reordered = Categorical(["a", "b", "c", "a"],
                              categories=["c", "b", "a"], ordered=True)

      # first inplace == False
      outcome = cat.reorder_categories(["c", "b", "a"])
      # cat must be the same as before
      tm.assert_categorical_equal(cat, snapshot)
      # only the returned object is changed
      tm.assert_categorical_equal(outcome, reordered)

      # inplace == True
      outcome = cat.reorder_categories(["c", "b", "a"], inplace=True)
      assert outcome is None
      tm.assert_categorical_equal(cat, reordered)

      cat = Categorical(["a", "b", "c", "a"], ordered=True)
      rejected_orders = (
          # not all "old" included in "new"
          ["a"],
          # still not all "old" in "new"
          ["a", "b", "d"],
          # all "old" included in "new", but too long
          ["a", "b", "c", "d"],
      )
      for candidate in rejected_orders:
          with pytest.raises(ValueError):
              cat.reorder_categories(candidate)
  def test_add_categories(self):
      """Add categories as a copy, in place, and from several containers."""
      cat = Categorical(["a", "b", "c", "a"], ordered=True)
      snapshot = cat.copy()
      extended = Categorical(["a", "b", "c", "a"],
                             categories=["a", "b", "c", "d"], ordered=True)

      # first inplace == False, with a scalar and with a one-element list
      for addition in ("d", ["d"]):
          outcome = cat.add_categories(addition)
          tm.assert_categorical_equal(cat, snapshot)
          tm.assert_categorical_equal(outcome, extended)

      # inplace == True
      outcome = cat.add_categories("d", inplace=True)
      tm.assert_categorical_equal(cat, extended)
      assert outcome is None

      # new is in old categories
      with pytest.raises(ValueError):
          cat.add_categories(["d"])

      # GH 9927
      cat = Categorical(list("abc"), ordered=True)
      expected = Categorical(
          list("abc"), categories=list("abcde"), ordered=True)

      # test with Series, np.array, index, list
      for container in (Series, np.array, Index, list):
          outcome = cat.add_categories(container(["d", "e"]))
          tm.assert_categorical_equal(outcome, expected)
  166. def test_set_categories(self):
  167. cat = Categorical(["a", "b", "c", "a"], ordered=True)
  168. exp_categories = Index(["c", "b", "a"])
  169. exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
  170. res = cat.set_categories(["c", "b", "a"], inplace=True)
  171. tm.assert_index_equal(cat.categories, exp_categories)
  172. tm.assert_numpy_array_equal(cat.__array__(), exp_values)
  173. assert res is None
  174. res = cat.set_categories(["a", "b", "c"])
  175. # cat must be the same as before
  176. tm.assert_index_equal(cat.categories, exp_categories)
  177. tm.assert_numpy_array_equal(cat.__array__(), exp_values)
  178. # only res is changed
  179. exp_categories_back = Index(["a", "b", "c"])
  180. tm.assert_index_equal(res.categories, exp_categories_back)
  181. tm.assert_numpy_array_equal(res.__array__(), exp_values)
  182. # not all "old" included in "new" -> all not included ones are now
  183. # np.nan
  184. cat = Categorical(["a", "b", "c", "a"], ordered=True)
  185. res = cat.set_categories(["a"])
  186. tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0],
  187. dtype=np.int8))
  188. # still not all "old" in "new"
  189. res = cat.set_categories(["a", "b", "d"])
  190. tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0],
  191. dtype=np.int8))
  192. tm.assert_index_equal(res.categories, Index(["a", "b", "d"]))
  193. # all "old" included in "new"
  194. cat = cat.set_categories(["a", "b", "c", "d"])
  195. exp_categories = Index(["a", "b", "c", "d"])
  196. tm.assert_index_equal(cat.categories, exp_categories)
  197. # internals...
  198. c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True)
  199. tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0],
  200. dtype=np.int8))
  201. tm.assert_index_equal(c.categories, Index([1, 2, 3, 4]))
  202. exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
  203. tm.assert_numpy_array_equal(c.get_values(), exp)
  204. # all "pointers" to '4' must be changed from 3 to 0,...
  205. c = c.set_categories([4, 3, 2, 1])
  206. # positions are changed
  207. tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3],
  208. dtype=np.int8))
  209. # categories are now in new order
  210. tm.assert_index_equal(c.categories, Index([4, 3, 2, 1]))
  211. # output is the same
  212. exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
  213. tm.assert_numpy_array_equal(c.get_values(), exp)
  214. assert c.min() == 4
  215. assert c.max() == 1
  216. # set_categories should set the ordering if specified
  217. c2 = c.set_categories([4, 3, 2, 1], ordered=False)
  218. assert not c2.ordered
  219. tm.assert_numpy_array_equal(c.get_values(), c2.get_values())
  220. # set_categories should pass thru the ordering
  221. c2 = c.set_ordered(False).set_categories([4, 3, 2, 1])
  222. assert not c2.ordered
  223. tm.assert_numpy_array_equal(c.get_values(), c2.get_values())
  @pytest.mark.parametrize('values, categories, new_categories', [
      # No NaNs, same cats, same order
      (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),
      # No NaNs, same cats, different order
      (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],),
      # Same, unsorted
      (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],),
      # Unsorted, same cats, different order
      (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],),
      # NaNs ('c' is not among the categories)
      (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
      (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
      (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
      # was an exact duplicate of the row above; now covers the
      # unsorted input combined with the reversed category order
      (['b', 'a', 'c'], ['a', 'b'], ['b', 'a']),
      # Introduce NaNs (new categories drop a used label)
      (['a', 'b', 'c'], ['a', 'b'], ['a']),
      (['a', 'b', 'c'], ['a', 'b'], ['b']),
      (['b', 'a', 'c'], ['a', 'b'], ['a']),
      # was an exact duplicate of the row above; now covers the
      # unsorted input keeping only 'b'
      (['b', 'a', 'c'], ['a', 'b'], ['b']),
      # No overlap
      (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
  ])
  @pytest.mark.parametrize('ordered', [True, False])
  def test_set_categories_many(self, values, categories, new_categories,
                               ordered):
      """``set_categories`` must match constructing with the new categories.

      Parameters
      ----------
      values : list
          Raw values used to build the Categorical.
      categories : list
          Categories the Categorical starts with.
      new_categories : list
          Categories handed to ``set_categories``.
      ordered : bool
          Ordering flag passed to ``set_categories`` and to the
          expected Categorical.
      """
      c = Categorical(values, categories)
      expected = Categorical(values, categories=new_categories,
                             ordered=ordered)
      result = c.set_categories(new_categories, ordered=ordered)
      tm.assert_categorical_equal(result, expected)
  253. def test_set_categories_rename_less(self):
  254. # GH 24675
  255. cat = Categorical(['A', 'B'])
  256. result = cat.set_categories(['A'], rename=True)
  257. expected = Categorical(['A', np.nan])
  258. tm.assert_categorical_equal(result, expected)
  259. def test_set_categories_private(self):
  260. cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd'])
  261. cat._set_categories(['a', 'c', 'd', 'e'])
  262. expected = Categorical(['a', 'c', 'd'], categories=list('acde'))
  263. tm.assert_categorical_equal(cat, expected)
  264. # fastpath
  265. cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd'])
  266. cat._set_categories(['a', 'c', 'd', 'e'], fastpath=True)
  267. expected = Categorical(['a', 'c', 'd'], categories=list('acde'))
  268. tm.assert_categorical_equal(cat, expected)
  def test_remove_categories(self):
      """Remove a category as a copy and in place; reject unknown labels."""
      cat = Categorical(["a", "b", "c", "a"], ordered=True)
      snapshot = cat.copy()
      trimmed = Categorical(["a", "b", np.nan, "a"],
                            categories=["a", "b"], ordered=True)

      # first inplace == False, with a scalar and with a one-element list
      for removal in ("c", ["c"]):
          outcome = cat.remove_categories(removal)
          tm.assert_categorical_equal(cat, snapshot)
          tm.assert_categorical_equal(outcome, trimmed)

      # inplace == True
      outcome = cat.remove_categories("c", inplace=True)
      tm.assert_categorical_equal(cat, trimmed)
      assert outcome is None

      # removal is not in categories
      with pytest.raises(ValueError):
          cat.remove_categories(["c"])
  288. def test_remove_unused_categories(self):
  289. c = Categorical(["a", "b", "c", "d", "a"],
  290. categories=["a", "b", "c", "d", "e"])
  291. exp_categories_all = Index(["a", "b", "c", "d", "e"])
  292. exp_categories_dropped = Index(["a", "b", "c", "d"])
  293. tm.assert_index_equal(c.categories, exp_categories_all)
  294. res = c.remove_unused_categories()
  295. tm.assert_index_equal(res.categories, exp_categories_dropped)
  296. tm.assert_index_equal(c.categories, exp_categories_all)
  297. res = c.remove_unused_categories(inplace=True)
  298. tm.assert_index_equal(c.categories, exp_categories_dropped)
  299. assert res is None
  300. # with NaN values (GH11599)
  301. c = Categorical(["a", "b", "c", np.nan],
  302. categories=["a", "b", "c", "d", "e"])
  303. res = c.remove_unused_categories()
  304. tm.assert_index_equal(res.categories,
  305. Index(np.array(["a", "b", "c"])))
  306. exp_codes = np.array([0, 1, 2, -1], dtype=np.int8)
  307. tm.assert_numpy_array_equal(res.codes, exp_codes)
  308. tm.assert_index_equal(c.categories, exp_categories_all)
  309. val = ['F', np.nan, 'D', 'B', 'D', 'F', np.nan]
  310. cat = Categorical(values=val, categories=list('ABCDEFG'))
  311. out = cat.remove_unused_categories()
  312. tm.assert_index_equal(out.categories, Index(['B', 'D', 'F']))
  313. exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8)
  314. tm.assert_numpy_array_equal(out.codes, exp_codes)
  315. assert out.get_values().tolist() == val
  316. alpha = list('abcdefghijklmnopqrstuvwxyz')
  317. val = np.random.choice(alpha[::2], 10000).astype('object')
  318. val[np.random.choice(len(val), 100)] = np.nan
  319. cat = Categorical(values=val, categories=alpha)
  320. out = cat.remove_unused_categories()
  321. assert out.get_values().tolist() == val.tolist()
  322. class TestCategoricalAPIWithFactor(TestCategorical):
  def test_describe(self):
      """Compare ``describe()`` against hand-built counts/freqs frames."""
      factor = self.factor

      # string type
      summary = factor.describe()
      assert factor.ordered
      expected = DataFrame(
          {'counts': [3, 2, 3],
           'freqs': [3 / 8., 2 / 8., 3 / 8.]},
          index=CategoricalIndex(['a', 'b', 'c'], name='categories',
                                 ordered=factor.ordered))
      tm.assert_frame_equal(summary, expected)

      # check unused categories
      widened = factor.copy()
      widened.set_categories(["a", "b", "c", "d"], inplace=True)
      summary = widened.describe()
      expected = DataFrame(
          {'counts': [3, 2, 3, 0],
           'freqs': [3 / 8., 2 / 8., 3 / 8., 0]},
          index=CategoricalIndex(list('abcd'), ordered=factor.ordered,
                                 name='categories'))
      tm.assert_frame_equal(summary, expected)

      # check an integer one
      ints = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
      summary = ints.describe()
      expected = DataFrame(
          {'counts': [5, 3, 3],
           'freqs': [5 / 11., 3 / 11., 3 / 11.]},
          index=CategoricalIndex([1, 2, 3], ordered=ints.ordered,
                                 name='categories'))
      tm.assert_frame_equal(summary, expected)

      # https://github.com/pandas-dev/pandas/issues/3678
      # describe should work with NaN
      with_nan = Categorical([np.nan, 1, 2, 2])
      summary = with_nan.describe()
      expected = DataFrame(
          {'counts': [1, 2, 1],
           'freqs': [1 / 4., 2 / 4., 1 / 4.]},
          index=CategoricalIndex([1, 2, np.nan], categories=[1, 2],
                                 name='categories'))
      tm.assert_frame_equal(summary, expected)
  def test_set_categories_inplace(self):
      """In-place ``set_categories`` on a copy of the shared factor."""
      widened = self.factor.copy()
      widened.set_categories(['a', 'b', 'c', 'd'], inplace=True)
      tm.assert_index_equal(widened.categories, Index(list('abcd')))
  366. class TestPrivateCategoricalAPI(object):
  def test_codes_immutable(self):
      """``codes`` is read only, while the Categorical stays writeable."""

      def int8_codes(*codes):
          # expected code arrays all share the int8 dtype
          return np.array(codes, dtype='int8')

      # Codes should be read only
      cat = Categorical(["a", "b", "c", "a", np.nan])
      tm.assert_numpy_array_equal(cat.codes, int8_codes(0, 1, 2, 0, -1))

      # Assignments to codes should raise
      with pytest.raises(ValueError):
          cat.codes = int8_codes(0, 1, 2, 0, 1)

      # changes in the codes array should raise
      exposed = cat.codes
      with pytest.raises(ValueError):
          exposed[4] = 1

      # But even after getting the codes, the original array should
      # still be writeable!
      cat[4] = "a"
      tm.assert_numpy_array_equal(cat.codes, int8_codes(0, 1, 2, 0, 0))

      # ... and so is the private ``_codes`` array
      cat._codes[4] = 2
      tm.assert_numpy_array_equal(cat.codes, int8_codes(0, 1, 2, 0, 2))
  @pytest.mark.parametrize('codes, old, new, expected', [
      ([0, 1], ['a', 'b'], ['a', 'b'], [0, 1]),
      ([0, 1], ['b', 'a'], ['b', 'a'], [0, 1]),
      ([0, 1], ['a', 'b'], ['b', 'a'], [1, 0]),
      ([0, 1], ['b', 'a'], ['a', 'b'], [1, 0]),
      ([0, 1, 0, 1], ['a', 'b'], ['a', 'b', 'c'], [0, 1, 0, 1]),
      ([0, 1, 2, 2], ['a', 'b', 'c'], ['a', 'b'], [0, 1, -1, -1]),
      ([0, 1, -1], ['a', 'b', 'c'], ['a', 'b', 'c'], [0, 1, -1]),
      ([0, 1, -1], ['a', 'b', 'c'], ['b'], [-1, 0, -1]),
      ([0, 1, -1], ['a', 'b', 'c'], ['d'], [-1, -1, -1]),
      ([0, 1, -1], ['a', 'b', 'c'], [], [-1, -1, -1]),
      ([-1, -1], [], ['a', 'b'], [-1, -1]),
      ([1, 0], ['b', 'a'], ['a', 'b'], [0, 1]),
  ])
  def test_recode_to_categories(self, codes, old, new, expected):
      """Check ``_recode_for_categories`` against hand-computed codes."""
      # inputs and expectation are compared as int8 arrays
      recoded = _recode_for_categories(
          np.asanyarray(codes, dtype=np.int8), Index(old), Index(new))
      tm.assert_numpy_array_equal(
          recoded, np.asanyarray(expected, dtype=np.int8))
  408. def test_recode_to_categories_large(self):
  409. N = 1000
  410. codes = np.arange(N)
  411. old = Index(codes)
  412. expected = np.arange(N - 1, -1, -1, dtype=np.int16)
  413. new = Index(expected)
  414. result = _recode_for_categories(codes, old, new)
  415. tm.assert_numpy_array_equal(result, expected)