test_analytics.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. # -*- coding: utf-8 -*-
  2. import sys
  3. import numpy as np
  4. import pytest
  5. from pandas.compat import PYPY
  6. from pandas import Categorical, Index, Series
  7. from pandas.api.types import is_scalar
  8. import pandas.util.testing as tm
  9. class TestCategoricalAnalytics(object):
  def test_min_max(self):
      """Check ``Categorical.min``/``max`` across ordering and NaN cases.

      Scenarios, in order: an unordered categorical (both reductions must
      raise ``TypeError``), the default category order, an explicitly
      reversed category order, and data containing NaN with and without
      ``numeric_only=True`` (string and integer categories).
      """
      # unordered cats have no min/max
      cat = Categorical(["a", "b", "c", "d"], ordered=False)
      with pytest.raises(TypeError):
          cat.min()
      with pytest.raises(TypeError):
          cat.max()

      # no explicit categories given: the extremes are "a" and "d"
      cat = Categorical(["a", "b", "c", "d"], ordered=True)
      assert cat.min() == "a"
      assert cat.max() == "d"

      # the declared category order decides, so the extremes swap
      cat = Categorical(["a", "b", "c", "d"],
                        categories=['d', 'c', 'b', 'a'], ordered=True)
      assert cat.min() == "d"
      assert cat.max() == "a"

      # With NaN present the assertions expect min() to be NaN, while
      # numeric_only=True is expected to reduce over the non-missing
      # values only.
      cat = Categorical([np.nan, "b", "c", np.nan],
                        categories=['d', 'c', 'b', 'a'], ordered=True)
      assert np.isnan(cat.min())
      assert cat.max() == "b"
      assert cat.min(numeric_only=True) == "c"
      assert cat.max(numeric_only=True) == "b"

      # same expectations with integer categories in descending order
      cat = Categorical([np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1],
                        ordered=True)
      assert np.isnan(cat.min())
      assert cat.max() == 1
      assert cat.min(numeric_only=True) == 2
      assert cat.max(numeric_only=True) == 1
  @pytest.mark.parametrize("values,categories,exp_mode", [
      ([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]),
      ([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]),
      ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]),
      ([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]),
      ([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
      ([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4])])
  def test_mode(self, values, categories, exp_mode):
      """Compare ``Categorical.mode`` with the expected modal values.

      Both the input and the expectation are built as ordered categoricals
      over the same ``categories``; the parametrized cases include ties and
      inputs in which NaN is the most frequent entry.
      """
      expected = Categorical(exp_mode, categories=categories, ordered=True)
      observed = Categorical(values, categories=categories,
                             ordered=True).mode()
      tm.assert_categorical_equal(observed, expected)
  def test_searchsorted(self):
      """Check ``searchsorted`` on a Categorical and on a Series wrapping it.

      Covers scalar and list-like needles, ``side='right'``, needles that
      are not among the categories (``KeyError`` expected) and unordered
      categoricals (``ValueError`` expected).
      """
      # https://github.com/pandas-dev/pandas/issues/8420
      # https://github.com/pandas-dev/pandas/issues/14522
      c1 = Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'],
                       categories=['cheese', 'milk', 'apple', 'bread'],
                       ordered=True)
      s1 = Series(c1)
      c2 = Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'],
                       categories=['cheese', 'milk', 'apple', 'bread'],
                       ordered=False)
      s2 = Series(c2)

      # Searching for single item argument, side='left' (default)
      res_cat = c1.searchsorted('apple')
      assert res_cat == 2
      assert is_scalar(res_cat)

      res_ser = s1.searchsorted('apple')
      assert res_ser == 2
      assert is_scalar(res_ser)

      # Searching for single item array, side='left' (default)
      res_cat = c1.searchsorted(['bread'])
      res_ser = s1.searchsorted(['bread'])
      exp = np.array([3], dtype=np.intp)
      tm.assert_numpy_array_equal(res_cat, exp)
      tm.assert_numpy_array_equal(res_ser, exp)

      # Searching for several items array, side='right'
      res_cat = c1.searchsorted(['apple', 'bread'], side='right')
      res_ser = s1.searchsorted(['apple', 'bread'], side='right')
      exp = np.array([3, 5], dtype=np.intp)
      tm.assert_numpy_array_equal(res_cat, exp)
      tm.assert_numpy_array_equal(res_ser, exp)

      # Searching for a single value that is not from the Categorical
      with pytest.raises(KeyError):
          c1.searchsorted('cucumber')
      with pytest.raises(KeyError):
          s1.searchsorted('cucumber')

      # Searching for multiple values one of which is not from the
      # Categorical
      with pytest.raises(KeyError):
          c1.searchsorted(['bread', 'cucumber'])
      with pytest.raises(KeyError):
          s1.searchsorted(['bread', 'cucumber'])

      # searchsorted call for unordered Categorical
      with pytest.raises(ValueError):
          c2.searchsorted('apple')
      with pytest.raises(ValueError):
          s2.searchsorted('apple')
  def test_unique(self):
      """Check ``Categorical.unique`` on unordered categoricals.

      The assertions expect the result's categories to contain only the
      values actually present, listed in order of first appearance, and
      NaN to remain a value without ever becoming a category.
      """
      # every category is used and already in appearance order
      plain = Categorical(["a", "b"])
      ab = Index(["a", "b"])
      uniques = plain.unique()
      tm.assert_index_equal(uniques.categories, ab)
      tm.assert_categorical_equal(uniques, plain)

      # the unused category "c" is expected to disappear
      with_unused = Categorical(["a", "b", "a", "a"],
                                categories=["a", "b", "c"])
      uniques = with_unused.unique()
      tm.assert_index_equal(uniques.categories, ab)
      tm.assert_categorical_equal(uniques, Categorical(ab))

      # categories are expected in appearance order, not declared order
      shuffled = Categorical(["c", "a", "b", "a", "a"],
                             categories=["a", "b", "c"])
      cab = Index(["c", "a", "b"])
      uniques = shuffled.unique()
      tm.assert_index_equal(uniques.categories, cab)
      tm.assert_categorical_equal(
          uniques, Categorical(cab, categories=['c', 'a', 'b']))

      # nan must be removed from the categories but kept among the values
      with_nan = Categorical(["b", np.nan, "b", np.nan, "a"],
                             categories=["a", "b", "c"])
      uniques = with_nan.unique()
      tm.assert_index_equal(uniques.categories, Index(["b", "a"]))
      tm.assert_categorical_equal(
          uniques, Categorical(["b", np.nan, "a"], categories=["b", "a"]))
  def test_unique_ordered(self):
      """Check ``Categorical.unique`` on ordered categoricals.

      Each scenario is a row of ``(values, categories, expected unique
      values, expected categories)``. The expected categories keep their
      declared relative order, and the rows with an unused category expect
      it to be dropped.
      """
      # keep categories order when ordered=True
      scenarios = [
          (['b', 'a', 'b'], ['a', 'b'],
           ['b', 'a'], ['a', 'b']),
          (['c', 'b', 'a', 'a'], ['a', 'b', 'c'],
           ['c', 'b', 'a'], ['a', 'b', 'c']),
          (['b', 'a', 'a'], ['a', 'b', 'c'],
           ['b', 'a'], ['a', 'b']),
          (['b', 'b', np.nan, 'a'], ['a', 'b', 'c'],
           ['b', np.nan, 'a'], ['a', 'b']),
      ]
      for values, categories, uniq_values, uniq_categories in scenarios:
          source = Categorical(values, categories=categories, ordered=True)
          observed = source.unique()
          expected = Categorical(uniq_values, categories=uniq_categories,
                                 ordered=True)
          tm.assert_categorical_equal(observed, expected)
  def test_unique_index_series(self):
      """Check that ``unique`` agrees across Categorical, Index and Series.

      For each input the same expected categorical must come back from
      ``Categorical.unique``, from ``Index(...).unique()`` (compared as an
      Index) and from ``Series(...).unique()``.
      """
      def _check_all_containers(cat, expected):
          # one expectation, three containers
          tm.assert_categorical_equal(cat.unique(), expected)
          tm.assert_index_equal(Index(cat).unique(), Index(expected))
          tm.assert_categorical_equal(Series(cat).unique(), expected)

      # Categorical.unique sorts categories by appearance order
      # if ordered=False
      _check_all_containers(
          Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1]),
          Categorical([3, 1, 2], categories=[3, 1, 2]))

      # the unused category 3 is not expected in the result
      _check_all_containers(
          Categorical([1, 1, 2, 2], categories=[3, 2, 1]),
          Categorical([1, 2], categories=[1, 2]))

      # Categorical.unique keeps categories order if ordered=True
      _check_all_containers(
          Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True),
          Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True))
  167. def test_shift(self):
  168. # GH 9416
  169. cat = Categorical(['a', 'b', 'c', 'd', 'a'])
  170. # shift forward
  171. sp1 = cat.shift(1)
  172. xp1 = Categorical([np.nan, 'a', 'b', 'c', 'd'])
  173. tm.assert_categorical_equal(sp1, xp1)
  174. tm.assert_categorical_equal(cat[:-1], sp1[1:])
  175. # shift back
  176. sn2 = cat.shift(-2)
  177. xp2 = Categorical(['c', 'd', 'a', np.nan, np.nan],
  178. categories=['a', 'b', 'c', 'd'])
  179. tm.assert_categorical_equal(sn2, xp2)
  180. tm.assert_categorical_equal(cat[2:], sn2[:-2])
  181. # shift by zero
  182. tm.assert_categorical_equal(cat, cat.shift(0))
  183. def test_nbytes(self):
  184. cat = Categorical([1, 2, 3])
  185. exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories
  186. assert cat.nbytes == exp
  187. def test_memory_usage(self):
  188. cat = Categorical([1, 2, 3])
  189. # .categories is an index, so we include the hashtable
  190. assert 0 < cat.nbytes <= cat.memory_usage()
  191. assert 0 < cat.nbytes <= cat.memory_usage(deep=True)
  192. cat = Categorical(['foo', 'foo', 'bar'])
  193. assert cat.memory_usage(deep=True) > cat.nbytes
  194. if not PYPY:
  195. # sys.getsizeof will call the .memory_usage with
  196. # deep=True, and add on some GC overhead
  197. diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
  198. assert abs(diff) < 100
  199. def test_map(self):
  200. c = Categorical(list('ABABC'), categories=list('CBA'), ordered=True)
  201. result = c.map(lambda x: x.lower())
  202. exp = Categorical(list('ababc'), categories=list('cba'), ordered=True)
  203. tm.assert_categorical_equal(result, exp)
  204. c = Categorical(list('ABABC'), categories=list('ABC'), ordered=False)
  205. result = c.map(lambda x: x.lower())
  206. exp = Categorical(list('ababc'), categories=list('abc'), ordered=False)
  207. tm.assert_categorical_equal(result, exp)
  208. result = c.map(lambda x: 1)
  209. # GH 12766: Return an index not an array
  210. tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64)))
  def test_validate_inplace(self):
      """Check that a non-boolean ``inplace`` argument raises ``ValueError``.

      Each Categorical method listed below is called with every invalid
      ``inplace`` candidate in turn, and each call must raise.
      """
      cat = Categorical(['A', 'B', 'B', 'C', 'A'])

      # the methods under test, each taking the candidate ``inplace`` flag
      calls = [
          lambda flag: cat.set_ordered(value=True, inplace=flag),
          lambda flag: cat.as_ordered(inplace=flag),
          lambda flag: cat.as_unordered(inplace=flag),
          lambda flag: cat.set_categories(['X', 'Y', 'Z'], rename=True,
                                          inplace=flag),
          lambda flag: cat.rename_categories(['X', 'Y', 'Z'], inplace=flag),
          lambda flag: cat.reorder_categories(['X', 'Y', 'Z'], ordered=True,
                                              inplace=flag),
          lambda flag: cat.add_categories(new_categories=['D', 'E', 'F'],
                                          inplace=flag),
          lambda flag: cat.remove_categories(removals=['D', 'E', 'F'],
                                             inplace=flag),
          lambda flag: cat.remove_unused_categories(inplace=flag),
          lambda flag: cat.sort_values(inplace=flag),
      ]

      for bad_flag in [1, "True", [1, 2, 3], 5.0]:
          for call in calls:
              with pytest.raises(ValueError):
                  call(bad_flag)
  237. def test_isna(self):
  238. exp = np.array([False, False, True])
  239. c = Categorical(["a", "b", np.nan])
  240. res = c.isna()
  241. tm.assert_numpy_array_equal(res, exp)