# test_union_categoricals.py
import numpy as np
import pytest

from pandas.core.dtypes.concat import union_categoricals

import pandas as pd
from pandas import Categorical, CategoricalIndex, Series
from pandas.util import testing as tm
  7. class TestUnionCategoricals(object):
  8. def test_union_categorical(self):
  9. # GH 13361
  10. data = [
  11. (list('abc'), list('abd'), list('abcabd')),
  12. ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
  13. ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
  14. (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'],
  15. ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']),
  16. (pd.date_range('2014-01-01', '2014-01-05'),
  17. pd.date_range('2014-01-06', '2014-01-07'),
  18. pd.date_range('2014-01-01', '2014-01-07')),
  19. (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'),
  20. pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'),
  21. pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')),
  22. (pd.period_range('2014-01-01', '2014-01-05'),
  23. pd.period_range('2014-01-06', '2014-01-07'),
  24. pd.period_range('2014-01-01', '2014-01-07')),
  25. ]
  26. for a, b, combined in data:
  27. for box in [Categorical, CategoricalIndex, Series]:
  28. result = union_categoricals([box(Categorical(a)),
  29. box(Categorical(b))])
  30. expected = Categorical(combined)
  31. tm.assert_categorical_equal(result, expected,
  32. check_category_order=True)
  33. # new categories ordered by appearance
  34. s = Categorical(['x', 'y', 'z'])
  35. s2 = Categorical(['a', 'b', 'c'])
  36. result = union_categoricals([s, s2])
  37. expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
  38. categories=['x', 'y', 'z', 'a', 'b', 'c'])
  39. tm.assert_categorical_equal(result, expected)
  40. s = Categorical([0, 1.2, 2], ordered=True)
  41. s2 = Categorical([0, 1.2, 2], ordered=True)
  42. result = union_categoricals([s, s2])
  43. expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
  44. tm.assert_categorical_equal(result, expected)
  45. # must exactly match types
  46. s = Categorical([0, 1.2, 2])
  47. s2 = Categorical([2, 3, 4])
  48. msg = 'dtype of categories must be the same'
  49. with pytest.raises(TypeError, match=msg):
  50. union_categoricals([s, s2])
  51. msg = 'No Categoricals to union'
  52. with pytest.raises(ValueError, match=msg):
  53. union_categoricals([])
  54. def test_union_categoricals_nan(self):
  55. # GH 13759
  56. res = union_categoricals([pd.Categorical([1, 2, np.nan]),
  57. pd.Categorical([3, 2, np.nan])])
  58. exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
  59. tm.assert_categorical_equal(res, exp)
  60. res = union_categoricals([pd.Categorical(['A', 'B']),
  61. pd.Categorical(['B', 'B', np.nan])])
  62. exp = Categorical(['A', 'B', 'B', 'B', np.nan])
  63. tm.assert_categorical_equal(res, exp)
  64. val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'),
  65. pd.NaT]
  66. val2 = [pd.NaT, pd.Timestamp('2011-01-01'),
  67. pd.Timestamp('2011-02-01')]
  68. res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)])
  69. exp = Categorical(val1 + val2,
  70. categories=[pd.Timestamp('2011-01-01'),
  71. pd.Timestamp('2011-03-01'),
  72. pd.Timestamp('2011-02-01')])
  73. tm.assert_categorical_equal(res, exp)
  74. # all NaN
  75. res = union_categoricals([pd.Categorical(np.array([np.nan, np.nan],
  76. dtype=object)),
  77. pd.Categorical(['X'])])
  78. exp = Categorical([np.nan, np.nan, 'X'])
  79. tm.assert_categorical_equal(res, exp)
  80. res = union_categoricals([pd.Categorical([np.nan, np.nan]),
  81. pd.Categorical([np.nan, np.nan])])
  82. exp = Categorical([np.nan, np.nan, np.nan, np.nan])
  83. tm.assert_categorical_equal(res, exp)
  84. def test_union_categoricals_empty(self):
  85. # GH 13759
  86. res = union_categoricals([pd.Categorical([]),
  87. pd.Categorical([])])
  88. exp = Categorical([])
  89. tm.assert_categorical_equal(res, exp)
  90. res = union_categoricals([Categorical([]),
  91. Categorical(['1'])])
  92. exp = Categorical(['1'])
  93. tm.assert_categorical_equal(res, exp)
  94. def test_union_categorical_same_category(self):
  95. # check fastpath
  96. c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
  97. c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
  98. res = union_categoricals([c1, c2])
  99. exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
  100. categories=[1, 2, 3, 4])
  101. tm.assert_categorical_equal(res, exp)
  102. c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z'])
  103. c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z'])
  104. res = union_categoricals([c1, c2])
  105. exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
  106. categories=['x', 'y', 'z'])
  107. tm.assert_categorical_equal(res, exp)
  108. def test_union_categorical_same_categories_different_order(self):
  109. # https://github.com/pandas-dev/pandas/issues/19096
  110. c1 = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c'])
  111. c2 = Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c'])
  112. result = union_categoricals([c1, c2])
  113. expected = Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
  114. categories=['a', 'b', 'c'])
  115. tm.assert_categorical_equal(result, expected)
  116. def test_union_categoricals_ordered(self):
  117. c1 = Categorical([1, 2, 3], ordered=True)
  118. c2 = Categorical([1, 2, 3], ordered=False)
  119. msg = 'Categorical.ordered must be the same'
  120. with pytest.raises(TypeError, match=msg):
  121. union_categoricals([c1, c2])
  122. res = union_categoricals([c1, c1])
  123. exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
  124. tm.assert_categorical_equal(res, exp)
  125. c1 = Categorical([1, 2, 3, np.nan], ordered=True)
  126. c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
  127. res = union_categoricals([c1, c2])
  128. exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
  129. tm.assert_categorical_equal(res, exp)
  130. c1 = Categorical([1, 2, 3], ordered=True)
  131. c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
  132. msg = "to union ordered Categoricals, all categories must be the same"
  133. with pytest.raises(TypeError, match=msg):
  134. union_categoricals([c1, c2])
  135. def test_union_categoricals_ignore_order(self):
  136. # GH 15219
  137. c1 = Categorical([1, 2, 3], ordered=True)
  138. c2 = Categorical([1, 2, 3], ordered=False)
  139. res = union_categoricals([c1, c2], ignore_order=True)
  140. exp = Categorical([1, 2, 3, 1, 2, 3])
  141. tm.assert_categorical_equal(res, exp)
  142. msg = 'Categorical.ordered must be the same'
  143. with pytest.raises(TypeError, match=msg):
  144. union_categoricals([c1, c2], ignore_order=False)
  145. res = union_categoricals([c1, c1], ignore_order=True)
  146. exp = Categorical([1, 2, 3, 1, 2, 3])
  147. tm.assert_categorical_equal(res, exp)
  148. res = union_categoricals([c1, c1], ignore_order=False)
  149. exp = Categorical([1, 2, 3, 1, 2, 3],
  150. categories=[1, 2, 3], ordered=True)
  151. tm.assert_categorical_equal(res, exp)
  152. c1 = Categorical([1, 2, 3, np.nan], ordered=True)
  153. c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
  154. res = union_categoricals([c1, c2], ignore_order=True)
  155. exp = Categorical([1, 2, 3, np.nan, 3, 2])
  156. tm.assert_categorical_equal(res, exp)
  157. c1 = Categorical([1, 2, 3], ordered=True)
  158. c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
  159. res = union_categoricals([c1, c2], ignore_order=True)
  160. exp = Categorical([1, 2, 3, 1, 2, 3])
  161. tm.assert_categorical_equal(res, exp)
  162. res = union_categoricals([c2, c1], ignore_order=True,
  163. sort_categories=True)
  164. exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
  165. tm.assert_categorical_equal(res, exp)
  166. c1 = Categorical([1, 2, 3], ordered=True)
  167. c2 = Categorical([4, 5, 6], ordered=True)
  168. result = union_categoricals([c1, c2], ignore_order=True)
  169. expected = Categorical([1, 2, 3, 4, 5, 6])
  170. tm.assert_categorical_equal(result, expected)
  171. msg = "to union ordered Categoricals, all categories must be the same"
  172. with pytest.raises(TypeError, match=msg):
  173. union_categoricals([c1, c2], ignore_order=False)
  174. with pytest.raises(TypeError, match=msg):
  175. union_categoricals([c1, c2])
  176. def test_union_categoricals_sort(self):
  177. # GH 13846
  178. c1 = Categorical(['x', 'y', 'z'])
  179. c2 = Categorical(['a', 'b', 'c'])
  180. result = union_categoricals([c1, c2], sort_categories=True)
  181. expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
  182. categories=['a', 'b', 'c', 'x', 'y', 'z'])
  183. tm.assert_categorical_equal(result, expected)
  184. # fastpath
  185. c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
  186. c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
  187. result = union_categoricals([c1, c2], sort_categories=True)
  188. expected = Categorical(['a', 'b', 'b', 'c'],
  189. categories=['a', 'b', 'c'])
  190. tm.assert_categorical_equal(result, expected)
  191. c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b'])
  192. c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b'])
  193. result = union_categoricals([c1, c2], sort_categories=True)
  194. expected = Categorical(['a', 'b', 'b', 'c'],
  195. categories=['a', 'b', 'c'])
  196. tm.assert_categorical_equal(result, expected)
  197. # fastpath - skip resort
  198. c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
  199. c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
  200. result = union_categoricals([c1, c2], sort_categories=True)
  201. expected = Categorical(['a', 'b', 'b', 'c'],
  202. categories=['a', 'b', 'c'])
  203. tm.assert_categorical_equal(result, expected)
  204. c1 = Categorical(['x', np.nan])
  205. c2 = Categorical([np.nan, 'b'])
  206. result = union_categoricals([c1, c2], sort_categories=True)
  207. expected = Categorical(['x', np.nan, np.nan, 'b'],
  208. categories=['b', 'x'])
  209. tm.assert_categorical_equal(result, expected)
  210. c1 = Categorical([np.nan])
  211. c2 = Categorical([np.nan])
  212. result = union_categoricals([c1, c2], sort_categories=True)
  213. expected = Categorical([np.nan, np.nan])
  214. tm.assert_categorical_equal(result, expected)
  215. c1 = Categorical([])
  216. c2 = Categorical([])
  217. result = union_categoricals([c1, c2], sort_categories=True)
  218. expected = Categorical([])
  219. tm.assert_categorical_equal(result, expected)
  220. c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
  221. c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
  222. with pytest.raises(TypeError):
  223. union_categoricals([c1, c2], sort_categories=True)
  224. def test_union_categoricals_sort_false(self):
  225. # GH 13846
  226. c1 = Categorical(['x', 'y', 'z'])
  227. c2 = Categorical(['a', 'b', 'c'])
  228. result = union_categoricals([c1, c2], sort_categories=False)
  229. expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
  230. categories=['x', 'y', 'z', 'a', 'b', 'c'])
  231. tm.assert_categorical_equal(result, expected)
  232. # fastpath
  233. c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
  234. c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
  235. result = union_categoricals([c1, c2], sort_categories=False)
  236. expected = Categorical(['a', 'b', 'b', 'c'],
  237. categories=['b', 'a', 'c'])
  238. tm.assert_categorical_equal(result, expected)
  239. # fastpath - skip resort
  240. c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
  241. c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
  242. result = union_categoricals([c1, c2], sort_categories=False)
  243. expected = Categorical(['a', 'b', 'b', 'c'],
  244. categories=['a', 'b', 'c'])
  245. tm.assert_categorical_equal(result, expected)
  246. c1 = Categorical(['x', np.nan])
  247. c2 = Categorical([np.nan, 'b'])
  248. result = union_categoricals([c1, c2], sort_categories=False)
  249. expected = Categorical(['x', np.nan, np.nan, 'b'],
  250. categories=['x', 'b'])
  251. tm.assert_categorical_equal(result, expected)
  252. c1 = Categorical([np.nan])
  253. c2 = Categorical([np.nan])
  254. result = union_categoricals([c1, c2], sort_categories=False)
  255. expected = Categorical([np.nan, np.nan])
  256. tm.assert_categorical_equal(result, expected)
  257. c1 = Categorical([])
  258. c2 = Categorical([])
  259. result = union_categoricals([c1, c2], sort_categories=False)
  260. expected = Categorical([])
  261. tm.assert_categorical_equal(result, expected)
  262. c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
  263. c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
  264. result = union_categoricals([c1, c2], sort_categories=False)
  265. expected = Categorical(['b', 'a', 'a', 'c'],
  266. categories=['b', 'a', 'c'], ordered=True)
  267. tm.assert_categorical_equal(result, expected)
  268. def test_union_categorical_unwrap(self):
  269. # GH 14173
  270. c1 = Categorical(['a', 'b'])
  271. c2 = pd.Series(['b', 'c'], dtype='category')
  272. result = union_categoricals([c1, c2])
  273. expected = Categorical(['a', 'b', 'b', 'c'])
  274. tm.assert_categorical_equal(result, expected)
  275. c2 = CategoricalIndex(c2)
  276. result = union_categoricals([c1, c2])
  277. tm.assert_categorical_equal(result, expected)
  278. c1 = Series(c1)
  279. result = union_categoricals([c1, c2])
  280. tm.assert_categorical_equal(result, expected)
  281. with pytest.raises(TypeError):
  282. union_categoricals([c1, ['a', 'b', 'c']])