123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346 |
- import numpy as np
- import pytest
- from pandas.core.dtypes.concat import union_categoricals
- import pandas as pd
- from pandas import Categorical, CategoricalIndex, Series
- from pandas.util import testing as tm
class TestUnionCategoricals(object):
    """Tests for ``union_categoricals``.

    Covers plain unions, missing values, empty inputs, the identical-category
    fastpath, ordered inputs, and the ``ignore_order`` / ``sort_categories``
    options, plus unwrapping of ``Series`` and ``CategoricalIndex`` inputs.
    """

    def test_union_categorical(self):
        """Union equals a Categorical built from the concatenated data.

        Each case is run with the inputs boxed as ``Categorical``,
        ``CategoricalIndex`` and ``Series``. Also checks that new categories
        are appended in order of appearance, that ordered inputs with the
        same categories can be unioned, and the two error paths (mismatched
        category dtype, empty input list).
        """
        # GH 13361
        data = [
            (list('abc'), list('abd'), list('abcabd')),
            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),

            (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'],
             ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']),

            (pd.date_range('2014-01-01', '2014-01-05'),
             pd.date_range('2014-01-06', '2014-01-07'),
             pd.date_range('2014-01-01', '2014-01-07')),

            (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'),
             pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'),
             pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')),

            (pd.period_range('2014-01-01', '2014-01-05'),
             pd.period_range('2014-01-06', '2014-01-07'),
             pd.period_range('2014-01-01', '2014-01-07')),
        ]

        for a, b, combined in data:
            for box in [Categorical, CategoricalIndex, Series]:
                result = union_categoricals([box(Categorical(a)),
                                             box(Categorical(b))])
                expected = Categorical(combined)
                tm.assert_categorical_equal(result, expected,
                                            check_category_order=True)

        # new categories ordered by appearance
        s = Categorical(['x', 'y', 'z'])
        s2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([s, s2])
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['x', 'y', 'z', 'a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        s = Categorical([0, 1.2, 2], ordered=True)
        s2 = Categorical([0, 1.2, 2], ordered=True)
        result = union_categoricals([s, s2])
        expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
        tm.assert_categorical_equal(result, expected)

        # must exactly match types
        s = Categorical([0, 1.2, 2])
        s2 = Categorical([2, 3, 4])
        msg = 'dtype of categories must be the same'
        with pytest.raises(TypeError, match=msg):
            union_categoricals([s, s2])

        msg = 'No Categoricals to union'
        with pytest.raises(ValueError, match=msg):
            union_categoricals([])

    def test_union_categoricals_nan(self):
        """Inputs containing NaN / NaT union to the concatenated values."""
        # GH 13759
        res = union_categoricals([pd.Categorical([1, 2, np.nan]),
                                  pd.Categorical([3, 2, np.nan])])
        exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical(['A', 'B']),
                                  pd.Categorical(['B', 'B', np.nan])])
        exp = Categorical(['A', 'B', 'B', 'B', np.nan])
        tm.assert_categorical_equal(res, exp)

        val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'),
                pd.NaT]
        val2 = [pd.NaT, pd.Timestamp('2011-01-01'),
                pd.Timestamp('2011-02-01')]

        res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)])
        exp = Categorical(val1 + val2,
                          categories=[pd.Timestamp('2011-01-01'),
                                      pd.Timestamp('2011-03-01'),
                                      pd.Timestamp('2011-02-01')])
        tm.assert_categorical_equal(res, exp)

        # all NaN
        res = union_categoricals([pd.Categorical(np.array([np.nan, np.nan],
                                                          dtype=object)),
                                  pd.Categorical(['X'])])
        exp = Categorical([np.nan, np.nan, 'X'])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical([np.nan, np.nan]),
                                  pd.Categorical([np.nan, np.nan])])
        exp = Categorical([np.nan, np.nan, np.nan, np.nan])
        tm.assert_categorical_equal(res, exp)

    def test_union_categoricals_empty(self):
        """Empty categoricals can be unioned with each other and with data."""
        # GH 13759
        res = union_categoricals([pd.Categorical([]),
                                  pd.Categorical([])])
        exp = Categorical([])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([Categorical([]),
                                  Categorical(['1'])])
        exp = Categorical(['1'])
        tm.assert_categorical_equal(res, exp)

    def test_union_categorical_same_category(self):
        """Inputs sharing identical categories keep those categories."""
        # check fastpath
        c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
        c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
                          categories=[1, 2, 3, 4])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z'])
        c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z'])
        res = union_categoricals([c1, c2])
        exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
                          categories=['x', 'y', 'z'])
        tm.assert_categorical_equal(res, exp)

    def test_union_categorical_same_categories_different_order(self):
        """Same categories in a different order: first input's order wins."""
        # https://github.com/pandas-dev/pandas/issues/19096
        c1 = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c'])
        c2 = Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c'])
        result = union_categoricals([c1, c2])
        expected = Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_ordered(self):
        """Ordered inputs must agree on ``ordered`` and on their categories."""
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        msg = 'Categorical.ordered must be the same'
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2])

        res = union_categoricals([c1, c1])
        exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        msg = "to union ordered Categoricals, all categories must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2])

    def test_union_categoricals_ignore_order(self):
        """``ignore_order=True`` unions ordered inputs into an unordered one.

        The same inputs with ``ignore_order=False`` (or the default) are
        expected to raise when ``ordered`` or the categories differ.
        """
        # GH 15219
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        msg = 'Categorical.ordered must be the same'
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2], ignore_order=False)

        res = union_categoricals([c1, c1], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c1, c1], ignore_order=False)
        exp = Categorical([1, 2, 3, 1, 2, 3],
                          categories=[1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, np.nan, 3, 2])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c2, c1], ignore_order=True,
                                 sort_categories=True)
        exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([4, 5, 6], ordered=True)
        result = union_categoricals([c1, c2], ignore_order=True)
        expected = Categorical([1, 2, 3, 4, 5, 6])
        tm.assert_categorical_equal(result, expected)

        msg = "to union ordered Categoricals, all categories must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2], ignore_order=False)

        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2])

    def test_union_categoricals_sort(self):
        """``sort_categories=True`` yields sorted categories in the result.

        Ordered inputs combined with ``sort_categories=True`` must raise.
        """
        # GH 13846
        c1 = Categorical(['x', 'y', 'z'])
        c2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['a', 'b', 'c', 'x', 'y', 'z'])
        tm.assert_categorical_equal(result, expected)

        # fastpath
        c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
        c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b'])
        c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        # fastpath - skip resort
        c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
        c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['x', np.nan])
        c2 = Categorical([np.nan, 'b'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['x', np.nan, np.nan, 'b'],
                               categories=['b', 'x'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([np.nan])
        c2 = Categorical([np.nan])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([np.nan, np.nan])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([])
        c2 = Categorical([])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
        c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
        # Pin the message so an unrelated TypeError cannot satisfy the check.
        msg = 'Cannot use sort_categories=True with ordered Categoricals'
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2], sort_categories=True)

    def test_union_categoricals_sort_false(self):
        """``sort_categories=False`` keeps categories in appearance order."""
        # GH 13846
        c1 = Categorical(['x', 'y', 'z'])
        c2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['x', 'y', 'z', 'a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        # fastpath
        c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
        c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['b', 'a', 'c'])
        tm.assert_categorical_equal(result, expected)

        # fastpath - skip resort
        c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
        c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['x', np.nan])
        c2 = Categorical([np.nan, 'b'])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(['x', np.nan, np.nan, 'b'],
                               categories=['x', 'b'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([np.nan])
        c2 = Categorical([np.nan])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical([np.nan, np.nan])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([])
        c2 = Categorical([])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical([])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
        c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(['b', 'a', 'a', 'c'],
                               categories=['b', 'a', 'c'], ordered=True)
        tm.assert_categorical_equal(result, expected)

    def test_union_categorical_unwrap(self):
        """``Series`` / ``CategoricalIndex`` inputs are accepted; lists raise."""
        # GH 14173
        c1 = Categorical(['a', 'b'])
        c2 = pd.Series(['b', 'c'], dtype='category')
        result = union_categoricals([c1, c2])
        expected = Categorical(['a', 'b', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c2 = CategoricalIndex(c2)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        c1 = Series(c1)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        # Pin the message so an unrelated TypeError cannot satisfy the check.
        msg = 'all components to combine must be Categorical'
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, ['a', 'b', 'c']])
|