# -*- coding: utf-8 -*-
- import numpy as np
- import pytest
- from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series
- from pandas.core.arrays.categorical import _recode_for_categories
- from pandas.tests.arrays.categorical.common import TestCategorical
- import pandas.util.testing as tm
class TestCategoricalAPI(object):
    """Tests for the public category-manipulation API of ``Categorical``.

    Exercises the ``ordered`` flag together with the rename / reorder /
    add / set / remove category methods, in both their returning and
    ``inplace=True`` forms.
    """

    def test_ordered_api(self):
        """The constructor honours ``ordered`` and explicit categories."""
        # GH 9347
        for ordered in (False, True):
            # No explicit categories: the test expects them sorted.
            inferred = Categorical(list('acb'), ordered=ordered)
            tm.assert_index_equal(inferred.categories,
                                  Index(['a', 'b', 'c']))
            assert bool(inferred.ordered) is ordered

            # Explicit categories: the given order is kept as-is.
            explicit = Categorical(list('acb'), categories=list('bca'),
                                   ordered=ordered)
            tm.assert_index_equal(explicit.categories,
                                  Index(['b', 'c', 'a']))
            assert bool(explicit.ordered) is ordered

    def test_set_ordered(self):
        """``as_ordered``/``as_unordered``/``set_ordered`` toggle the flag."""
        source = Categorical(["a", "b", "c", "a"], ordered=True)

        # Returning variants hand back a Categorical with the new flag.
        toggled = source.as_unordered()
        assert not toggled.ordered
        toggled = source.as_ordered()
        assert toggled.ordered

        # In-place variants change the flag on the object they are called on.
        toggled.as_unordered(inplace=True)
        assert not toggled.ordered
        toggled.as_ordered(inplace=True)
        assert toggled.ordered

        assert toggled.set_ordered(True).ordered
        assert not toggled.set_ordered(False).ordered
        toggled.set_ordered(True, inplace=True)
        assert toggled.ordered
        toggled.set_ordered(False, inplace=True)
        assert not toggled.ordered

        # Direct assignment to ``ordered`` was removed in 0.19.0.
        msg = "can't set attribute"
        for flag in (True, False):
            with pytest.raises(AttributeError, match=msg):
                source.ordered = flag

    def test_rename_categories(self):
        """``rename_categories`` accepts list-likes and callables."""
        cat = Categorical(["a", "b", "c", "a"])
        renamed_values = np.array([1, 2, 3, 1], dtype=np.int64)
        renamed_categories = Index([1, 2, 3])

        # inplace=False: the result is renamed ...
        res = cat.rename_categories([1, 2, 3])
        tm.assert_numpy_array_equal(res.__array__(), renamed_values)
        tm.assert_index_equal(res.categories, renamed_categories)

        # ... and the old one must not be changed.
        untouched_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
        tm.assert_numpy_array_equal(cat.__array__(), untouched_values)
        tm.assert_index_equal(cat.categories, Index(["a", "b", "c"]))

        # GH18862 (let rename_categories take callables)
        result = cat.rename_categories(lambda x: x.upper())
        tm.assert_categorical_equal(result,
                                    Categorical(["A", "B", "C", "A"]))

        # and now inplace
        res = cat.rename_categories([1, 2, 3], inplace=True)
        assert res is None
        tm.assert_numpy_array_equal(cat.__array__(), renamed_values)
        tm.assert_index_equal(cat.categories, renamed_categories)

        # New categories of the wrong length (longer, then shorter) raise.
        for wrong_length in ([1, 2, 3, 4], [1, 2]):
            with pytest.raises(ValueError):
                cat.rename_categories(wrong_length)

    def test_rename_categories_series(self):
        """Passing a Series warns and is treated as a list-like."""
        # https://github.com/pandas-dev/pandas/issues/17981
        cat = Categorical(['a', 'b'])
        expected_msg = "Treating Series 'new_categories' as a list-like "

        with tm.assert_produces_warning(FutureWarning) as caught:
            result = cat.rename_categories(Series([0, 1]))

        assert len(caught) == 1
        assert expected_msg in str(caught[0].message)
        tm.assert_categorical_equal(result, Categorical([0, 1]))

    def test_rename_categories_dict(self):
        """``rename_categories`` maps old names to new ones via a dict."""
        # GH 17336
        full_mapping = {'a': 4, 'b': 3, 'c': 2, 'd': 1}
        cat = Categorical(['a', 'b', 'c', 'd'])
        expected = Index([4, 3, 2, 1])

        res = cat.rename_categories(full_mapping)
        tm.assert_index_equal(res.categories, expected)

        # Test for inplace
        res = cat.rename_categories(full_mapping, inplace=True)
        assert res is None
        tm.assert_index_equal(cat.categories, expected)

        partial_cases = [
            # dict of smaller length: unmapped categories are kept
            ({'a': 1, 'c': 3}, Index([1, 'b', 3, 'd'])),
            # dict with bigger length: surplus keys are ignored
            ({'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6},
             Index([1, 2, 3, 4])),
            # dict with no items from old categories: nothing changes
            ({'f': 1, 'g': 3}, Index(['a', 'b', 'c', 'd'])),
        ]
        for mapping, expected in partial_cases:
            cat = Categorical(['a', 'b', 'c', 'd'])
            res = cat.rename_categories(mapping)
            tm.assert_index_equal(res.categories, expected)

    def test_reorder_categories(self):
        """``reorder_categories`` requires exactly the existing categories."""
        new_order = ["c", "b", "a"]
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        snapshot = cat.copy()
        reordered = Categorical(["a", "b", "c", "a"], categories=new_order,
                                ordered=True)

        # first inplace == False: cat must be the same as before and only
        # the result is changed
        res = cat.reorder_categories(new_order)
        tm.assert_categorical_equal(cat, snapshot)
        tm.assert_categorical_equal(res, reordered)

        # inplace == True
        res = cat.reorder_categories(new_order, inplace=True)
        assert res is None
        tm.assert_categorical_equal(cat, reordered)

        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        invalid_orders = [
            ["a"],                   # not all "old" included in "new"
            ["a", "b", "d"],         # still not all "old" in "new"
            ["a", "b", "c", "d"],    # all "old" included, but too long
        ]
        for invalid in invalid_orders:
            with pytest.raises(ValueError):
                cat.reorder_categories(invalid)

    def test_add_categories(self):
        """``add_categories`` appends new categories; duplicates raise."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        snapshot = cat.copy()
        widened = Categorical(["a", "b", "c", "a"],
                              categories=["a", "b", "c", "d"], ordered=True)

        # first inplace == False, with a scalar and a one-element list
        for addition in ("d", ["d"]):
            res = cat.add_categories(addition)
            tm.assert_categorical_equal(cat, snapshot)
            tm.assert_categorical_equal(res, widened)

        # inplace == True
        res = cat.add_categories("d", inplace=True)
        tm.assert_categorical_equal(cat, widened)
        assert res is None

        # new is in old categories
        with pytest.raises(ValueError):
            cat.add_categories(["d"])

        # GH 9927
        cat = Categorical(list("abc"), ordered=True)
        expected = Categorical(
            list("abc"), categories=list("abcde"), ordered=True)

        # test with Series, np.array, index, list
        containers = [
            Series(["d", "e"]),
            np.array(["d", "e"]),
            Index(["d", "e"]),
            ["d", "e"],
        ]
        for new_categories in containers:
            res = cat.add_categories(new_categories)
            tm.assert_categorical_equal(res, expected)

    def test_set_categories(self):
        """``set_categories`` replaces categories and recodes the values."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        exp_categories = Index(["c", "b", "a"])
        exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)

        res = cat.set_categories(["c", "b", "a"], inplace=True)
        tm.assert_index_equal(cat.categories, exp_categories)
        tm.assert_numpy_array_equal(cat.__array__(), exp_values)
        assert res is None

        res = cat.set_categories(["a", "b", "c"])
        # cat must be the same as before
        tm.assert_index_equal(cat.categories, exp_categories)
        tm.assert_numpy_array_equal(cat.__array__(), exp_values)
        # only res is changed
        tm.assert_index_equal(res.categories, Index(["a", "b", "c"]))
        tm.assert_numpy_array_equal(res.__array__(), exp_values)

        # not all "old" included in "new" -> all not included ones are now
        # np.nan
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        res = cat.set_categories(["a"])
        tm.assert_numpy_array_equal(
            res.codes, np.array([0, -1, -1, 0], dtype=np.int8))

        # still not all "old" in "new"
        res = cat.set_categories(["a", "b", "d"])
        tm.assert_numpy_array_equal(
            res.codes, np.array([0, 1, -1, 0], dtype=np.int8))
        tm.assert_index_equal(res.categories, Index(["a", "b", "d"]))

        # all "old" included in "new"
        cat = cat.set_categories(["a", "b", "c", "d"])
        tm.assert_index_equal(cat.categories, Index(["a", "b", "c", "d"]))

        # internals...
        numeric = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4],
                              ordered=True)
        tm.assert_numpy_array_equal(
            numeric._codes, np.array([0, 1, 2, 3, 0], dtype=np.int8))
        tm.assert_index_equal(numeric.categories, Index([1, 2, 3, 4]))
        exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(numeric.get_values(), exp)

        # all "pointers" to '4' must be changed from 3 to 0,...
        numeric = numeric.set_categories([4, 3, 2, 1])

        # positions are changed
        tm.assert_numpy_array_equal(
            numeric._codes, np.array([3, 2, 1, 0, 3], dtype=np.int8))

        # categories are now in new order
        tm.assert_index_equal(numeric.categories, Index([4, 3, 2, 1]))

        # output is the same
        exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(numeric.get_values(), exp)
        # min/max follow the (now reversed) category order
        assert numeric.min() == 4
        assert numeric.max() == 1

        # set_categories should set the ordering if specified
        unordered = numeric.set_categories([4, 3, 2, 1], ordered=False)
        assert not unordered.ordered
        tm.assert_numpy_array_equal(numeric.get_values(),
                                    unordered.get_values())

        # set_categories should pass thru the ordering
        unordered = numeric.set_ordered(False).set_categories([4, 3, 2, 1])
        assert not unordered.ordered
        tm.assert_numpy_array_equal(numeric.get_values(),
                                    unordered.get_values())

    @pytest.mark.parametrize('values, categories, new_categories', [
        # No NaNs, same cats, same order
        (['a', 'b', 'a'], ['a', 'b'], ['a', 'b']),
        # No NaNs, same cats, different order
        (['a', 'b', 'a'], ['a', 'b'], ['b', 'a']),
        # Same, unsorted
        (['b', 'a', 'a'], ['a', 'b'], ['a', 'b']),
        # No NaNs, same cats, different order
        (['b', 'a', 'a'], ['a', 'b'], ['b', 'a']),
        # NaNs: both value orders against both category orders
        (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
        (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
        (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
        (['b', 'a', 'c'], ['a', 'b'], ['b', 'a']),
        # Introduce NaNs: both value orders against each single category
        (['a', 'b', 'c'], ['a', 'b'], ['a']),
        (['a', 'b', 'c'], ['a', 'b'], ['b']),
        (['b', 'a', 'c'], ['a', 'b'], ['a']),
        (['b', 'a', 'c'], ['a', 'b'], ['b']),
        # No overlap
        (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
    ])
    @pytest.mark.parametrize('ordered', [True, False])
    def test_set_categories_many(self, values, categories, new_categories,
                                 ordered):
        """``set_categories`` matches constructing with the new categories."""
        original = Categorical(values, categories)
        expected = Categorical(values, new_categories, ordered)
        result = original.set_categories(new_categories, ordered=ordered)
        tm.assert_categorical_equal(result, expected)

    def test_set_categories_rename_less(self):
        """``rename=True`` with fewer categories turns the rest into NaN."""
        # GH 24675
        cat = Categorical(['A', 'B'])
        result = cat.set_categories(['A'], rename=True)
        tm.assert_categorical_equal(result, Categorical(['A', np.nan]))

    def test_set_categories_private(self):
        """``_set_categories`` swaps categories while keeping the codes."""
        # The codes are left alone, so code 1 ('b') now reads as 'c' and
        # code 2 ('c') as 'd'; the same holds with ``fastpath=True``.
        for extra_kwargs in ({}, {'fastpath': True}):
            cat = Categorical(['a', 'b', 'c'],
                              categories=['a', 'b', 'c', 'd'])
            cat._set_categories(['a', 'c', 'd', 'e'], **extra_kwargs)
            expected = Categorical(['a', 'c', 'd'], categories=list('acde'))
            tm.assert_categorical_equal(cat, expected)

    def test_remove_categories(self):
        """``remove_categories`` drops categories; their values become NaN."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        snapshot = cat.copy()
        narrowed = Categorical(["a", "b", np.nan, "a"],
                               categories=["a", "b"], ordered=True)

        # first inplace == False, with a scalar and a one-element list
        for removal in ("c", ["c"]):
            res = cat.remove_categories(removal)
            tm.assert_categorical_equal(cat, snapshot)
            tm.assert_categorical_equal(res, narrowed)

        # inplace == True
        res = cat.remove_categories("c", inplace=True)
        tm.assert_categorical_equal(cat, narrowed)
        assert res is None

        # removal is not in categories
        with pytest.raises(ValueError):
            cat.remove_categories(["c"])

    def test_remove_unused_categories(self):
        """``remove_unused_categories`` keeps only categories in use."""
        cat = Categorical(["a", "b", "c", "d", "a"],
                          categories=["a", "b", "c", "d", "e"])
        all_categories = Index(["a", "b", "c", "d", "e"])
        used_categories = Index(["a", "b", "c", "d"])
        tm.assert_index_equal(cat.categories, all_categories)

        res = cat.remove_unused_categories()
        tm.assert_index_equal(res.categories, used_categories)
        tm.assert_index_equal(cat.categories, all_categories)

        res = cat.remove_unused_categories(inplace=True)
        tm.assert_index_equal(cat.categories, used_categories)
        assert res is None

        # with NaN values (GH11599)
        cat = Categorical(["a", "b", "c", np.nan],
                          categories=["a", "b", "c", "d", "e"])
        res = cat.remove_unused_categories()
        tm.assert_index_equal(res.categories,
                              Index(np.array(["a", "b", "c"])))
        tm.assert_numpy_array_equal(
            res.codes, np.array([0, 1, 2, -1], dtype=np.int8))
        tm.assert_index_equal(cat.categories, all_categories)

        val = ['F', np.nan, 'D', 'B', 'D', 'F', np.nan]
        cat = Categorical(values=val, categories=list('ABCDEFG'))
        out = cat.remove_unused_categories()
        tm.assert_index_equal(out.categories, Index(['B', 'D', 'F']))
        tm.assert_numpy_array_equal(
            out.codes, np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8))
        assert out.get_values().tolist() == val

        # Larger randomised round trip: only every other letter is drawn,
        # and up to 100 positions are blanked out with NaN.
        alpha = list('abcdefghijklmnopqrstuvwxyz')
        val = np.random.choice(alpha[::2], 10000).astype('object')
        val[np.random.choice(len(val), 100)] = np.nan

        cat = Categorical(values=val, categories=alpha)
        out = cat.remove_unused_categories()
        assert out.get_values().tolist() == val.tolist()
class TestCategoricalAPIWithFactor(TestCategorical):
    """API tests that run against the shared ``self.factor`` fixture.

    ``self.factor`` is not defined in this class; it is presumably set up
    by the ``TestCategorical`` base class -- confirm there.
    """

    def test_describe(self):
        """``describe`` reports per-category counts and frequencies."""
        # string type
        described = self.factor.describe()
        assert self.factor.ordered
        index = CategoricalIndex(['a', 'b', 'c'], name='categories',
                                 ordered=self.factor.ordered)
        frame = DataFrame({'counts': [3, 2, 3],
                           'freqs': [3 / 8., 2 / 8., 3 / 8.]},
                          index=index)
        tm.assert_frame_equal(described, frame)

        # check unused categories: 'd' is listed with a zero count
        widened = self.factor.copy()
        widened.set_categories(["a", "b", "c", "d"], inplace=True)
        described = widened.describe()
        index = CategoricalIndex(
            list('abcd'), ordered=self.factor.ordered, name='categories')
        frame = DataFrame({'counts': [3, 2, 3, 0],
                           'freqs': [3 / 8., 2 / 8., 3 / 8., 0]},
                          index=index)
        tm.assert_frame_equal(described, frame)

        # check an integer one
        integers = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
        described = integers.describe()
        index = CategoricalIndex([1, 2, 3], ordered=integers.ordered,
                                 name='categories')
        frame = DataFrame({'counts': [5, 3, 3],
                           'freqs': [5 / 11., 3 / 11., 3 / 11.]},
                          index=index)
        tm.assert_frame_equal(described, frame)

        # https://github.com/pandas-dev/pandas/issues/3678
        # describe should work with NaN
        with_nan = Categorical([np.nan, 1, 2, 2])
        described = with_nan.describe()
        index = CategoricalIndex([1, 2, np.nan], categories=[1, 2],
                                 name='categories')
        frame = DataFrame({'counts': [1, 2, 1],
                           'freqs': [1 / 4., 2 / 4., 1 / 4.]},
                          index=index)
        tm.assert_frame_equal(described, frame)

    def test_set_categories_inplace(self):
        """In-place ``set_categories`` updates ``categories`` on a copy."""
        new_categories = ['a', 'b', 'c', 'd']
        factor_copy = self.factor.copy()
        factor_copy.set_categories(new_categories, inplace=True)
        tm.assert_index_equal(factor_copy.categories,
                              Index(['a', 'b', 'c', 'd']))
class TestPrivateCategoricalAPI(object):
    """Tests for ``Categorical`` internals: codes and code recoding."""

    def test_codes_immutable(self):
        """``codes`` is read-only while the Categorical stays writeable."""
        # Codes should be read only
        cat = Categorical(["a", "b", "c", "a", np.nan])
        tm.assert_numpy_array_equal(
            cat.codes, np.array([0, 1, 2, 0, -1], dtype='int8'))

        # Assignments to codes should raise
        with pytest.raises(ValueError):
            cat.codes = np.array([0, 1, 2, 0, 1], dtype='int8')

        # changes in the codes array should raise
        exposed_codes = cat.codes
        with pytest.raises(ValueError):
            exposed_codes[4] = 1

        # But even after getting the codes, the original array should still
        # be writeable!
        cat[4] = "a"
        tm.assert_numpy_array_equal(
            cat.codes, np.array([0, 1, 2, 0, 0], dtype='int8'))

        # Writing through the private ``_codes`` attribute is also allowed.
        cat._codes[4] = 2
        tm.assert_numpy_array_equal(
            cat.codes, np.array([0, 1, 2, 0, 2], dtype='int8'))

    @pytest.mark.parametrize('codes, old, new, expected', [
        ([0, 1], ['a', 'b'], ['a', 'b'], [0, 1]),
        ([0, 1], ['b', 'a'], ['b', 'a'], [0, 1]),
        ([0, 1], ['a', 'b'], ['b', 'a'], [1, 0]),
        ([0, 1], ['b', 'a'], ['a', 'b'], [1, 0]),
        ([0, 1, 0, 1], ['a', 'b'], ['a', 'b', 'c'], [0, 1, 0, 1]),
        ([0, 1, 2, 2], ['a', 'b', 'c'], ['a', 'b'], [0, 1, -1, -1]),
        ([0, 1, -1], ['a', 'b', 'c'], ['a', 'b', 'c'], [0, 1, -1]),
        ([0, 1, -1], ['a', 'b', 'c'], ['b'], [-1, 0, -1]),
        ([0, 1, -1], ['a', 'b', 'c'], ['d'], [-1, -1, -1]),
        ([0, 1, -1], ['a', 'b', 'c'], [], [-1, -1, -1]),
        ([-1, -1], [], ['a', 'b'], [-1, -1]),
        ([1, 0], ['b', 'a'], ['a', 'b'], [0, 1]),
    ])
    def test_recode_to_categories(self, codes, old, new, expected):
        """``_recode_for_categories`` remaps codes from old to new."""
        old_codes = np.asanyarray(codes, dtype=np.int8)
        new_codes = np.asanyarray(expected, dtype=np.int8)
        old_categories = Index(old)
        new_categories = Index(new)

        recoded = _recode_for_categories(old_codes, old_categories,
                                         new_categories)
        tm.assert_numpy_array_equal(recoded, new_codes)

    def test_recode_to_categories_large(self):
        """Recoding 1000 categories yields int16 codes in reversed order."""
        size = 1000
        codes = np.arange(size)
        old_categories = Index(codes)
        # The new categories are the old ones reversed, so the expected
        # codes are the reversed positions as well.
        reversed_codes = np.arange(size - 1, -1, -1, dtype=np.int16)
        new_categories = Index(reversed_codes)

        recoded = _recode_for_categories(codes, old_categories,
                                         new_categories)
        tm.assert_numpy_array_equal(recoded, reversed_codes)
|