123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466 |
- # -*- coding: utf-8 -*-
- from __future__ import print_function
- import numpy as np
- import pytest
- from pandas.compat import lrange, string_types
- from pandas import DataFrame, Series
- import pandas.util.testing as tm
@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
def test_duplicated_with_misspelled_column_name(subset):
    """A ``subset`` naming a missing column must raise ``KeyError``.

    Regression test for GH 19730.  The frame only has the upper-case
    columns 'A', 'B' and 'C', so the lower-case 'a' contained in every
    parametrized ``subset`` does not match any of them.
    """
    frame = DataFrame({label: [0, 0, 1] for label in ('A', 'B', 'C')})

    # Both entry points take the same ``subset`` and must reject it the
    # same way.
    for method in (frame.duplicated, frame.drop_duplicates):
        with pytest.raises(KeyError):
            method(subset)
@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes():
    """``duplicated`` must not fail on a very wide frame (gh-21524).

    The frame is built from 100 random integer vectors of length 30000 and
    then transposed, so it ends up with 100 rows and 30000 columns.  Only
    the type and dtype of the result are checked, not its values.
    """
    # Given the wide dataframe with a lot of columns
    # with different (important!) values
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    df = DataFrame(data).T
    result = df.duplicated()

    # Then duplicates produce the bool Series as a result and don't fail
    # during calculation. Actual values don't matter here, though usually
    # it's all False in this case
    assert isinstance(result, Series)
    # ``np.bool_`` is NumPy's boolean scalar type; the bare ``np.bool`` alias
    # was deprecated in NumPy 1.20 and is not available in every release.
    assert result.dtype == np.bool_
@pytest.mark.parametrize('keep, expected', [
    ('first', Series([False, False, True, False, True])),
    ('last', Series([True, True, False, False, False])),
    (False, Series([True, True, True, False, True]))
])
def test_duplicated_keep(keep, expected):
    """Each ``keep`` mode flags the expected rows of a small frame.

    Rows 0 and 4 are identical, rows 1 and 2 are identical, and row 3 has
    no twin; ``expected`` is the boolean mask for the given ``keep``.
    """
    columns = {
        'A': [0, 1, 1, 2, 0],
        'B': ['a', 'b', 'b', 'c', 'a'],
    }
    flagged = DataFrame(columns).duplicated(keep=keep)
    tm.assert_series_equal(flagged, expected)
@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal")
@pytest.mark.parametrize('keep, expected', [
    ('first', Series([False, False, True, False, True])),
    ('last', Series([True, True, False, False, False])),
    (False, Series([True, True, True, False, True]))
])
def test_duplicated_nan_none(keep, expected):
    """``NaN`` and ``None`` in an object column should not be duplicates.

    The expected masks treat the two ``NaN`` entries as duplicates of each
    other while leaving ``None`` on its own.  The test is marked ``xfail``
    because, per GH#21720, ``NaN`` and ``None`` are falsely considered
    equal.
    """
    values = [np.nan, 3, 3, None, np.nan]
    frame = DataFrame({'C': values}, dtype=object)
    tm.assert_series_equal(frame.duplicated(keep=keep), expected)
@pytest.mark.parametrize('keep', ['first', 'last', False])
@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
def test_duplicated_subset(subset, keep):
    """``duplicated(subset=...)`` must match ``duplicated`` on the sub-frame.

    The expected mask is computed by selecting the ``subset`` columns first
    and calling ``duplicated`` on that smaller frame; passing ``subset`` to
    ``duplicated`` on the full frame has to give the same Series.
    """
    df = DataFrame({'A': [0, 1, 1, 2, 0],
                    'B': ['a', 'b', 'b', 'c', 'a'],
                    'C': [np.nan, 3, 3, None, np.nan]})

    if subset is None:
        subset = list(df.columns)
    elif isinstance(subset, str):
        # need to have a DataFrame, not a Series
        # -> select columns with singleton list, not string
        # (builtin ``str`` is used instead of the ``pandas.compat``
        # ``string_types`` shim, which newer pandas no longer ships)
        subset = [subset]

    expected = df[subset].duplicated(keep=keep)
    result = df.duplicated(keep=keep, subset=subset)
    tm.assert_series_equal(result, expected)
def test_drop_duplicates():
    """Exercise ``drop_duplicates`` across subsets, ``keep`` modes and dtypes.

    Sections, in order: a single string column, several columns (passed as
    ndarray, list and tuple), the whole frame, integer columns (``int64``
    and ``int8``), and the regression checks for GH 11376 and GH 11864.
    """
    # Imported locally so this block does not rely on a file-level import.
    # ``concat`` stands in for ``DataFrame.append``, which was deprecated in
    # pandas 1.4 and removed in pandas 2.0.
    from pandas import concat

    df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
                            'foo', 'bar', 'bar', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1, 1, 2, 2, 2, 2, 1, 2],
                    'D': list(range(8))})

    # single column
    result = df.drop_duplicates('AAA')
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('AAA', keep='last')
    expected = df.loc[[6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('AAA', keep=False)
    expected = df.loc[[]]
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column
    expected = df.loc[[0, 1, 2, 3]]
    result = df.drop_duplicates(np.array(['AAA', 'B']))
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates(['AAA', 'B'])
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AAA', 'B'), keep='last')
    expected = df.loc[[0, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AAA', 'B'), keep=False)
    expected = df.loc[[0]]
    tm.assert_frame_equal(result, expected)

    # consider everything
    df2 = df.loc[:, ['AAA', 'B', 'C']]

    result = df2.drop_duplicates()
    # in this case only: for this data, de-duplicating on every column is
    # expected to give the same rows as de-duplicating on ['AAA', 'B']
    expected = df2.drop_duplicates(['AAA', 'B'])
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep='last')
    expected = df2.drop_duplicates(['AAA', 'B'], keep='last')
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep=False)
    expected = df2.drop_duplicates(['AAA', 'B'], keep=False)
    tm.assert_frame_equal(result, expected)

    # integers
    result = df.drop_duplicates('C')
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates('C', keep='last')
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    # same values stored in a narrower integer dtype
    df['E'] = df['C'].astype('int8')
    result = df.drop_duplicates('E')
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates('E', keep='last')
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    # GH 11376
    df = DataFrame({'x': [7, 6, 3, 3, 4, 8, 0],
                    'y': [0, 6, 5, 5, 9, 1, 2]})
    # rows 2 and 3 are identical, so only row 3 is dropped
    expected = df.loc[df.index != 3]
    tm.assert_frame_equal(df.drop_duplicates(), expected)

    # the remaining frames contain no duplicate rows and must come back
    # unchanged
    df = DataFrame([[1, 0], [0, 2]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-2, 0], [0, -4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    x = np.iinfo(np.int64).max / 3 * 2
    df = DataFrame([[-x, x], [0, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-x, x], [x, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # GH 11864
    df = DataFrame([i] * 9 for i in range(16))
    df = concat([df, DataFrame([[1] + [0] * 8])], ignore_index=True)

    for keep in ['first', 'last', False]:
        assert df.duplicated(keep=keep).sum() == 0
def test_duplicated_on_empty_frame():
    """The ``duplicated`` mask of an empty frame can index that frame.

    Regression test for GH 25184: the mask computed on a frame that has
    columns but no rows is used for boolean indexing, and the selection
    must equal the (still empty) frame.
    """
    empty = DataFrame(columns=['a', 'b'])
    mask = empty.duplicated('a')
    tm.assert_frame_equal(empty[mask], empty.copy())
def test_drop_duplicates_with_duplicate_column_names():
    """``drop_duplicates`` works when a column label occurs twice (GH17836)."""
    rows = [
        [1, 2, 5],
        [3, 4, 6],
        [3, 4, 7],
    ]
    frame = DataFrame(rows, columns=['a', 'a', 'b'])

    # Every full row differs in its last value, so nothing is dropped.
    tm.assert_frame_equal(frame.drop_duplicates(), frame)

    # Restricted to the label 'a', rows 1 and 2 coincide and only the first
    # two rows are expected to remain.
    tm.assert_frame_equal(frame.drop_duplicates('a'), frame[:2])
def test_drop_duplicates_for_take_all():
    """Check ``drop_duplicates`` when some key values occur only once.

    In column 'AAA' the values 'baz' and 'qux' appear a single time while
    'foo' and 'bar' repeat, so every ``keep`` mode leaves a non-empty
    result.  Expected rows are given by position.
    """
    frame = DataFrame({
        'AAA': ['foo', 'bar', 'baz', 'bar', 'foo', 'bar', 'qux', 'foo'],
        'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'],
        'C': [1, 1, 2, 2, 2, 2, 1, 2],
        'D': list(range(8)),
    })

    # (subset, extra keyword arguments, positions of the surviving rows)
    scenarios = [
        # single column
        ('AAA', {}, [0, 1, 2, 6]),
        ('AAA', {'keep': 'last'}, [2, 5, 6, 7]),
        ('AAA', {'keep': False}, [2, 6]),
        # multiple columns
        (['AAA', 'B'], {}, [0, 1, 2, 3, 4, 6]),
        (['AAA', 'B'], {'keep': 'last'}, [0, 1, 2, 5, 6, 7]),
        (['AAA', 'B'], {'keep': False}, [0, 1, 2, 6]),
    ]
    for subset, options, positions in scenarios:
        deduped = frame.drop_duplicates(subset, **options)
        tm.assert_frame_equal(deduped, frame.iloc[positions])
def test_drop_duplicates_tuple():
    """``drop_duplicates`` accepts a tuple that is itself a column label."""
    label = ('AA', 'AB')
    frame = DataFrame({
        label: ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'],
        'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'],
        'C': [1, 1, 2, 2, 2, 2, 1, 2],
        'D': list(range(8)),
    })

    # single column, addressed through its tuple label
    deduped = frame.drop_duplicates(label)
    tm.assert_frame_equal(deduped, frame[:2])

    deduped = frame.drop_duplicates(label, keep='last')
    tm.assert_frame_equal(deduped, frame.loc[[6, 7]])

    deduped = frame.drop_duplicates(label, keep=False)
    assert len(deduped) == 0
    tm.assert_frame_equal(deduped, frame.loc[[]])  # empty df

    # multi column: a tuple holding the tuple label and a plain label
    deduped = frame.drop_duplicates((label, 'B'))
    tm.assert_frame_equal(deduped, frame.loc[[0, 1, 2, 3]])
@pytest.mark.parametrize('df', [
    DataFrame(),
    DataFrame(columns=[]),
    DataFrame(columns=['A', 'B', 'C']),
    DataFrame(index=[]),
    DataFrame(index=['A', 'B', 'C'])
])
def test_drop_duplicates_empty(df):
    """De-duplicating a frame that holds no data leaves it unchanged.

    Regression test for GH 20516, covering frames with no rows, no columns,
    or neither.
    """
    # copying form
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # in-place form, run on a copy so the parametrized frame is not touched
    scratch = df.copy()
    scratch.drop_duplicates(inplace=True)
    tm.assert_frame_equal(scratch, df)
def test_drop_duplicates_NA():
    """Missing values count as duplicates of one another.

    The first frame has ``None`` in string column 'A', the second relies on
    ``NaN`` in float column 'C'.  In both, the rows holding the missing
    value are expected to collapse like any other repeated value.
    """
    def verify(frame, scenarios):
        """Assert each ``(subset, options, expected)`` scenario on ``frame``."""
        for subset, options, expected in scenarios:
            outcome = frame.drop_duplicates(subset, **options)
            tm.assert_frame_equal(outcome, expected)

    # none
    frame = DataFrame({
        'A': [None, None, 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'],
        'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'],
        'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
        'D': list(range(8)),
    })

    # single column
    verify(frame, [
        ('A', {}, frame.loc[[0, 2, 3]]),
        ('A', {'keep': 'last'}, frame.loc[[1, 6, 7]]),
    ])
    emptied = frame.drop_duplicates('A', keep=False)
    tm.assert_frame_equal(emptied, frame.loc[[]])  # empty df
    assert len(emptied) == 0

    # multi column
    verify(frame, [
        (['A', 'B'], {}, frame.loc[[0, 2, 3, 6]]),
        (['A', 'B'], {'keep': 'last'}, frame.loc[[1, 5, 6, 7]]),
        (['A', 'B'], {'keep': False}, frame.loc[[6]]),
    ])

    # nan
    frame = DataFrame({
        'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'],
        'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'],
        'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
        'D': list(range(8)),
    })

    # single column
    verify(frame, [
        ('C', {}, frame[:2]),
        ('C', {'keep': 'last'}, frame.loc[[3, 7]]),
    ])
    emptied = frame.drop_duplicates('C', keep=False)
    tm.assert_frame_equal(emptied, frame.loc[[]])  # empty df
    assert len(emptied) == 0

    # multi column
    verify(frame, [
        (['C', 'B'], {}, frame.loc[[0, 1, 2, 4]]),
        (['C', 'B'], {'keep': 'last'}, frame.loc[[1, 3, 6, 7]]),
        (['C', 'B'], {'keep': False}, frame.loc[[1]]),
    ])
def test_drop_duplicates_NA_for_take_all():
    """Missing values are de-duplicated alongside values that occur once.

    Column 'A' mixes repeated ``None`` with the single-occurrence strings
    'baz' and 'qux'; column 'C' mixes repeated ``NaN`` with the
    single-occurrence floats 2.0 and 3.0.  Expected rows are given by
    position.
    """
    frame = DataFrame({
        'A': [None, None, 'foo', 'bar', 'foo', 'baz', 'bar', 'qux'],
        'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.],
    })

    # (column, extra keyword arguments, positions of the surviving rows)
    scenarios = [
        # none, single column
        ('A', {}, [0, 2, 3, 5, 7]),
        ('A', {'keep': 'last'}, [1, 4, 5, 6, 7]),
        ('A', {'keep': False}, [5, 7]),
        # nan, single column
        ('C', {}, [0, 1, 5, 6]),
        ('C', {'keep': 'last'}, [3, 5, 6, 7]),
        ('C', {'keep': False}, [5, 6]),
    ]
    for column, options, positions in scenarios:
        deduped = frame.drop_duplicates(column, **options)
        tm.assert_frame_equal(deduped, frame.iloc[positions])
def test_drop_duplicates_inplace():
    """``drop_duplicates(inplace=True)`` mutates the frame it is called on.

    Every scenario copies the source frame, de-duplicates the copy in
    place and compares the mutated copy with the expected rows of the
    untouched source.
    """
    source = DataFrame({
        'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'],
        'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'],
        'C': [1, 1, 2, 2, 2, 2, 1, 2],
        'D': list(range(8)),
    })

    def dedup_copy(frame, *args, **kwargs):
        """Return a copy of ``frame`` after an in-place ``drop_duplicates``."""
        work = frame.copy()
        work.drop_duplicates(*args, inplace=True, **kwargs)
        return work

    # single column
    tm.assert_frame_equal(dedup_copy(source, 'A'), source[:2])
    tm.assert_frame_equal(dedup_copy(source, 'A', keep='last'),
                          source.loc[[6, 7]])

    emptied = dedup_copy(source, 'A', keep=False)
    tm.assert_frame_equal(emptied, source.loc[[]])
    assert len(emptied) == 0

    # multi column
    tm.assert_frame_equal(dedup_copy(source, ['A', 'B']),
                          source.loc[[0, 1, 2, 3]])
    tm.assert_frame_equal(dedup_copy(source, ['A', 'B'], keep='last'),
                          source.loc[[0, 5, 6, 7]])
    tm.assert_frame_equal(dedup_copy(source, ['A', 'B'], keep=False),
                          source.loc[[0]])

    # consider everything
    trimmed = source.loc[:, ['A', 'B', 'C']].copy()

    # in this case only: for this data, de-duplicating on every column is
    # expected to give the same rows as de-duplicating on ['A', 'B']
    for options in ({}, {'keep': 'last'}, {'keep': False}):
        mutated = dedup_copy(trimmed, **options)
        reference = trimmed.drop_duplicates(['A', 'B'], **options)
        tm.assert_frame_equal(mutated, reference)
|