test_duplicates.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. # coding=utf-8
  2. import numpy as np
  3. import pytest
  4. from pandas import Categorical, Series
  5. import pandas.util.testing as tm
  6. def test_value_counts_nunique():
  7. # basics.rst doc example
  8. series = Series(np.random.randn(500))
  9. series[20:500] = np.nan
  10. series[10:20] = 5000
  11. result = series.nunique()
  12. assert result == 11
  13. # GH 18051
  14. s = Series(Categorical([]))
  15. assert s.nunique() == 0
  16. s = Series(Categorical([np.nan]))
  17. assert s.nunique() == 0
  18. def test_unique():
  19. # GH714 also, dtype=float
  20. s = Series([1.2345] * 100)
  21. s[::2] = np.nan
  22. result = s.unique()
  23. assert len(result) == 2
  24. s = Series([1.2345] * 100, dtype='f4')
  25. s[::2] = np.nan
  26. result = s.unique()
  27. assert len(result) == 2
  28. # NAs in object arrays #714
  29. s = Series(['foo'] * 100, dtype='O')
  30. s[::2] = np.nan
  31. result = s.unique()
  32. assert len(result) == 2
  33. # decision about None
  34. s = Series([1, 2, 3, None, None, None], dtype=object)
  35. result = s.unique()
  36. expected = np.array([1, 2, 3, None], dtype=object)
  37. tm.assert_numpy_array_equal(result, expected)
  38. # GH 18051
  39. s = Series(Categorical([]))
  40. tm.assert_categorical_equal(s.unique(), Categorical([]), check_dtype=False)
  41. s = Series(Categorical([np.nan]))
  42. tm.assert_categorical_equal(s.unique(), Categorical([np.nan]),
  43. check_dtype=False)
  44. def test_unique_data_ownership():
  45. # it works! #1807
  46. Series(Series(["a", "c", "b"]).unique()).sort_values()
  47. @pytest.mark.parametrize('data, expected', [
  48. (np.random.randint(0, 10, size=1000), False),
  49. (np.arange(1000), True),
  50. ([], True),
  51. ([np.nan], True),
  52. (['foo', 'bar', np.nan], True),
  53. (['foo', 'foo', np.nan], False),
  54. (['foo', 'bar', np.nan, np.nan], False)])
  55. def test_is_unique(data, expected):
  56. # GH11946 / GH25180
  57. s = Series(data)
  58. assert s.is_unique is expected
  59. def test_is_unique_class_ne(capsys):
  60. # GH 20661
  61. class Foo(object):
  62. def __init__(self, val):
  63. self._value = val
  64. def __ne__(self, other):
  65. raise Exception("NEQ not supported")
  66. with capsys.disabled():
  67. li = [Foo(i) for i in range(5)]
  68. s = Series(li, index=[i for i in range(5)])
  69. s.is_unique
  70. captured = capsys.readouterr()
  71. assert len(captured.err) == 0
  72. @pytest.mark.parametrize(
  73. 'keep, expected',
  74. [
  75. ('first', Series([False, False, False, False, True, True, False])),
  76. ('last', Series([False, True, True, False, False, False, False])),
  77. (False, Series([False, True, True, False, True, True, False]))
  78. ])
  79. def test_drop_duplicates(any_numpy_dtype, keep, expected):
  80. tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype))
  81. if tc.dtype == 'bool':
  82. pytest.skip('tested separately in test_drop_duplicates_bool')
  83. tm.assert_series_equal(tc.duplicated(keep=keep), expected)
  84. tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
  85. sc = tc.copy()
  86. sc.drop_duplicates(keep=keep, inplace=True)
  87. tm.assert_series_equal(sc, tc[~expected])
  88. @pytest.mark.parametrize('keep, expected',
  89. [('first', Series([False, False, True, True])),
  90. ('last', Series([True, True, False, False])),
  91. (False, Series([True, True, True, True]))])
  92. def test_drop_duplicates_bool(keep, expected):
  93. tc = Series([True, False, True, False])
  94. tm.assert_series_equal(tc.duplicated(keep=keep), expected)
  95. tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
  96. sc = tc.copy()
  97. sc.drop_duplicates(keep=keep, inplace=True)
  98. tm.assert_series_equal(sc, tc[~expected])
  99. @pytest.mark.parametrize('keep, expected', [
  100. ('first', Series([False, False, True, False, True], name='name')),
  101. ('last', Series([True, True, False, False, False], name='name')),
  102. (False, Series([True, True, True, False, True], name='name'))
  103. ])
  104. def test_duplicated_keep(keep, expected):
  105. s = Series(['a', 'b', 'b', 'c', 'a'], name='name')
  106. result = s.duplicated(keep=keep)
  107. tm.assert_series_equal(result, expected)
  108. @pytest.mark.parametrize('keep, expected', [
  109. ('first', Series([False, False, True, False, True])),
  110. ('last', Series([True, True, False, False, False])),
  111. (False, Series([True, True, True, False, True]))
  112. ])
  113. def test_duplicated_nan_none(keep, expected):
  114. s = Series([np.nan, 3, 3, None, np.nan], dtype=object)
  115. result = s.duplicated(keep=keep)
  116. tm.assert_series_equal(result, expected)