test_duplicates.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466
  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function
  3. import numpy as np
  4. import pytest
  5. from pandas.compat import lrange, string_types
  6. from pandas import DataFrame, Series
  7. import pandas.util.testing as tm
  8. @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
  9. def test_duplicated_with_misspelled_column_name(subset):
  10. # GH 19730
  11. df = DataFrame({'A': [0, 0, 1],
  12. 'B': [0, 0, 1],
  13. 'C': [0, 0, 1]})
  14. with pytest.raises(KeyError):
  15. df.duplicated(subset)
  16. with pytest.raises(KeyError):
  17. df.drop_duplicates(subset)
  18. @pytest.mark.slow
  19. def test_duplicated_do_not_fail_on_wide_dataframes():
  20. # gh-21524
  21. # Given the wide dataframe with a lot of columns
  22. # with different (important!) values
  23. data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
  24. for i in range(100)}
  25. df = DataFrame(data).T
  26. result = df.duplicated()
  27. # Then duplicates produce the bool Series as a result and don't fail during
  28. # calculation. Actual values doesn't matter here, though usually it's all
  29. # False in this case
  30. assert isinstance(result, Series)
  31. assert result.dtype == np.bool
  32. @pytest.mark.parametrize('keep, expected', [
  33. ('first', Series([False, False, True, False, True])),
  34. ('last', Series([True, True, False, False, False])),
  35. (False, Series([True, True, True, False, True]))
  36. ])
  37. def test_duplicated_keep(keep, expected):
  38. df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})
  39. result = df.duplicated(keep=keep)
  40. tm.assert_series_equal(result, expected)
  41. @pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal")
  42. @pytest.mark.parametrize('keep, expected', [
  43. ('first', Series([False, False, True, False, True])),
  44. ('last', Series([True, True, False, False, False])),
  45. (False, Series([True, True, True, False, True]))
  46. ])
  47. def test_duplicated_nan_none(keep, expected):
  48. df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object)
  49. result = df.duplicated(keep=keep)
  50. tm.assert_series_equal(result, expected)
  51. @pytest.mark.parametrize('keep', ['first', 'last', False])
  52. @pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
  53. def test_duplicated_subset(subset, keep):
  54. df = DataFrame({'A': [0, 1, 1, 2, 0],
  55. 'B': ['a', 'b', 'b', 'c', 'a'],
  56. 'C': [np.nan, 3, 3, None, np.nan]})
  57. if subset is None:
  58. subset = list(df.columns)
  59. elif isinstance(subset, string_types):
  60. # need to have a DataFrame, not a Series
  61. # -> select columns with singleton list, not string
  62. subset = [subset]
  63. expected = df[subset].duplicated(keep=keep)
  64. result = df.duplicated(keep=keep, subset=subset)
  65. tm.assert_series_equal(result, expected)
  66. def test_drop_duplicates():
  67. df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
  68. 'foo', 'bar', 'bar', 'foo'],
  69. 'B': ['one', 'one', 'two', 'two',
  70. 'two', 'two', 'one', 'two'],
  71. 'C': [1, 1, 2, 2, 2, 2, 1, 2],
  72. 'D': lrange(8)})
  73. # single column
  74. result = df.drop_duplicates('AAA')
  75. expected = df[:2]
  76. tm.assert_frame_equal(result, expected)
  77. result = df.drop_duplicates('AAA', keep='last')
  78. expected = df.loc[[6, 7]]
  79. tm.assert_frame_equal(result, expected)
  80. result = df.drop_duplicates('AAA', keep=False)
  81. expected = df.loc[[]]
  82. tm.assert_frame_equal(result, expected)
  83. assert len(result) == 0
  84. # multi column
  85. expected = df.loc[[0, 1, 2, 3]]
  86. result = df.drop_duplicates(np.array(['AAA', 'B']))
  87. tm.assert_frame_equal(result, expected)
  88. result = df.drop_duplicates(['AAA', 'B'])
  89. tm.assert_frame_equal(result, expected)
  90. result = df.drop_duplicates(('AAA', 'B'), keep='last')
  91. expected = df.loc[[0, 5, 6, 7]]
  92. tm.assert_frame_equal(result, expected)
  93. result = df.drop_duplicates(('AAA', 'B'), keep=False)
  94. expected = df.loc[[0]]
  95. tm.assert_frame_equal(result, expected)
  96. # consider everything
  97. df2 = df.loc[:, ['AAA', 'B', 'C']]
  98. result = df2.drop_duplicates()
  99. # in this case only
  100. expected = df2.drop_duplicates(['AAA', 'B'])
  101. tm.assert_frame_equal(result, expected)
  102. result = df2.drop_duplicates(keep='last')
  103. expected = df2.drop_duplicates(['AAA', 'B'], keep='last')
  104. tm.assert_frame_equal(result, expected)
  105. result = df2.drop_duplicates(keep=False)
  106. expected = df2.drop_duplicates(['AAA', 'B'], keep=False)
  107. tm.assert_frame_equal(result, expected)
  108. # integers
  109. result = df.drop_duplicates('C')
  110. expected = df.iloc[[0, 2]]
  111. tm.assert_frame_equal(result, expected)
  112. result = df.drop_duplicates('C', keep='last')
  113. expected = df.iloc[[-2, -1]]
  114. tm.assert_frame_equal(result, expected)
  115. df['E'] = df['C'].astype('int8')
  116. result = df.drop_duplicates('E')
  117. expected = df.iloc[[0, 2]]
  118. tm.assert_frame_equal(result, expected)
  119. result = df.drop_duplicates('E', keep='last')
  120. expected = df.iloc[[-2, -1]]
  121. tm.assert_frame_equal(result, expected)
  122. # GH 11376
  123. df = DataFrame({'x': [7, 6, 3, 3, 4, 8, 0],
  124. 'y': [0, 6, 5, 5, 9, 1, 2]})
  125. expected = df.loc[df.index != 3]
  126. tm.assert_frame_equal(df.drop_duplicates(), expected)
  127. df = DataFrame([[1, 0], [0, 2]])
  128. tm.assert_frame_equal(df.drop_duplicates(), df)
  129. df = DataFrame([[-2, 0], [0, -4]])
  130. tm.assert_frame_equal(df.drop_duplicates(), df)
  131. x = np.iinfo(np.int64).max / 3 * 2
  132. df = DataFrame([[-x, x], [0, x + 4]])
  133. tm.assert_frame_equal(df.drop_duplicates(), df)
  134. df = DataFrame([[-x, x], [x, x + 4]])
  135. tm.assert_frame_equal(df.drop_duplicates(), df)
  136. # GH 11864
  137. df = DataFrame([i] * 9 for i in range(16))
  138. df = df.append([[1] + [0] * 8], ignore_index=True)
  139. for keep in ['first', 'last', False]:
  140. assert df.duplicated(keep=keep).sum() == 0
  141. def test_duplicated_on_empty_frame():
  142. # GH 25184
  143. df = DataFrame(columns=['a', 'b'])
  144. dupes = df.duplicated('a')
  145. result = df[dupes]
  146. expected = df.copy()
  147. tm.assert_frame_equal(result, expected)
  148. def test_drop_duplicates_with_duplicate_column_names():
  149. # GH17836
  150. df = DataFrame([
  151. [1, 2, 5],
  152. [3, 4, 6],
  153. [3, 4, 7]
  154. ], columns=['a', 'a', 'b'])
  155. result0 = df.drop_duplicates()
  156. tm.assert_frame_equal(result0, df)
  157. result1 = df.drop_duplicates('a')
  158. expected1 = df[:2]
  159. tm.assert_frame_equal(result1, expected1)
  160. def test_drop_duplicates_for_take_all():
  161. df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
  162. 'foo', 'bar', 'qux', 'foo'],
  163. 'B': ['one', 'one', 'two', 'two',
  164. 'two', 'two', 'one', 'two'],
  165. 'C': [1, 1, 2, 2, 2, 2, 1, 2],
  166. 'D': lrange(8)})
  167. # single column
  168. result = df.drop_duplicates('AAA')
  169. expected = df.iloc[[0, 1, 2, 6]]
  170. tm.assert_frame_equal(result, expected)
  171. result = df.drop_duplicates('AAA', keep='last')
  172. expected = df.iloc[[2, 5, 6, 7]]
  173. tm.assert_frame_equal(result, expected)
  174. result = df.drop_duplicates('AAA', keep=False)
  175. expected = df.iloc[[2, 6]]
  176. tm.assert_frame_equal(result, expected)
  177. # multiple columns
  178. result = df.drop_duplicates(['AAA', 'B'])
  179. expected = df.iloc[[0, 1, 2, 3, 4, 6]]
  180. tm.assert_frame_equal(result, expected)
  181. result = df.drop_duplicates(['AAA', 'B'], keep='last')
  182. expected = df.iloc[[0, 1, 2, 5, 6, 7]]
  183. tm.assert_frame_equal(result, expected)
  184. result = df.drop_duplicates(['AAA', 'B'], keep=False)
  185. expected = df.iloc[[0, 1, 2, 6]]
  186. tm.assert_frame_equal(result, expected)
  187. def test_drop_duplicates_tuple():
  188. df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar',
  189. 'foo', 'bar', 'bar', 'foo'],
  190. 'B': ['one', 'one', 'two', 'two',
  191. 'two', 'two', 'one', 'two'],
  192. 'C': [1, 1, 2, 2, 2, 2, 1, 2],
  193. 'D': lrange(8)})
  194. # single column
  195. result = df.drop_duplicates(('AA', 'AB'))
  196. expected = df[:2]
  197. tm.assert_frame_equal(result, expected)
  198. result = df.drop_duplicates(('AA', 'AB'), keep='last')
  199. expected = df.loc[[6, 7]]
  200. tm.assert_frame_equal(result, expected)
  201. result = df.drop_duplicates(('AA', 'AB'), keep=False)
  202. expected = df.loc[[]] # empty df
  203. assert len(result) == 0
  204. tm.assert_frame_equal(result, expected)
  205. # multi column
  206. expected = df.loc[[0, 1, 2, 3]]
  207. result = df.drop_duplicates((('AA', 'AB'), 'B'))
  208. tm.assert_frame_equal(result, expected)
  209. @pytest.mark.parametrize('df', [
  210. DataFrame(),
  211. DataFrame(columns=[]),
  212. DataFrame(columns=['A', 'B', 'C']),
  213. DataFrame(index=[]),
  214. DataFrame(index=['A', 'B', 'C'])
  215. ])
  216. def test_drop_duplicates_empty(df):
  217. # GH 20516
  218. result = df.drop_duplicates()
  219. tm.assert_frame_equal(result, df)
  220. result = df.copy()
  221. result.drop_duplicates(inplace=True)
  222. tm.assert_frame_equal(result, df)
  223. def test_drop_duplicates_NA():
  224. # none
  225. df = DataFrame({'A': [None, None, 'foo', 'bar',
  226. 'foo', 'bar', 'bar', 'foo'],
  227. 'B': ['one', 'one', 'two', 'two',
  228. 'two', 'two', 'one', 'two'],
  229. 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
  230. 'D': lrange(8)})
  231. # single column
  232. result = df.drop_duplicates('A')
  233. expected = df.loc[[0, 2, 3]]
  234. tm.assert_frame_equal(result, expected)
  235. result = df.drop_duplicates('A', keep='last')
  236. expected = df.loc[[1, 6, 7]]
  237. tm.assert_frame_equal(result, expected)
  238. result = df.drop_duplicates('A', keep=False)
  239. expected = df.loc[[]] # empty df
  240. tm.assert_frame_equal(result, expected)
  241. assert len(result) == 0
  242. # multi column
  243. result = df.drop_duplicates(['A', 'B'])
  244. expected = df.loc[[0, 2, 3, 6]]
  245. tm.assert_frame_equal(result, expected)
  246. result = df.drop_duplicates(['A', 'B'], keep='last')
  247. expected = df.loc[[1, 5, 6, 7]]
  248. tm.assert_frame_equal(result, expected)
  249. result = df.drop_duplicates(['A', 'B'], keep=False)
  250. expected = df.loc[[6]]
  251. tm.assert_frame_equal(result, expected)
  252. # nan
  253. df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
  254. 'foo', 'bar', 'bar', 'foo'],
  255. 'B': ['one', 'one', 'two', 'two',
  256. 'two', 'two', 'one', 'two'],
  257. 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
  258. 'D': lrange(8)})
  259. # single column
  260. result = df.drop_duplicates('C')
  261. expected = df[:2]
  262. tm.assert_frame_equal(result, expected)
  263. result = df.drop_duplicates('C', keep='last')
  264. expected = df.loc[[3, 7]]
  265. tm.assert_frame_equal(result, expected)
  266. result = df.drop_duplicates('C', keep=False)
  267. expected = df.loc[[]] # empty df
  268. tm.assert_frame_equal(result, expected)
  269. assert len(result) == 0
  270. # multi column
  271. result = df.drop_duplicates(['C', 'B'])
  272. expected = df.loc[[0, 1, 2, 4]]
  273. tm.assert_frame_equal(result, expected)
  274. result = df.drop_duplicates(['C', 'B'], keep='last')
  275. expected = df.loc[[1, 3, 6, 7]]
  276. tm.assert_frame_equal(result, expected)
  277. result = df.drop_duplicates(['C', 'B'], keep=False)
  278. expected = df.loc[[1]]
  279. tm.assert_frame_equal(result, expected)
  280. def test_drop_duplicates_NA_for_take_all():
  281. # none
  282. df = DataFrame({'A': [None, None, 'foo', 'bar',
  283. 'foo', 'baz', 'bar', 'qux'],
  284. 'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]})
  285. # single column
  286. result = df.drop_duplicates('A')
  287. expected = df.iloc[[0, 2, 3, 5, 7]]
  288. tm.assert_frame_equal(result, expected)
  289. result = df.drop_duplicates('A', keep='last')
  290. expected = df.iloc[[1, 4, 5, 6, 7]]
  291. tm.assert_frame_equal(result, expected)
  292. result = df.drop_duplicates('A', keep=False)
  293. expected = df.iloc[[5, 7]]
  294. tm.assert_frame_equal(result, expected)
  295. # nan
  296. # single column
  297. result = df.drop_duplicates('C')
  298. expected = df.iloc[[0, 1, 5, 6]]
  299. tm.assert_frame_equal(result, expected)
  300. result = df.drop_duplicates('C', keep='last')
  301. expected = df.iloc[[3, 5, 6, 7]]
  302. tm.assert_frame_equal(result, expected)
  303. result = df.drop_duplicates('C', keep=False)
  304. expected = df.iloc[[5, 6]]
  305. tm.assert_frame_equal(result, expected)
  306. def test_drop_duplicates_inplace():
  307. orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
  308. 'foo', 'bar', 'bar', 'foo'],
  309. 'B': ['one', 'one', 'two', 'two',
  310. 'two', 'two', 'one', 'two'],
  311. 'C': [1, 1, 2, 2, 2, 2, 1, 2],
  312. 'D': lrange(8)})
  313. # single column
  314. df = orig.copy()
  315. df.drop_duplicates('A', inplace=True)
  316. expected = orig[:2]
  317. result = df
  318. tm.assert_frame_equal(result, expected)
  319. df = orig.copy()
  320. df.drop_duplicates('A', keep='last', inplace=True)
  321. expected = orig.loc[[6, 7]]
  322. result = df
  323. tm.assert_frame_equal(result, expected)
  324. df = orig.copy()
  325. df.drop_duplicates('A', keep=False, inplace=True)
  326. expected = orig.loc[[]]
  327. result = df
  328. tm.assert_frame_equal(result, expected)
  329. assert len(df) == 0
  330. # multi column
  331. df = orig.copy()
  332. df.drop_duplicates(['A', 'B'], inplace=True)
  333. expected = orig.loc[[0, 1, 2, 3]]
  334. result = df
  335. tm.assert_frame_equal(result, expected)
  336. df = orig.copy()
  337. df.drop_duplicates(['A', 'B'], keep='last', inplace=True)
  338. expected = orig.loc[[0, 5, 6, 7]]
  339. result = df
  340. tm.assert_frame_equal(result, expected)
  341. df = orig.copy()
  342. df.drop_duplicates(['A', 'B'], keep=False, inplace=True)
  343. expected = orig.loc[[0]]
  344. result = df
  345. tm.assert_frame_equal(result, expected)
  346. # consider everything
  347. orig2 = orig.loc[:, ['A', 'B', 'C']].copy()
  348. df2 = orig2.copy()
  349. df2.drop_duplicates(inplace=True)
  350. # in this case only
  351. expected = orig2.drop_duplicates(['A', 'B'])
  352. result = df2
  353. tm.assert_frame_equal(result, expected)
  354. df2 = orig2.copy()
  355. df2.drop_duplicates(keep='last', inplace=True)
  356. expected = orig2.drop_duplicates(['A', 'B'], keep='last')
  357. result = df2
  358. tm.assert_frame_equal(result, expected)
  359. df2 = orig2.copy()
  360. df2.drop_duplicates(keep=False, inplace=True)
  361. expected = orig2.drop_duplicates(['A', 'B'], keep=False)
  362. result = df2
  363. tm.assert_frame_equal(result, expected)