# test_mangle_dupes.py (3.8 KB)
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tests that duplicate columns are handled appropriately when parsed by the
  4. CSV engine. In general, the expected result is that they are either thoroughly
  5. de-duplicated (if mangling requested) or ignored otherwise.
  6. """
  7. import pytest
  8. from pandas.compat import StringIO
  9. from pandas import DataFrame
  10. import pandas.util.testing as tm
  11. @pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)])
  12. def test_basic(all_parsers, kwargs):
  13. # TODO: add test for condition "mangle_dupe_cols=False"
  14. # once it is actually supported (gh-12935)
  15. parser = all_parsers
  16. data = "a,a,b,b,b\n1,2,3,4,5"
  17. result = parser.read_csv(StringIO(data), sep=",", **kwargs)
  18. expected = DataFrame([[1, 2, 3, 4, 5]],
  19. columns=["a", "a.1", "b", "b.1", "b.2"])
  20. tm.assert_frame_equal(result, expected)
  21. def test_basic_names(all_parsers):
  22. # See gh-7160
  23. parser = all_parsers
  24. data = "a,b,a\n0,1,2\n3,4,5"
  25. expected = DataFrame([[0, 1, 2], [3, 4, 5]],
  26. columns=["a", "b", "a.1"])
  27. result = parser.read_csv(StringIO(data))
  28. tm.assert_frame_equal(result, expected)
  29. def test_basic_names_warn(all_parsers):
  30. # See gh-7160
  31. parser = all_parsers
  32. data = "0,1,2\n3,4,5"
  33. expected = DataFrame([[0, 1, 2], [3, 4, 5]],
  34. columns=["a", "b", "a.1"])
  35. with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
  36. result = parser.read_csv(StringIO(data), names=["a", "b", "a"])
  37. tm.assert_frame_equal(result, expected)
  38. @pytest.mark.parametrize("data,expected", [
  39. ("a,a,a.1\n1,2,3",
  40. DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])),
  41. ("a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
  42. DataFrame([[1, 2, 3, 4, 5, 6]], columns=["a", "a.1", "a.1.1", "a.1.1.1",
  43. "a.1.1.1.1", "a.1.1.1.1.1"])),
  44. ("a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
  45. DataFrame([[1, 2, 3, 4, 5, 6, 7]], columns=["a", "a.1", "a.3", "a.1.1",
  46. "a.2", "a.2.1", "a.3.1"]))
  47. ])
  48. def test_thorough_mangle_columns(all_parsers, data, expected):
  49. # see gh-17060
  50. parser = all_parsers
  51. result = parser.read_csv(StringIO(data))
  52. tm.assert_frame_equal(result, expected)
  53. @pytest.mark.parametrize("data,names,expected", [
  54. ("a,b,b\n1,2,3",
  55. ["a.1", "a.1", "a.1.1"],
  56. DataFrame([["a", "b", "b"], ["1", "2", "3"]],
  57. columns=["a.1", "a.1.1", "a.1.1.1"])),
  58. ("a,b,c,d,e,f\n1,2,3,4,5,6",
  59. ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
  60. DataFrame([["a", "b", "c", "d", "e", "f"],
  61. ["1", "2", "3", "4", "5", "6"]],
  62. columns=["a", "a.1", "a.1.1", "a.1.1.1",
  63. "a.1.1.1.1", "a.1.1.1.1.1"])),
  64. ("a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
  65. ["a", "a", "a.3", "a.1", "a.2", "a", "a"],
  66. DataFrame([["a", "b", "c", "d", "e", "f", "g"],
  67. ["1", "2", "3", "4", "5", "6", "7"]],
  68. columns=["a", "a.1", "a.3", "a.1.1",
  69. "a.2", "a.2.1", "a.3.1"])),
  70. ])
  71. def test_thorough_mangle_names(all_parsers, data, names, expected):
  72. # see gh-17095
  73. parser = all_parsers
  74. with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
  75. result = parser.read_csv(StringIO(data), names=names)
  76. tm.assert_frame_equal(result, expected)
  77. def test_mangled_unnamed_placeholders(all_parsers):
  78. # xref gh-13017
  79. orig_key = "0"
  80. parser = all_parsers
  81. orig_value = [1, 2, 3]
  82. df = DataFrame({orig_key: orig_value})
  83. # This test recursively updates `df`.
  84. for i in range(3):
  85. expected = DataFrame()
  86. for j in range(i + 1):
  87. expected["Unnamed: 0" + ".1" * j] = [0, 1, 2]
  88. expected[orig_key] = orig_value
  89. df = parser.read_csv(StringIO(df.to_csv()))
  90. tm.assert_frame_equal(df, expected)