123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119 |
- # -*- coding: utf-8 -*-
- """
- Tests that duplicate columns are handled appropriately when parsed by the
- CSV engine. In general, the expected result is that they are either thoroughly
- de-duplicated (if mangling requested) or ignored otherwise.
- """
- import pytest
- from pandas.compat import StringIO
- from pandas import DataFrame
- import pandas.util.testing as tm
- @pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)])
- def test_basic(all_parsers, kwargs):
- # TODO: add test for condition "mangle_dupe_cols=False"
- # once it is actually supported (gh-12935)
- parser = all_parsers
- data = "a,a,b,b,b\n1,2,3,4,5"
- result = parser.read_csv(StringIO(data), sep=",", **kwargs)
- expected = DataFrame([[1, 2, 3, 4, 5]],
- columns=["a", "a.1", "b", "b.1", "b.2"])
- tm.assert_frame_equal(result, expected)
- def test_basic_names(all_parsers):
- # See gh-7160
- parser = all_parsers
- data = "a,b,a\n0,1,2\n3,4,5"
- expected = DataFrame([[0, 1, 2], [3, 4, 5]],
- columns=["a", "b", "a.1"])
- result = parser.read_csv(StringIO(data))
- tm.assert_frame_equal(result, expected)
- def test_basic_names_warn(all_parsers):
- # See gh-7160
- parser = all_parsers
- data = "0,1,2\n3,4,5"
- expected = DataFrame([[0, 1, 2], [3, 4, 5]],
- columns=["a", "b", "a.1"])
- with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
- result = parser.read_csv(StringIO(data), names=["a", "b", "a"])
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("data,expected", [
- ("a,a,a.1\n1,2,3",
- DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])),
- ("a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
- DataFrame([[1, 2, 3, 4, 5, 6]], columns=["a", "a.1", "a.1.1", "a.1.1.1",
- "a.1.1.1.1", "a.1.1.1.1.1"])),
- ("a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
- DataFrame([[1, 2, 3, 4, 5, 6, 7]], columns=["a", "a.1", "a.3", "a.1.1",
- "a.2", "a.2.1", "a.3.1"]))
- ])
- def test_thorough_mangle_columns(all_parsers, data, expected):
- # see gh-17060
- parser = all_parsers
- result = parser.read_csv(StringIO(data))
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("data,names,expected", [
- ("a,b,b\n1,2,3",
- ["a.1", "a.1", "a.1.1"],
- DataFrame([["a", "b", "b"], ["1", "2", "3"]],
- columns=["a.1", "a.1.1", "a.1.1.1"])),
- ("a,b,c,d,e,f\n1,2,3,4,5,6",
- ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
- DataFrame([["a", "b", "c", "d", "e", "f"],
- ["1", "2", "3", "4", "5", "6"]],
- columns=["a", "a.1", "a.1.1", "a.1.1.1",
- "a.1.1.1.1", "a.1.1.1.1.1"])),
- ("a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
- ["a", "a", "a.3", "a.1", "a.2", "a", "a"],
- DataFrame([["a", "b", "c", "d", "e", "f", "g"],
- ["1", "2", "3", "4", "5", "6", "7"]],
- columns=["a", "a.1", "a.3", "a.1.1",
- "a.2", "a.2.1", "a.3.1"])),
- ])
- def test_thorough_mangle_names(all_parsers, data, names, expected):
- # see gh-17095
- parser = all_parsers
- with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
- result = parser.read_csv(StringIO(data), names=names)
- tm.assert_frame_equal(result, expected)
- def test_mangled_unnamed_placeholders(all_parsers):
- # xref gh-13017
- orig_key = "0"
- parser = all_parsers
- orig_value = [1, 2, 3]
- df = DataFrame({orig_key: orig_value})
- # This test recursively updates `df`.
- for i in range(3):
- expected = DataFrame()
- for j in range(i + 1):
- expected["Unnamed: 0" + ".1" * j] = [0, 1, 2]
- expected[orig_key] = orig_value
- df = parser.read_csv(StringIO(df.to_csv()))
- tm.assert_frame_equal(df, expected)
|