123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152 |
- # -*- coding: utf-8 -*-
- """
- Tests that the specified index column (a.k.a "index_col")
- is properly handled or inferred during parsing for all of
- the parsers defined in parsers.py
- """
- import pytest
- from pandas.compat import StringIO
- from pandas import DataFrame, Index, MultiIndex
- import pandas.util.testing as tm
@pytest.mark.parametrize("with_header", [True, False])
def test_index_col_named(all_parsers, with_header):
    """Exercise ``index_col="ID"`` on data with and without a header line.

    With the header line prepended, the frame parsed with
    ``index_col="ID"`` must equal the frame parsed with ``header=0`` and
    then re-indexed via ``set_index("ID")``. Without the header line,
    ``read_csv`` is expected to raise ``ValueError`` matching
    "Index ID invalid".
    """
    column_line = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"  # noqa
    body_lines = """\
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""  # noqa

    if not with_header:
        # Only the data rows are fed to the parser in this branch.
        with pytest.raises(ValueError, match="Index ID invalid"):
            all_parsers.read_csv(StringIO(body_lines), index_col="ID")
        return

    csv_text = column_line + body_lines

    indexed_on_read = all_parsers.read_csv(StringIO(csv_text),
                                           index_col="ID")
    plain = all_parsers.read_csv(StringIO(csv_text), header=0)
    indexed_after_read = plain.set_index("ID")

    tm.assert_frame_equal(indexed_on_read, indexed_after_read)
def test_index_col_named2(all_parsers):
    """Parse header-less rows with explicit ``names`` and a named index.

    The last supplied name ("message") is passed as ``index_col`` in list
    form; the result must equal a frame with columns a-d indexed by the
    message strings.
    """
    csv_text = """\
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""
    column_names = ["a", "b", "c", "d", "message"]
    parsed = all_parsers.read_csv(StringIO(csv_text),
                                  names=column_names,
                                  index_col=["message"])

    message_index = Index(["hello", "world", "foo"], name="message")
    values_by_column = {
        "a": [1, 5, 9],
        "b": [2, 6, 10],
        "c": [3, 7, 11],
        "d": [4, 8, 12],
    }
    expected = DataFrame(values_by_column, index=message_index)

    tm.assert_frame_equal(parsed, expected)
def test_index_col_is_true(all_parsers):
    """Expect ``ValueError`` from ``read_csv`` when ``index_col=True``."""
    # see gh-9798
    expected_message = "The value of index_col couldn't be 'True'"

    with pytest.raises(ValueError, match=expected_message):
        all_parsers.read_csv(StringIO("a,b\n1,2"), index_col=True)
def test_infer_index_col(all_parsers):
    """Parse rows that have one more field than the header, no ``index_col``.

    The expected frame uses the leading field of each row ("foo", "bar",
    "baz") as the index and A, B, C as the columns.
    """
    csv_text = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    parsed = all_parsers.read_csv(StringIO(csv_text))

    row_labels = ["foo", "bar", "baz"]
    column_labels = ["A", "B", "C"]
    cell_values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    expected = DataFrame(cell_values, index=row_labels,
                         columns=column_labels)

    tm.assert_frame_equal(parsed, expected)
@pytest.mark.parametrize("index_col,kwargs", [
    (None, {"columns": ["x", "y", "z"]}),
    (False, {"columns": ["x", "y", "z"]}),
    (0, {"columns": ["y", "z"], "index": Index([], name="x")}),
    (1, {"columns": ["x", "z"], "index": Index([], name="y")}),
    ("x", {"columns": ["y", "z"], "index": Index([], name="x")}),
    ("y", {"columns": ["x", "z"], "index": Index([], name="y")}),
    ([0, 1], {"columns": ["z"],
              "index": MultiIndex.from_arrays([[], []],
                                              names=["x", "y"])}),
    (["x", "y"], {"columns": ["z"],
                  "index": MultiIndex.from_arrays([[], []],
                                                  names=["x", "y"])}),
    ([1, 0], {"columns": ["z"],
              "index": MultiIndex.from_arrays([[], []],
                                              names=["y", "x"])}),
    (["y", "x"], {"columns": ["z"],
                  "index": MultiIndex.from_arrays([[], []],
                                                  names=["y", "x"])}),
])
def test_index_col_empty_data(all_parsers, index_col, kwargs):
    """Parse a header-only CSV ("x,y,z") under each ``index_col`` value.

    ``kwargs`` holds the ``DataFrame`` constructor arguments (columns and,
    where given, an empty named index) describing the expected empty frame.
    """
    header_only = StringIO("x,y,z")
    parsed = all_parsers.read_csv(header_only, index_col=index_col)

    tm.assert_frame_equal(parsed, DataFrame([], **kwargs))
def test_empty_with_index_col_false(all_parsers):
    """Parse a header-only CSV ("x,y") with ``index_col=False``.

    The result must equal an empty frame whose columns are x and y.
    """
    # see gh-10413
    parsed = all_parsers.read_csv(StringIO("x,y"), index_col=False)

    empty_frame = DataFrame([], columns=["x", "y"])
    tm.assert_frame_equal(parsed, empty_frame)
@pytest.mark.parametrize("index_names", [
    ["", ""],
    ["foo", ""],
    ["", "bar"],
    ["foo", "bar"],
    ["NotReallyUnnamed", "Unnamed: 0"],
])
def test_multi_index_naming(all_parsers, index_names):
    """Check the level names of a two-column MultiIndex read from CSV.

    The two header fields for the index columns come from ``index_names``;
    an empty header field is expected to yield a level name of ``None``,
    and any non-empty field is expected to be kept as-is.
    """
    # We don't want empty index names being replaced with "Unnamed: 0"
    header_and_rows = index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"]
    csv_text = ",".join(header_and_rows)

    parsed = all_parsers.read_csv(StringIO(csv_text), index_col=[0, 1])

    expected_index = MultiIndex.from_product([["a", "b"], ["c", "d"]])
    expected_index.names = [name or None for name in index_names]
    expected = DataFrame({"col": [1, 2, 3, 4]}, index=expected_index)

    tm.assert_frame_equal(parsed, expected)
def test_multi_index_naming_not_all_at_beginning(all_parsers):
    """Build a MultiIndex from columns 0 and 2, leaving column 1 as data.

    The header is ",Unnamed: 2,": the only non-empty field names the
    middle column, which is expected to remain the single data column.
    """
    csv_text = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
    parsed = all_parsers.read_csv(StringIO(csv_text), index_col=[0, 2])

    level_values = [["a", "b"], [1, 2, 3, 4]]
    level_codes = [[0, 0, 1, 1], [0, 1, 2, 3]]
    expected_index = MultiIndex(levels=level_values, codes=level_codes)
    expected = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]},
                         index=expected_index)

    tm.assert_frame_equal(parsed, expected)
|