test_index_col.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tests that the specified index column (a.k.a "index_col")
  4. is properly handled or inferred during parsing for all of
  5. the parsers defined in parsers.py
  6. """
  7. import pytest
  8. from pandas.compat import StringIO
  9. from pandas import DataFrame, Index, MultiIndex
  10. import pandas.util.testing as tm
  11. @pytest.mark.parametrize("with_header", [True, False])
  12. def test_index_col_named(all_parsers, with_header):
  13. parser = all_parsers
  14. no_header = """\
  15. KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
  16. KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
  17. KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
  18. KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
  19. KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
  20. KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa
  21. header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" # noqa
  22. if with_header:
  23. data = header + no_header
  24. result = parser.read_csv(StringIO(data), index_col="ID")
  25. expected = parser.read_csv(StringIO(data), header=0).set_index("ID")
  26. tm.assert_frame_equal(result, expected)
  27. else:
  28. data = no_header
  29. msg = "Index ID invalid"
  30. with pytest.raises(ValueError, match=msg):
  31. parser.read_csv(StringIO(data), index_col="ID")
  32. def test_index_col_named2(all_parsers):
  33. parser = all_parsers
  34. data = """\
  35. 1,2,3,4,hello
  36. 5,6,7,8,world
  37. 9,10,11,12,foo
  38. """
  39. expected = DataFrame({"a": [1, 5, 9], "b": [2, 6, 10],
  40. "c": [3, 7, 11], "d": [4, 8, 12]},
  41. index=Index(["hello", "world", "foo"],
  42. name="message"))
  43. names = ["a", "b", "c", "d", "message"]
  44. result = parser.read_csv(StringIO(data), names=names,
  45. index_col=["message"])
  46. tm.assert_frame_equal(result, expected)
  47. def test_index_col_is_true(all_parsers):
  48. # see gh-9798
  49. data = "a,b\n1,2"
  50. parser = all_parsers
  51. with pytest.raises(ValueError, match="The value of index_col "
  52. "couldn't be 'True'"):
  53. parser.read_csv(StringIO(data), index_col=True)
  54. def test_infer_index_col(all_parsers):
  55. data = """A,B,C
  56. foo,1,2,3
  57. bar,4,5,6
  58. baz,7,8,9
  59. """
  60. parser = all_parsers
  61. result = parser.read_csv(StringIO(data))
  62. expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
  63. index=["foo", "bar", "baz"],
  64. columns=["A", "B", "C"])
  65. tm.assert_frame_equal(result, expected)
  66. @pytest.mark.parametrize("index_col,kwargs", [
  67. (None, dict(columns=["x", "y", "z"])),
  68. (False, dict(columns=["x", "y", "z"])),
  69. (0, dict(columns=["y", "z"], index=Index([], name="x"))),
  70. (1, dict(columns=["x", "z"], index=Index([], name="y"))),
  71. ("x", dict(columns=["y", "z"], index=Index([], name="x"))),
  72. ("y", dict(columns=["x", "z"], index=Index([], name="y"))),
  73. ([0, 1], dict(columns=["z"], index=MultiIndex.from_arrays(
  74. [[]] * 2, names=["x", "y"]))),
  75. (["x", "y"], dict(columns=["z"], index=MultiIndex.from_arrays(
  76. [[]] * 2, names=["x", "y"]))),
  77. ([1, 0], dict(columns=["z"], index=MultiIndex.from_arrays(
  78. [[]] * 2, names=["y", "x"]))),
  79. (["y", "x"], dict(columns=["z"], index=MultiIndex.from_arrays(
  80. [[]] * 2, names=["y", "x"]))),
  81. ])
  82. def test_index_col_empty_data(all_parsers, index_col, kwargs):
  83. data = "x,y,z"
  84. parser = all_parsers
  85. result = parser.read_csv(StringIO(data), index_col=index_col)
  86. expected = DataFrame([], **kwargs)
  87. tm.assert_frame_equal(result, expected)
  88. def test_empty_with_index_col_false(all_parsers):
  89. # see gh-10413
  90. data = "x,y"
  91. parser = all_parsers
  92. result = parser.read_csv(StringIO(data), index_col=False)
  93. expected = DataFrame([], columns=["x", "y"])
  94. tm.assert_frame_equal(result, expected)
  95. @pytest.mark.parametrize("index_names", [
  96. ["", ""],
  97. ["foo", ""],
  98. ["", "bar"],
  99. ["foo", "bar"],
  100. ["NotReallyUnnamed", "Unnamed: 0"],
  101. ])
  102. def test_multi_index_naming(all_parsers, index_names):
  103. parser = all_parsers
  104. # We don't want empty index names being replaced with "Unnamed: 0"
  105. data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
  106. result = parser.read_csv(StringIO(data), index_col=[0, 1])
  107. expected = DataFrame({"col": [1, 2, 3, 4]},
  108. index=MultiIndex.from_product([["a", "b"],
  109. ["c", "d"]]))
  110. expected.index.names = [name if name else None for name in index_names]
  111. tm.assert_frame_equal(result, expected)
  112. def test_multi_index_naming_not_all_at_beginning(all_parsers):
  113. parser = all_parsers
  114. data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
  115. result = parser.read_csv(StringIO(data), index_col=[0, 2])
  116. expected = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]},
  117. index=MultiIndex(
  118. levels=[['a', 'b'], [1, 2, 3, 4]],
  119. codes=[[0, 0, 1, 1], [0, 1, 2, 3]]))
  120. tm.assert_frame_equal(result, expected)