test_comment.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tests that comments are properly handled during parsing
  4. for all of the parsers defined in parsers.py
  5. """
  6. import numpy as np
  7. import pytest
  8. from pandas.compat import StringIO
  9. from pandas import DataFrame
  10. import pandas.util.testing as tm
  11. @pytest.mark.parametrize("na_values", [None, ["NaN"]])
  12. def test_comment(all_parsers, na_values):
  13. parser = all_parsers
  14. data = """A,B,C
  15. 1,2.,4.#hello world
  16. 5.,NaN,10.0
  17. """
  18. expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
  19. columns=["A", "B", "C"])
  20. result = parser.read_csv(StringIO(data), comment="#",
  21. na_values=na_values)
  22. tm.assert_frame_equal(result, expected)
  23. @pytest.mark.parametrize("read_kwargs", [
  24. dict(),
  25. dict(lineterminator="*"),
  26. dict(delim_whitespace=True),
  27. ])
  28. def test_line_comment(all_parsers, read_kwargs):
  29. parser = all_parsers
  30. data = """# empty
  31. A,B,C
  32. 1,2.,4.#hello world
  33. #ignore this line
  34. 5.,NaN,10.0
  35. """
  36. if read_kwargs.get("delim_whitespace"):
  37. data = data.replace(",", " ")
  38. elif read_kwargs.get("lineterminator"):
  39. if parser.engine != "c":
  40. pytest.skip("Custom terminator not supported with Python engine")
  41. data = data.replace("\n", read_kwargs.get("lineterminator"))
  42. read_kwargs["comment"] = "#"
  43. result = parser.read_csv(StringIO(data), **read_kwargs)
  44. expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
  45. columns=["A", "B", "C"])
  46. tm.assert_frame_equal(result, expected)
  47. def test_comment_skiprows(all_parsers):
  48. parser = all_parsers
  49. data = """# empty
  50. random line
  51. # second empty line
  52. 1,2,3
  53. A,B,C
  54. 1,2.,4.
  55. 5.,NaN,10.0
  56. """
  57. # This should ignore the first four lines (including comments).
  58. expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
  59. columns=["A", "B", "C"])
  60. result = parser.read_csv(StringIO(data), comment="#", skiprows=4)
  61. tm.assert_frame_equal(result, expected)
  62. def test_comment_header(all_parsers):
  63. parser = all_parsers
  64. data = """# empty
  65. # second empty line
  66. 1,2,3
  67. A,B,C
  68. 1,2.,4.
  69. 5.,NaN,10.0
  70. """
  71. # Header should begin at the second non-comment line.
  72. expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
  73. columns=["A", "B", "C"])
  74. result = parser.read_csv(StringIO(data), comment="#", header=1)
  75. tm.assert_frame_equal(result, expected)
  76. def test_comment_skiprows_header(all_parsers):
  77. parser = all_parsers
  78. data = """# empty
  79. # second empty line
  80. # third empty line
  81. X,Y,Z
  82. 1,2,3
  83. A,B,C
  84. 1,2.,4.
  85. 5.,NaN,10.0
  86. """
  87. # Skiprows should skip the first 4 lines (including comments),
  88. # while header should start from the second non-commented line,
  89. # starting with line 5.
  90. expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
  91. columns=["A", "B", "C"])
  92. result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
  93. tm.assert_frame_equal(result, expected)
  94. @pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"])
  95. def test_custom_comment_char(all_parsers, comment_char):
  96. parser = all_parsers
  97. data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
  98. result = parser.read_csv(StringIO(data.replace("#", comment_char)),
  99. comment=comment_char)
  100. expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
  101. tm.assert_frame_equal(result, expected)
  102. @pytest.mark.parametrize("header", ["infer", None])
  103. def test_comment_first_line(all_parsers, header):
  104. # see gh-4623
  105. parser = all_parsers
  106. data = "# notes\na,b,c\n# more notes\n1,2,3"
  107. if header is None:
  108. expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]})
  109. else:
  110. expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
  111. result = parser.read_csv(StringIO(data), comment="#", header=header)
  112. tm.assert_frame_equal(result, expected)