test_dialect.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tests that dialects are properly handled during parsing
  4. for all of the parsers defined in parsers.py
  5. """
  6. import csv
  7. import pytest
  8. from pandas.compat import StringIO
  9. from pandas.errors import ParserWarning
  10. from pandas import DataFrame
  11. import pandas.util.testing as tm
  12. @pytest.fixture
  13. def custom_dialect():
  14. dialect_name = "weird"
  15. dialect_kwargs = dict(doublequote=False, escapechar="~", delimiter=":",
  16. skipinitialspace=False, quotechar="~", quoting=3)
  17. return dialect_name, dialect_kwargs
  18. def test_dialect(all_parsers):
  19. parser = all_parsers
  20. data = """\
  21. label1,label2,label3
  22. index1,"a,c,e
  23. index2,b,d,f
  24. """
  25. dia = csv.excel()
  26. dia.quoting = csv.QUOTE_NONE
  27. df = parser.read_csv(StringIO(data), dialect=dia)
  28. data = """\
  29. label1,label2,label3
  30. index1,a,c,e
  31. index2,b,d,f
  32. """
  33. exp = parser.read_csv(StringIO(data))
  34. exp.replace("a", "\"a", inplace=True)
  35. tm.assert_frame_equal(df, exp)
  36. def test_dialect_str(all_parsers):
  37. dialect_name = "mydialect"
  38. parser = all_parsers
  39. data = """\
  40. fruit:vegetable
  41. apple:broccoli
  42. pear:tomato
  43. """
  44. exp = DataFrame({
  45. "fruit": ["apple", "pear"],
  46. "vegetable": ["broccoli", "tomato"]
  47. })
  48. with tm.with_csv_dialect(dialect_name, delimiter=":"):
  49. df = parser.read_csv(StringIO(data), dialect=dialect_name)
  50. tm.assert_frame_equal(df, exp)
  51. def test_invalid_dialect(all_parsers):
  52. class InvalidDialect(object):
  53. pass
  54. data = "a\n1"
  55. parser = all_parsers
  56. msg = "Invalid dialect"
  57. with pytest.raises(ValueError, match=msg):
  58. parser.read_csv(StringIO(data), dialect=InvalidDialect)
  59. @pytest.mark.parametrize("arg", [None, "doublequote", "escapechar",
  60. "skipinitialspace", "quotechar", "quoting"])
  61. @pytest.mark.parametrize("value", ["dialect", "default", "other"])
  62. def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect,
  63. arg, value):
  64. # see gh-23761.
  65. dialect_name, dialect_kwargs = custom_dialect
  66. parser = all_parsers
  67. expected = DataFrame({"a": [1], "b": [2]})
  68. data = "a:b\n1:2"
  69. warning_klass = None
  70. kwds = dict()
  71. # arg=None tests when we pass in the dialect without any other arguments.
  72. if arg is not None:
  73. if "value" == "dialect": # No conflict --> no warning.
  74. kwds[arg] = dialect_kwargs[arg]
  75. elif "value" == "default": # Default --> no warning.
  76. from pandas.io.parsers import _parser_defaults
  77. kwds[arg] = _parser_defaults[arg]
  78. else: # Non-default + conflict with dialect --> warning.
  79. warning_klass = ParserWarning
  80. kwds[arg] = "blah"
  81. with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
  82. with tm.assert_produces_warning(warning_klass):
  83. result = parser.read_csv(StringIO(data),
  84. dialect=dialect_name, **kwds)
  85. tm.assert_frame_equal(result, expected)
  86. @pytest.mark.parametrize("kwargs,warning_klass", [
  87. (dict(sep=","), None), # sep is default --> sep_override=True
  88. (dict(sep="."), ParserWarning), # sep isn't default --> sep_override=False
  89. (dict(delimiter=":"), None), # No conflict
  90. (dict(delimiter=None), None), # Default arguments --> sep_override=True
  91. (dict(delimiter=","), ParserWarning), # Conflict
  92. (dict(delimiter="."), ParserWarning), # Conflict
  93. ], ids=["sep-override-true", "sep-override-false",
  94. "delimiter-no-conflict", "delimiter-default-arg",
  95. "delimiter-conflict", "delimiter-conflict2"])
  96. def test_dialect_conflict_delimiter(all_parsers, custom_dialect,
  97. kwargs, warning_klass):
  98. # see gh-23761.
  99. dialect_name, dialect_kwargs = custom_dialect
  100. parser = all_parsers
  101. expected = DataFrame({"a": [1], "b": [2]})
  102. data = "a:b\n1:2"
  103. with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
  104. with tm.assert_produces_warning(warning_klass):
  105. result = parser.read_csv(StringIO(data),
  106. dialect=dialect_name, **kwargs)
  107. tm.assert_frame_equal(result, expected)