# -*- coding: utf-8 -*-

"""
Tests that quoting specifications are properly handled
during parsing for all of the parsers defined in parsers.py
"""

import csv

import pytest

from pandas.compat import PY2, StringIO, u
from pandas.errors import ParserError

from pandas import DataFrame
import pandas.util.testing as tm


@pytest.mark.parametrize("kwargs,msg", [
    ({"quotechar": "foo"},
     '"quotechar" must be a(n)? 1-character string'),
    ({"quotechar": None, "quoting": csv.QUOTE_MINIMAL},
     "quotechar must be set if quoting enabled"),
    ({"quotechar": 2},
     '"quotechar" must be string, not int'),
])
def test_bad_quote_char(all_parsers, kwargs, msg):
    """Invalid ``quotechar`` arguments raise ``TypeError`` matching ``msg``."""
    buf = StringIO("1,2,3")

    with pytest.raises(TypeError, match=msg):
        all_parsers.read_csv(buf, **kwargs)


@pytest.mark.parametrize("quoting,msg", [
    ("foo", '"quoting" must be an integer'),
    (5, 'bad "quoting" value'),  # quoting must be in the range [0, 3]
])
def test_bad_quoting(all_parsers, quoting, msg):
    """Invalid ``quoting`` arguments raise ``TypeError`` matching ``msg``."""
    buf = StringIO("1,2,3")

    with pytest.raises(TypeError, match=msg):
        all_parsers.read_csv(buf, quoting=quoting)


def test_quote_char_basic(all_parsers):
    """A field wrapped in the ``"`` quote character is read unquoted."""
    csv_text = 'a,b,c\n1,2,"cat"'

    result = all_parsers.read_csv(StringIO(csv_text), quotechar='"')
    expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"])
def test_quote_char_various(all_parsers, quote_char):
    """Each alternative ``quotechar`` is stripped just like ``"`` is."""
    template = 'a,b,c\n1,2,"cat"'
    # Swap the double quotes in the template for the character under test.
    csv_text = template.replace('"', quote_char)

    result = all_parsers.read_csv(StringIO(csv_text), quotechar=quote_char)
    expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
@pytest.mark.parametrize("quote_char", ["", None])
def test_null_quote_char(all_parsers, quoting, quote_char):
    """An empty or ``None`` quotechar is accepted only with ``QUOTE_NONE``."""
    read_kwargs = {"quotechar": quote_char, "quoting": quoting}
    csv_text = "a,b,c\n1,2,3"

    if quoting == csv.QUOTE_NONE:
        # With quoting disabled, a missing quote character is fine.
        result = all_parsers.read_csv(StringIO(csv_text), **read_kwargs)
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
        tm.assert_frame_equal(result, expected)
        return

    # Sanity checking: quoting is enabled, so a quote character is required.
    msg = "quotechar must be set if quoting enabled"

    with pytest.raises(TypeError, match=msg):
        all_parsers.read_csv(StringIO(csv_text), **read_kwargs)


@pytest.mark.parametrize("kwargs,exp_data", [
    # Test default.
    ({}, [[1, 2, "foo"]]),

    # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading.
    ({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]),

    # QUOTE_ALL only applies to CSV writing, so no effect on reading.
    ({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]),

    # QUOTE_NONE tells the reader to do no special handling
    # of quote characters and leave them alone.
    ({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]),

    # QUOTE_NONNUMERIC tells the reader to cast
    # all non-quoted fields to float
    ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC},
     [[1.0, 2.0, "foo"]]),
])
def test_quoting_various(all_parsers, kwargs, exp_data):
    """Each ``quoting`` mode produces the expected parsed row."""
    col_names = ["a", "b", "c"]
    csv_text = '1,2,"foo"'

    result = all_parsers.read_csv(StringIO(csv_text),
                                  names=col_names, **kwargs)
    expected = DataFrame(exp_data, columns=col_names)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("doublequote,exp_data", [
    (True, [[3, '4 " 5']]),
    (False, [[3, '4 " 5"']]),
])
def test_double_quote(all_parsers, doublequote, exp_data):
    """``doublequote`` controls how a doubled quote in a field is read."""
    csv_text = 'a,b\n3,"4 "" 5"'

    result = all_parsers.read_csv(StringIO(csv_text), quotechar='"',
                                  doublequote=doublequote)
    expected = DataFrame(exp_data, columns=["a", "b"])

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("quotechar", [
    u('"'),
    pytest.param(
        u('\u0001'),
        marks=pytest.mark.skipif(
            PY2, reason="Python 2.x does not handle unicode well.")),
])
def test_quotechar_unicode(all_parsers, quotechar):
    """Unicode ``quotechar`` values are accepted by ``read_csv``."""
    # see gh-14477
    csv_text = "a\n1"

    result = all_parsers.read_csv(StringIO(csv_text), quotechar=quotechar)
    expected = DataFrame({"a": [1]})

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("balanced", [True, False])
def test_unbalanced_quoting(all_parsers, balanced):
    """An unterminated quoted field raises; closing the quote parses."""
    # see gh-22789.
    parser = all_parsers
    csv_text = "a,b,c\n1,2,\"3"

    if balanced:
        # Re-balance the quoting and read in without errors.
        result = parser.read_csv(StringIO(csv_text + '"'))
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
        tm.assert_frame_equal(result, expected)
        return

    # The expected error text depends on which engine is in use.
    if parser.engine == "c":
        msg = "EOF inside string starting at row 1"
    else:
        msg = "unexpected end of data"

    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(csv_text))