# -*- coding: utf-8 -*-
"""
Tests that NA values are properly handled during
parsing for all of the parsers defined in parsers.py
"""
import numpy as np
import pytest

from pandas.compat import StringIO, range
from pandas import DataFrame, Index, MultiIndex
import pandas.util.testing as tm
import pandas.io.common as com
  12. def test_string_nas(all_parsers):
  13. parser = all_parsers
  14. data = """A,B,C
  15. a,b,c
  16. d,,f
  17. ,g,h
  18. """
  19. result = parser.read_csv(StringIO(data))
  20. expected = DataFrame([["a", "b", "c"],
  21. ["d", np.nan, "f"],
  22. [np.nan, "g", "h"]],
  23. columns=["A", "B", "C"])
  24. tm.assert_frame_equal(result, expected)
  25. def test_detect_string_na(all_parsers):
  26. parser = all_parsers
  27. data = """A,B
  28. foo,bar
  29. NA,baz
  30. NaN,nan
  31. """
  32. expected = DataFrame([["foo", "bar"], [np.nan, "baz"],
  33. [np.nan, np.nan]], columns=["A", "B"])
  34. result = parser.read_csv(StringIO(data))
  35. tm.assert_frame_equal(result, expected)
  36. @pytest.mark.parametrize("na_values", [
  37. ["-999.0", "-999"],
  38. [-999, -999.0],
  39. [-999.0, -999],
  40. ["-999.0"], ["-999"],
  41. [-999.0], [-999]
  42. ])
  43. @pytest.mark.parametrize("data", [
  44. """A,B
  45. -999,1.2
  46. 2,-999
  47. 3,4.5
  48. """,
  49. """A,B
  50. -999,1.200
  51. 2,-999.000
  52. 3,4.500
  53. """
  54. ])
  55. def test_non_string_na_values(all_parsers, data, na_values):
  56. # see gh-3611: with an odd float format, we can't match
  57. # the string "999.0" exactly but still need float matching
  58. parser = all_parsers
  59. expected = DataFrame([[np.nan, 1.2], [2.0, np.nan],
  60. [3.0, 4.5]], columns=["A", "B"])
  61. result = parser.read_csv(StringIO(data), na_values=na_values)
  62. tm.assert_frame_equal(result, expected)
  63. def test_default_na_values(all_parsers):
  64. _NA_VALUES = {"-1.#IND", "1.#QNAN", "1.#IND", "-1.#QNAN", "#N/A",
  65. "N/A", "n/a", "NA", "#NA", "NULL", "null", "NaN", "nan",
  66. "-NaN", "-nan", "#N/A N/A", ""}
  67. assert _NA_VALUES == com._NA_VALUES
  68. parser = all_parsers
  69. nv = len(_NA_VALUES)
  70. def f(i, v):
  71. if i == 0:
  72. buf = ""
  73. elif i > 0:
  74. buf = "".join([","] * i)
  75. buf = "{0}{1}".format(buf, v)
  76. if i < nv - 1:
  77. buf = "{0}{1}".format(buf, "".join([","] * (nv - i - 1)))
  78. return buf
  79. data = StringIO("\n".join(f(i, v) for i, v in enumerate(_NA_VALUES)))
  80. expected = DataFrame(np.nan, columns=range(nv), index=range(nv))
  81. result = parser.read_csv(data, header=None)
  82. tm.assert_frame_equal(result, expected)
  83. @pytest.mark.parametrize("na_values", ["baz", ["baz"]])
  84. def test_custom_na_values(all_parsers, na_values):
  85. parser = all_parsers
  86. data = """A,B,C
  87. ignore,this,row
  88. 1,NA,3
  89. -1.#IND,5,baz
  90. 7,8,NaN
  91. """
  92. expected = DataFrame([[1., np.nan, 3], [np.nan, 5, np.nan],
  93. [7, 8, np.nan]], columns=["A", "B", "C"])
  94. result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
  95. tm.assert_frame_equal(result, expected)
  96. def test_bool_na_values(all_parsers):
  97. data = """A,B,C
  98. True,False,True
  99. NA,True,False
  100. False,NA,True"""
  101. parser = all_parsers
  102. result = parser.read_csv(StringIO(data))
  103. expected = DataFrame({"A": np.array([True, np.nan, False], dtype=object),
  104. "B": np.array([False, True, np.nan], dtype=object),
  105. "C": [True, False, True]})
  106. tm.assert_frame_equal(result, expected)
  107. def test_na_value_dict(all_parsers):
  108. data = """A,B,C
  109. foo,bar,NA
  110. bar,foo,foo
  111. foo,bar,NA
  112. bar,foo,foo"""
  113. parser = all_parsers
  114. df = parser.read_csv(StringIO(data),
  115. na_values={"A": ["foo"], "B": ["bar"]})
  116. expected = DataFrame({"A": [np.nan, "bar", np.nan, "bar"],
  117. "B": [np.nan, "foo", np.nan, "foo"],
  118. "C": [np.nan, "foo", np.nan, "foo"]})
  119. tm.assert_frame_equal(df, expected)
  120. @pytest.mark.parametrize("index_col,expected", [
  121. ([0], DataFrame({"b": [np.nan], "c": [1], "d": [5]},
  122. index=Index([0], name="a"))),
  123. ([0, 2], DataFrame({"b": [np.nan], "d": [5]},
  124. index=MultiIndex.from_tuples(
  125. [(0, 1)], names=["a", "c"]))),
  126. (["a", "c"], DataFrame({"b": [np.nan], "d": [5]},
  127. index=MultiIndex.from_tuples(
  128. [(0, 1)], names=["a", "c"]))),
  129. ])
  130. def test_na_value_dict_multi_index(all_parsers, index_col, expected):
  131. data = """\
  132. a,b,c,d
  133. 0,NA,1,5
  134. """
  135. parser = all_parsers
  136. result = parser.read_csv(StringIO(data), na_values=set(),
  137. index_col=index_col)
  138. tm.assert_frame_equal(result, expected)
  139. @pytest.mark.parametrize("kwargs,expected", [
  140. (dict(), DataFrame({"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
  141. "B": [1, 2, 3, 4, 5, 6, 7],
  142. "C": ["one", "two", "three", np.nan, "five",
  143. np.nan, "seven"]})),
  144. (dict(na_values={"A": [], "C": []}, keep_default_na=False),
  145. DataFrame({"A": ["a", "b", "", "d", "e", "nan", "g"],
  146. "B": [1, 2, 3, 4, 5, 6, 7],
  147. "C": ["one", "two", "three", "nan", "five", "", "seven"]})),
  148. (dict(na_values=["a"], keep_default_na=False),
  149. DataFrame({"A": [np.nan, "b", "", "d", "e", "nan", "g"],
  150. "B": [1, 2, 3, 4, 5, 6, 7],
  151. "C": ["one", "two", "three", "nan", "five", "", "seven"]})),
  152. (dict(na_values={"A": [], "C": []}),
  153. DataFrame({"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
  154. "B": [1, 2, 3, 4, 5, 6, 7],
  155. "C": ["one", "two", "three", np.nan,
  156. "five", np.nan, "seven"]})),
  157. ])
  158. def test_na_values_keep_default(all_parsers, kwargs, expected):
  159. data = """\
  160. A,B,C
  161. a,1,one
  162. b,2,two
  163. ,3,three
  164. d,4,nan
  165. e,5,five
  166. nan,6,
  167. g,7,seven
  168. """
  169. parser = all_parsers
  170. result = parser.read_csv(StringIO(data), **kwargs)
  171. tm.assert_frame_equal(result, expected)
  172. def test_no_na_values_no_keep_default(all_parsers):
  173. # see gh-4318: passing na_values=None and
  174. # keep_default_na=False yields 'None" as a na_value
  175. data = """\
  176. A,B,C
  177. a,1,None
  178. b,2,two
  179. ,3,None
  180. d,4,nan
  181. e,5,five
  182. nan,6,
  183. g,7,seven
  184. """
  185. parser = all_parsers
  186. result = parser.read_csv(StringIO(data), keep_default_na=False)
  187. expected = DataFrame({"A": ["a", "b", "", "d", "e", "nan", "g"],
  188. "B": [1, 2, 3, 4, 5, 6, 7],
  189. "C": ["None", "two", "None", "nan",
  190. "five", "", "seven"]})
  191. tm.assert_frame_equal(result, expected)
  192. def test_no_keep_default_na_dict_na_values(all_parsers):
  193. # see gh-19227
  194. data = "a,b\n,2"
  195. parser = all_parsers
  196. result = parser.read_csv(StringIO(data), na_values={"b": ["2"]},
  197. keep_default_na=False)
  198. expected = DataFrame({"a": [""], "b": [np.nan]})
  199. tm.assert_frame_equal(result, expected)
  200. def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
  201. # see gh-19227
  202. #
  203. # Scalar values shouldn't cause the parsing to crash or fail.
  204. data = "a,b\n1,2"
  205. parser = all_parsers
  206. df = parser.read_csv(StringIO(data), na_values={"b": 2},
  207. keep_default_na=False)
  208. expected = DataFrame({"a": [1], "b": [np.nan]})
  209. tm.assert_frame_equal(df, expected)
  210. @pytest.mark.parametrize("col_zero_na_values", [
  211. 113125, "113125"
  212. ])
  213. def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers,
  214. col_zero_na_values):
  215. # see gh-19227
  216. data = """\
  217. 113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
  218. 729639,"qwer","",asdfkj,466.681,,252.373
  219. """
  220. parser = all_parsers
  221. expected = DataFrame({0: [np.nan, 729639.0],
  222. 1: [np.nan, "qwer"],
  223. 2: ["/blaha", np.nan],
  224. 3: ["kjsdkj", "asdfkj"],
  225. 4: [412.166, 466.681],
  226. 5: ["225.874", ""],
  227. 6: [np.nan, 252.373]})
  228. result = parser.read_csv(StringIO(data), header=None,
  229. keep_default_na=False,
  230. na_values={2: "", 6: "214.008",
  231. 1: "blah", 0: col_zero_na_values})
  232. tm.assert_frame_equal(result, expected)
  233. @pytest.mark.parametrize("na_filter,row_data", [
  234. (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
  235. (False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
  236. ])
  237. def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
  238. data = """\
  239. A,B
  240. 1,A
  241. nan,B
  242. 3,C
  243. """
  244. parser = all_parsers
  245. result = parser.read_csv(StringIO(data), na_values=["B"],
  246. na_filter=na_filter)
  247. expected = DataFrame(row_data, columns=["A", "B"])
  248. tm.assert_frame_equal(result, expected)
  249. def test_na_trailing_columns(all_parsers):
  250. parser = all_parsers
  251. data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax
  252. 2012-03-14,USD,AAPL,BUY,1000
  253. 2012-05-12,USD,SBUX,SELL,500"""
  254. # Trailing columns should be all NaN.
  255. result = parser.read_csv(StringIO(data))
  256. expected = DataFrame([
  257. ["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan],
  258. ["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan],
  259. ], columns=["Date", "Currency", "Symbol", "Type",
  260. "Units", "UnitPrice", "Cost", "Tax"])
  261. tm.assert_frame_equal(result, expected)
  262. @pytest.mark.parametrize("na_values,row_data", [
  263. (1, [[np.nan, 2.0], [2.0, np.nan]]),
  264. ({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]),
  265. ])
  266. def test_na_values_scalar(all_parsers, na_values, row_data):
  267. # see gh-12224
  268. parser = all_parsers
  269. names = ["a", "b"]
  270. data = "1,2\n2,1"
  271. result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
  272. expected = DataFrame(row_data, columns=names)
  273. tm.assert_frame_equal(result, expected)
  274. def test_na_values_dict_aliasing(all_parsers):
  275. parser = all_parsers
  276. na_values = {"a": 2, "b": 1}
  277. na_values_copy = na_values.copy()
  278. names = ["a", "b"]
  279. data = "1,2\n2,1"
  280. expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
  281. result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
  282. tm.assert_frame_equal(result, expected)
  283. tm.assert_dict_equal(na_values, na_values_copy)
  284. def test_na_values_dict_col_index(all_parsers):
  285. # see gh-14203
  286. data = "a\nfoo\n1"
  287. parser = all_parsers
  288. na_values = {0: "foo"}
  289. result = parser.read_csv(StringIO(data), na_values=na_values)
  290. expected = DataFrame({"a": [np.nan, 1]})
  291. tm.assert_frame_equal(result, expected)
  292. @pytest.mark.parametrize("data,kwargs,expected", [
  293. (str(2**63) + "\n" + str(2**63 + 1),
  294. dict(na_values=[2**63]), DataFrame([str(2**63), str(2**63 + 1)])),
  295. (str(2**63) + ",1" + "\n,2",
  296. dict(), DataFrame([[str(2**63), 1], ['', 2]])),
  297. (str(2**63) + "\n1",
  298. dict(na_values=[2**63]), DataFrame([np.nan, 1])),
  299. ])
  300. def test_na_values_uint64(all_parsers, data, kwargs, expected):
  301. # see gh-14983
  302. parser = all_parsers
  303. result = parser.read_csv(StringIO(data), header=None, **kwargs)
  304. tm.assert_frame_equal(result, expected)
  305. def test_empty_na_values_no_default_with_index(all_parsers):
  306. # see gh-15835
  307. data = "a,1\nb,2"
  308. parser = all_parsers
  309. expected = DataFrame({"1": [2]}, index=Index(["b"], name="a"))
  310. result = parser.read_csv(StringIO(data), index_col=0,
  311. keep_default_na=False)
  312. tm.assert_frame_equal(result, expected)
  313. @pytest.mark.parametrize("na_filter,index_data", [
  314. (False, ["", "5"]),
  315. (True, [np.nan, 5.0]),
  316. ])
  317. def test_no_na_filter_on_index(all_parsers, na_filter, index_data):
  318. # see gh-5239
  319. #
  320. # Don't parse NA-values in index unless na_filter=True
  321. parser = all_parsers
  322. data = "a,b,c\n1,,3\n4,5,6"
  323. expected = DataFrame({"a": [1, 4], "c": [3, 6]},
  324. index=Index(index_data, name="b"))
  325. result = parser.read_csv(StringIO(data), index_col=[1],
  326. na_filter=na_filter)
  327. tm.assert_frame_equal(result, expected)
  328. def test_inf_na_values_with_int_index(all_parsers):
  329. # see gh-17128
  330. parser = all_parsers
  331. data = "idx,col1,col2\n1,3,4\n2,inf,-inf"
  332. # Don't fail with OverflowError with inf's and integer index column.
  333. out = parser.read_csv(StringIO(data), index_col=[0],
  334. na_values=["inf", "-inf"])
  335. expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]},
  336. index=Index([1, 2], name="idx"))
  337. tm.assert_frame_equal(out, expected)
  338. @pytest.mark.parametrize("na_filter", [True, False])
  339. def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
  340. # see gh-20377
  341. parser = all_parsers
  342. data = "a,b,c\n1,,3\n4,5,6"
  343. # na_filter=True --> missing value becomes NaN.
  344. # na_filter=False --> missing value remains empty string.
  345. empty = np.nan if na_filter else ""
  346. expected = DataFrame({"a": ["1", "4"],
  347. "b": [empty, "5"],
  348. "c": ["3", "6"]})
  349. result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
  350. tm.assert_frame_equal(result, expected)
  351. @pytest.mark.parametrize("data, na_values", [
  352. ("false,1\n,1\ntrue", None),
  353. ("false,1\nnull,1\ntrue", None),
  354. ("false,1\nnan,1\ntrue", None),
  355. ("false,1\nfoo,1\ntrue", 'foo'),
  356. ("false,1\nfoo,1\ntrue", ['foo']),
  357. ("false,1\nfoo,1\ntrue", {'a': 'foo'}),
  358. ])
  359. def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
  360. parser = all_parsers
  361. msg = ("(Bool column has NA values in column [0a])|"
  362. "(cannot safely convert passed user dtype of "
  363. "bool for object dtyped data in column 0)")
  364. with pytest.raises(ValueError, match=msg):
  365. parser.read_csv(StringIO(data), header=None, names=['a', 'b'],
  366. dtype={'a': 'bool'}, na_values=na_values)