test_usecols.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tests the usecols functionality during parsing
  4. for all of the parsers defined in parsers.py
  5. """
  6. import numpy as np
  7. import pytest
  8. from pandas._libs.tslib import Timestamp
  9. from pandas.compat import StringIO
  10. from pandas import DataFrame, Index
  11. import pandas.util.testing as tm
  12. _msg_validate_usecols_arg = ("'usecols' must either be list-like "
  13. "of all strings, all unicode, all "
  14. "integers or a callable.")
  15. _msg_validate_usecols_names = ("Usecols do not match columns, columns "
  16. "expected but not found: {0}")
  17. def test_raise_on_mixed_dtype_usecols(all_parsers):
  18. # See gh-12678
  19. data = """a,b,c
  20. 1000,2000,3000
  21. 4000,5000,6000
  22. """
  23. usecols = [0, "b", 2]
  24. parser = all_parsers
  25. with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
  26. parser.read_csv(StringIO(data), usecols=usecols)
  27. @pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
  28. def test_usecols(all_parsers, usecols):
  29. data = """\
  30. a,b,c
  31. 1,2,3
  32. 4,5,6
  33. 7,8,9
  34. 10,11,12"""
  35. parser = all_parsers
  36. result = parser.read_csv(StringIO(data), usecols=usecols)
  37. expected = DataFrame([[2, 3], [5, 6], [8, 9],
  38. [11, 12]], columns=["b", "c"])
  39. tm.assert_frame_equal(result, expected)
  40. def test_usecols_with_names(all_parsers):
  41. data = """\
  42. a,b,c
  43. 1,2,3
  44. 4,5,6
  45. 7,8,9
  46. 10,11,12"""
  47. parser = all_parsers
  48. names = ["foo", "bar"]
  49. result = parser.read_csv(StringIO(data), names=names,
  50. usecols=[1, 2], header=0)
  51. expected = DataFrame([[2, 3], [5, 6], [8, 9],
  52. [11, 12]], columns=names)
  53. tm.assert_frame_equal(result, expected)
  54. @pytest.mark.parametrize("names,usecols", [
  55. (["b", "c"], [1, 2]),
  56. (["a", "b", "c"], ["b", "c"])
  57. ])
  58. def test_usecols_relative_to_names(all_parsers, names, usecols):
  59. data = """\
  60. 1,2,3
  61. 4,5,6
  62. 7,8,9
  63. 10,11,12"""
  64. parser = all_parsers
  65. result = parser.read_csv(StringIO(data), names=names,
  66. header=None, usecols=usecols)
  67. expected = DataFrame([[2, 3], [5, 6], [8, 9],
  68. [11, 12]], columns=["b", "c"])
  69. tm.assert_frame_equal(result, expected)
  70. def test_usecols_relative_to_names2(all_parsers):
  71. # see gh-5766
  72. data = """\
  73. 1,2,3
  74. 4,5,6
  75. 7,8,9
  76. 10,11,12"""
  77. parser = all_parsers
  78. result = parser.read_csv(StringIO(data), names=["a", "b"],
  79. header=None, usecols=[0, 1])
  80. expected = DataFrame([[1, 2], [4, 5], [7, 8],
  81. [10, 11]], columns=["a", "b"])
  82. tm.assert_frame_equal(result, expected)
  83. def test_usecols_name_length_conflict(all_parsers):
  84. data = """\
  85. 1,2,3
  86. 4,5,6
  87. 7,8,9
  88. 10,11,12"""
  89. parser = all_parsers
  90. msg = ("Number of passed names did not "
  91. "match number of header fields in the file"
  92. if parser.engine == "python" else
  93. "Passed header names mismatches usecols")
  94. with pytest.raises(ValueError, match=msg):
  95. parser.read_csv(StringIO(data), names=["a", "b"],
  96. header=None, usecols=[1])
  97. def test_usecols_single_string(all_parsers):
  98. # see gh-20558
  99. parser = all_parsers
  100. data = """foo, bar, baz
  101. 1000, 2000, 3000
  102. 4000, 5000, 6000"""
  103. with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
  104. parser.read_csv(StringIO(data), usecols="foo")
  105. @pytest.mark.parametrize("data", ["a,b,c,d\n1,2,3,4\n5,6,7,8",
  106. "a,b,c,d\n1,2,3,4,\n5,6,7,8,"])
  107. def test_usecols_index_col_false(all_parsers, data):
  108. # see gh-9082
  109. parser = all_parsers
  110. usecols = ["a", "c", "d"]
  111. expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]})
  112. result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False)
  113. tm.assert_frame_equal(result, expected)
  114. @pytest.mark.parametrize("index_col", ["b", 0])
  115. @pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
  116. def test_usecols_index_col_conflict(all_parsers, usecols, index_col):
  117. # see gh-4201: test that index_col as integer reflects usecols
  118. parser = all_parsers
  119. data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
  120. expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b"))
  121. result = parser.read_csv(StringIO(data), usecols=usecols,
  122. index_col=index_col)
  123. tm.assert_frame_equal(result, expected)
  124. def test_usecols_index_col_conflict2(all_parsers):
  125. # see gh-4201: test that index_col as integer reflects usecols
  126. parser = all_parsers
  127. data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
  128. expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")})
  129. expected = expected.set_index(["b", "c"])
  130. result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"],
  131. index_col=["b", "c"])
  132. tm.assert_frame_equal(result, expected)
  133. def test_usecols_implicit_index_col(all_parsers):
  134. # see gh-2654
  135. parser = all_parsers
  136. data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"
  137. result = parser.read_csv(StringIO(data), usecols=["a", "b"])
  138. expected = DataFrame({"a": ["apple", "orange"],
  139. "b": ["bat", "cow"]}, index=[4, 8])
  140. tm.assert_frame_equal(result, expected)
  141. def test_usecols_regex_sep(all_parsers):
  142. # see gh-2733
  143. parser = all_parsers
  144. data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
  145. result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))
  146. expected = DataFrame({"a": ["apple", "orange"],
  147. "b": ["bat", "cow"]}, index=[4, 8])
  148. tm.assert_frame_equal(result, expected)
  149. def test_usecols_with_whitespace(all_parsers):
  150. parser = all_parsers
  151. data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
  152. result = parser.read_csv(StringIO(data), delim_whitespace=True,
  153. usecols=("a", "b"))
  154. expected = DataFrame({"a": ["apple", "orange"],
  155. "b": ["bat", "cow"]}, index=[4, 8])
  156. tm.assert_frame_equal(result, expected)
  157. @pytest.mark.parametrize("usecols,expected", [
  158. # Column selection by index.
  159. ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]],
  160. columns=["2", "0"])),
  161. # Column selection by name.
  162. (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]],
  163. columns=["0", "1"])),
  164. ])
  165. def test_usecols_with_integer_like_header(all_parsers, usecols, expected):
  166. parser = all_parsers
  167. data = """2,0,1
  168. 1000,2000,3000
  169. 4000,5000,6000"""
  170. result = parser.read_csv(StringIO(data), usecols=usecols)
  171. tm.assert_frame_equal(result, expected)
  172. @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
  173. def test_usecols_with_parse_dates(all_parsers, usecols):
  174. # see gh-9755
  175. data = """a,b,c,d,e
  176. 0,1,20140101,0900,4
  177. 0,1,20140102,1000,4"""
  178. parser = all_parsers
  179. parse_dates = [[1, 2]]
  180. cols = {
  181. "a": [0, 0],
  182. "c_d": [
  183. Timestamp("2014-01-01 09:00:00"),
  184. Timestamp("2014-01-02 10:00:00")
  185. ]
  186. }
  187. expected = DataFrame(cols, columns=["c_d", "a"])
  188. result = parser.read_csv(StringIO(data), usecols=usecols,
  189. parse_dates=parse_dates)
  190. tm.assert_frame_equal(result, expected)
  191. def test_usecols_with_parse_dates2(all_parsers):
  192. # see gh-13604
  193. parser = all_parsers
  194. data = """2008-02-07 09:40,1032.43
  195. 2008-02-07 09:50,1042.54
  196. 2008-02-07 10:00,1051.65"""
  197. names = ["date", "values"]
  198. usecols = names[:]
  199. parse_dates = [0]
  200. index = Index([Timestamp("2008-02-07 09:40"),
  201. Timestamp("2008-02-07 09:50"),
  202. Timestamp("2008-02-07 10:00")],
  203. name="date")
  204. cols = {"values": [1032.43, 1042.54, 1051.65]}
  205. expected = DataFrame(cols, index=index)
  206. result = parser.read_csv(StringIO(data), parse_dates=parse_dates,
  207. index_col=0, usecols=usecols,
  208. header=None, names=names)
  209. tm.assert_frame_equal(result, expected)
  210. def test_usecols_with_parse_dates3(all_parsers):
  211. # see gh-14792
  212. parser = all_parsers
  213. data = """a,b,c,d,e,f,g,h,i,j
  214. 2016/09/21,1,1,2,3,4,5,6,7,8"""
  215. usecols = list("abcdefghij")
  216. parse_dates = [0]
  217. cols = {"a": Timestamp("2016-09-21"),
  218. "b": [1], "c": [1], "d": [2],
  219. "e": [3], "f": [4], "g": [5],
  220. "h": [6], "i": [7], "j": [8]}
  221. expected = DataFrame(cols, columns=usecols)
  222. result = parser.read_csv(StringIO(data), usecols=usecols,
  223. parse_dates=parse_dates)
  224. tm.assert_frame_equal(result, expected)
  225. def test_usecols_with_parse_dates4(all_parsers):
  226. data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
  227. usecols = list("abcdefghij")
  228. parse_dates = [[0, 1]]
  229. parser = all_parsers
  230. cols = {"a_b": "2016/09/21 1",
  231. "c": [1], "d": [2], "e": [3], "f": [4],
  232. "g": [5], "h": [6], "i": [7], "j": [8]}
  233. expected = DataFrame(cols, columns=["a_b"] + list("cdefghij"))
  234. result = parser.read_csv(StringIO(data), usecols=usecols,
  235. parse_dates=parse_dates)
  236. tm.assert_frame_equal(result, expected)
  237. @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
  238. @pytest.mark.parametrize("names", [
  239. list("abcde"), # Names span all columns in original data.
  240. list("acd"), # Names span only the selected columns.
  241. ])
  242. def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names):
  243. # see gh-9755
  244. s = """0,1,20140101,0900,4
  245. 0,1,20140102,1000,4"""
  246. parse_dates = [[1, 2]]
  247. parser = all_parsers
  248. cols = {
  249. "a": [0, 0],
  250. "c_d": [
  251. Timestamp("2014-01-01 09:00:00"),
  252. Timestamp("2014-01-02 10:00:00")
  253. ]
  254. }
  255. expected = DataFrame(cols, columns=["c_d", "a"])
  256. result = parser.read_csv(StringIO(s), names=names,
  257. parse_dates=parse_dates,
  258. usecols=usecols)
  259. tm.assert_frame_equal(result, expected)
  260. def test_usecols_with_unicode_strings(all_parsers):
  261. # see gh-13219
  262. data = """AAA,BBB,CCC,DDD
  263. 0.056674973,8,True,a
  264. 2.613230982,2,False,b
  265. 3.568935038,7,False,a"""
  266. parser = all_parsers
  267. exp_data = {
  268. "AAA": {
  269. 0: 0.056674972999999997,
  270. 1: 2.6132309819999997,
  271. 2: 3.5689350380000002
  272. },
  273. "BBB": {0: 8, 1: 2, 2: 7}
  274. }
  275. expected = DataFrame(exp_data)
  276. result = parser.read_csv(StringIO(data), usecols=[u"AAA", u"BBB"])
  277. tm.assert_frame_equal(result, expected)
  278. def test_usecols_with_single_byte_unicode_strings(all_parsers):
  279. # see gh-13219
  280. data = """A,B,C,D
  281. 0.056674973,8,True,a
  282. 2.613230982,2,False,b
  283. 3.568935038,7,False,a"""
  284. parser = all_parsers
  285. exp_data = {
  286. "A": {
  287. 0: 0.056674972999999997,
  288. 1: 2.6132309819999997,
  289. 2: 3.5689350380000002
  290. },
  291. "B": {0: 8, 1: 2, 2: 7}
  292. }
  293. expected = DataFrame(exp_data)
  294. result = parser.read_csv(StringIO(data), usecols=[u"A", u"B"])
  295. tm.assert_frame_equal(result, expected)
  296. @pytest.mark.parametrize("usecols", [[u"AAA", b"BBB"], [b"AAA", u"BBB"]])
  297. def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
  298. data = """AAA,BBB,CCC,DDD
  299. 0.056674973,8,True,a
  300. 2.613230982,2,False,b
  301. 3.568935038,7,False,a"""
  302. parser = all_parsers
  303. with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
  304. parser.read_csv(StringIO(data), usecols=usecols)
  305. @pytest.mark.parametrize("usecols", [
  306. ["あああ", "いい"],
  307. [u"あああ", u"いい"]
  308. ])
  309. def test_usecols_with_multi_byte_characters(all_parsers, usecols):
  310. data = """あああ,いい,ううう,ええええ
  311. 0.056674973,8,True,a
  312. 2.613230982,2,False,b
  313. 3.568935038,7,False,a"""
  314. parser = all_parsers
  315. exp_data = {
  316. "あああ": {
  317. 0: 0.056674972999999997,
  318. 1: 2.6132309819999997,
  319. 2: 3.5689350380000002
  320. },
  321. "いい": {0: 8, 1: 2, 2: 7}
  322. }
  323. expected = DataFrame(exp_data)
  324. result = parser.read_csv(StringIO(data), usecols=usecols)
  325. tm.assert_frame_equal(result, expected)
  326. def test_empty_usecols(all_parsers):
  327. data = "a,b,c\n1,2,3\n4,5,6"
  328. expected = DataFrame()
  329. parser = all_parsers
  330. result = parser.read_csv(StringIO(data), usecols=set())
  331. tm.assert_frame_equal(result, expected)
  332. def test_np_array_usecols(all_parsers):
  333. # see gh-12546
  334. parser = all_parsers
  335. data = "a,b,c\n1,2,3"
  336. usecols = np.array(["a", "b"])
  337. expected = DataFrame([[1, 2]], columns=usecols)
  338. result = parser.read_csv(StringIO(data), usecols=usecols)
  339. tm.assert_frame_equal(result, expected)
  340. @pytest.mark.parametrize("usecols,expected", [
  341. (lambda x: x.upper() in ["AAA", "BBB", "DDD"],
  342. DataFrame({
  343. "AaA": {
  344. 0: 0.056674972999999997,
  345. 1: 2.6132309819999997,
  346. 2: 3.5689350380000002
  347. },
  348. "bBb": {0: 8, 1: 2, 2: 7},
  349. "ddd": {0: "a", 1: "b", 2: "a"}
  350. })),
  351. (lambda x: False, DataFrame()),
  352. ])
  353. def test_callable_usecols(all_parsers, usecols, expected):
  354. # see gh-14154
  355. data = """AaA,bBb,CCC,ddd
  356. 0.056674973,8,True,a
  357. 2.613230982,2,False,b
  358. 3.568935038,7,False,a"""
  359. parser = all_parsers
  360. result = parser.read_csv(StringIO(data), usecols=usecols)
  361. tm.assert_frame_equal(result, expected)
  362. @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
  363. def test_incomplete_first_row(all_parsers, usecols):
  364. # see gh-6710
  365. data = "1,2\n1,2,3"
  366. parser = all_parsers
  367. names = ["a", "b", "c"]
  368. expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]})
  369. result = parser.read_csv(StringIO(data), names=names, usecols=usecols)
  370. tm.assert_frame_equal(result, expected)
  371. @pytest.mark.parametrize("data,usecols,kwargs,expected", [
  372. # see gh-8985
  373. ("19,29,39\n" * 2 + "10,20,30,40", [0, 1, 2],
  374. dict(header=None), DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]])),
  375. # see gh-9549
  376. (("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n"
  377. "1,2,3,,,1,\n1,2,3\n5,6,7"), ["A", "B", "C"],
  378. dict(), DataFrame({"A": [1, 3, 1, 1, 1, 5],
  379. "B": [2, 4, 2, 2, 2, 6],
  380. "C": [3, 5, 4, 3, 3, 7]})),
  381. ])
  382. def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
  383. # see gh-8985
  384. parser = all_parsers
  385. result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs)
  386. tm.assert_frame_equal(result, expected)
  387. @pytest.mark.parametrize("usecols,kwargs,expected,msg", [
  388. (["a", "b", "c", "d"], dict(),
  389. DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), None),
  390. (["a", "b", "c", "f"], dict(), None,
  391. _msg_validate_usecols_names.format(r"\['f'\]")),
  392. (["a", "b", "f"], dict(), None,
  393. _msg_validate_usecols_names.format(r"\['f'\]")),
  394. (["a", "b", "f", "g"], dict(), None,
  395. _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]")),
  396. # see gh-14671
  397. (None, dict(header=0, names=["A", "B", "C", "D"]),
  398. DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7],
  399. "D": [4, 8]}), None),
  400. (["A", "B", "C", "f"], dict(header=0, names=["A", "B", "C", "D"]),
  401. None, _msg_validate_usecols_names.format(r"\['f'\]")),
  402. (["A", "B", "f"], dict(names=["A", "B", "C", "D"]),
  403. None, _msg_validate_usecols_names.format(r"\['f'\]")),
  404. ])
  405. def test_raises_on_usecols_names_mismatch(all_parsers, usecols,
  406. kwargs, expected, msg):
  407. data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
  408. kwargs.update(usecols=usecols)
  409. parser = all_parsers
  410. if expected is None:
  411. with pytest.raises(ValueError, match=msg):
  412. parser.read_csv(StringIO(data), **kwargs)
  413. else:
  414. result = parser.read_csv(StringIO(data), **kwargs)
  415. tm.assert_frame_equal(result, expected)
  416. @pytest.mark.xfail(
  417. reason="see gh-16469: works on the C engine but not the Python engine",
  418. strict=False)
  419. @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
  420. def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols):
  421. data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
  422. names = ["A", "B", "C", "D"]
  423. parser = all_parsers
  424. result = parser.read_csv(StringIO(data), header=0,
  425. names=names, usecols=usecols)
  426. expected = DataFrame({"A": [1, 5], "C": [3, 7]})
  427. tm.assert_frame_equal(result, expected)